First commit

Signed-off-by: Mattias Andrée <maandree@kth.se>
author: Mattias Andrée <maandree@kth.se> 2021-04-17 12:09:12 +0200
committer: Mattias Andrée <maandree@kth.se> 2021-04-17 12:09:12 +0200
commit: a5218fce6b19591c9412a84a5088c260c0676f61 (patch)
tree: 038f8b8e1064b4f61c831907ac4f20b77711f289
download: libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.gz
libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.bz2
libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.xz
10 files changed, 1179 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..9988a52
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+*\#*
+*~
+*.o
+*.a
+*.lo
+*.so
+*.su
+/libparser-generate
+/calc-example/calc
+/calc-example/calc-syntax.c
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..c44b2d9
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,15 @@
+ISC License
+
+© 2021 Mattias Andrée <maandree@kth.se>
+
+Permission to use, copy, modify, and/or distribute this software for any
+purpose with or without fee is hereby granted, provided that the above
+copyright notice and this permission notice appear in all copies.
+
+THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..c8628b0
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,37 @@
+.POSIX:
+
+CONFIGFILE = config.mk
+include $(CONFIGFILE)
+
+
+all: libparser.a libparser-generate calc-example/calc
+libparser.o: libparser.c libparser.h
+calc-example/calc-syntax.o: calc-example/calc-syntax.c libparser.h
+
+.c.o:
+	$(CC) -c -o $@ $< $(CPPFLAGS) $(CFLAGS)
+
+.c.lo:
+	$(CC) -fPIC -c -o $@ $< $(CPPFLAGS) $(CFLAGS)
+
+libparser-generate: libparser-generate.o
+	$(CC) -o $@ libparser-generate.o $(LDFLAGS)
+
+libparser.a: libparser.o
+	$(AR) rc $@ libparser.o
+	$(AR) -s $@
+
+calc-example/calc: calc-example/calc.o calc-example/calc-syntax.o libparser.a
+	$(CC) -o $@ calc-example/calc.o calc-example/calc-syntax.o libparser.a $(LDFLAGS)
+
+calc-example/calc-syntax.c: libparser-generate calc-example/calc.syntax
+	./libparser-generate _expr < calc-example/calc.syntax > $@
+
+clean:
+	-rm -f -- *.o *.lo *.a *.so *.su *-example/*.o *-example/*.su *-example/*-syntax.c libparser-generate
+	-rm -f -- calc-example/calc
+
+.SUFFIXES:
+.SUFFIXES: .c .o .lo
+
+.PHONY: all clean
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..f7199a5
--- /dev/null
+++ b/TODO
@@ -0,0 +1 @@
+Add support for prelexed.
diff --git a/calc-example/calc.c b/calc-example/calc.c
new file mode 100644
index 0000000..adda2ba
--- /dev/null
+++ b/calc-example/calc.c
@@ -0,0 +1,142 @@
+/* See LICENSE file for copyright and license details. */
+#include <libparser.h>
+#include <libsimple.h>
+#include <libsimple-arg.h>
+
+USAGE("");
+
+
+static void
+free_input(struct libparser_unit *node)
+{
+	struct libparser_unit *next;
+	for (; node; next = node->next, free(node), node = next)
+		free_input(node->in);
+}
+
+
+static intmax_t
+calculate(struct libparser_unit *node, const char *line)
+{
+	struct libparser_unit *next;
+	intmax_t value = 0;
+	int op;
+	if (!node->rule) {
+		next = node->in->next;
+		value = calculate(node->in, line);
+		free_input(next);
+	} else if (!strcmp(node->rule, "DIGIT")) {
+		value = (intmax_t)(line[node->start] - '0');
+	} else if (!strcmp(node->rule, "sign")) {
+		value = !strcmp(node->in->rule, "SUB") ? -1 : +1;
+		free(node->in);
+	} else if (!strcmp(node->rule, "unsigned")) {
+		value = 0;
+		next = node->in;
+		free(node);
+		for (node = next; node; node = next) {
+			next = node->next;
+			value *= 10;
+			value += calculate(node, line);
+		}
+	} else if (!strcmp(node->rule, "number")) {
+		next = node->in->next;
+		value = calculate(node->in, line);
+		free(node);
+		for (node = next; node; node = next) {
+			next = node->next;
+			value *= calculate(node, line);
+		}
+	} else if (!strcmp(node->rule, "value")) {
+		next = node->in->next;
+		value = calculate(node->in, line);
+		if (next)
+			value *= calculate(next, line);
+	} else if (!strcmp(node->rule, "hyper1")) {
+		next = node->in->next;
+		value = calculate(node->in, line);
+		free(node);
+		node = next;
+		while (node) {
+			next = node->next;
+			op = !strcmp(node->rule, "SUB") ? -1 : +1;
+			free(node);
+			node = next;
+			next = node->next;
+			if (op < 0)
+				value -= calculate(node, line);
+			else
+				value += calculate(node, line);
+			node = next;
+		}
+	} else if (!strcmp(node->rule, "hyper2")) {
+		next = node->in->next;
+		value = calculate(node->in, line);
+		free(node);
+		node = next;
+		while (node) {
+			next = node->next;
+			op = !strcmp(node->rule, "DIV") ? -1 : +1;
+			free(node);
+			node = next;
+			next = node->next;
+			if (op < 0)
+				value /= calculate(node, line);
+			else
+				value *= calculate(node, line);
+			node = next;
+		}
+	} else if (node->rule[0] != '@') {
+		abort();
+	} else if (node->in) {
+		next = node->in->next;
+		value = calculate(node->in, line);
+		if (next)
+			free_input(next);
+	}
+	free(node);
+	return value;
+}
+
+
+int
+main(int argc, char *argv[])
+{
+	struct libparser_unit *input;
+	char *line = NULL;
+	size_t size = 0;
+	ssize_t len;
+	intmax_t res;
+	int exception;
+
+	ARGBEGIN {
+	default:
+		usage();
+	} ARGEND;
+	if (argc)
+		usage();
+
+	while ((len = getline(&line, &size, stdin)) >= 0) {
+		if (len && line[len - 1] == '\n')
+			line[--len] = '\0';
+		input = libparser_parse_file(libparser_rule_table, line, (size_t)len, &exception);
+		if (!input) {
+			weprintf("didn't find anything to parse\n");
+			free_input(input);
+			continue;
+		} else if (input->end != (size_t)len) {
+			weprintf("line could not be parsed, stopped at column %zu\n", input->end);
+			free_input(input);
+			continue;
+		} else if (exception) {
+			weprintf("premature end of line\n");
+			free_input(input);
+			continue;
+		}
+		res = calculate(input, line);
+		printf("%ji\n", res);
+	}
+
+	free(line);
+	return 0;
+}
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax
new file mode 100644
index 0000000..ef7a9f4
--- /dev/null
+++ b/calc-example/calc.syntax
@@ -0,0 +1,27 @@
+_WHITESPACE = " " | "\t" | " ";
+_ = {_WHITESPACE};
+
+
+DIGIT = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9";
+
+ADD = _, ("+"),             _;
+SUB = _, ("-" | "−"),       _;
+MUL = _, ("*" | "⋅" | "×"), _;
+DIV = _, ("/" | "∕" | "÷"), _;
+
+
+sign = ADD | SUB;
+
+unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"};
+
+_number = unsigned | "(", _expr, (")" | -);
+
+number = _number, {_, _number}; (* optionally with implicit multiplication *)
+
+value = [sign], number;
+
+_expr = hyper1;
+
+
+hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
+hyper2 = _, value, {(MUL | DIV), (value | -)}, _;
diff --git a/config.mk b/config.mk
new file mode 100644
index 0000000..ac056a6
--- /dev/null
+++ b/config.mk
@@ -0,0 +1,8 @@
+PREFIX    = /usr
+MANPREFIX = $(PREFIX)/share/man
+
+CC = cc
+
+CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)"
+CFLAGS   = -std=c99 -Wall -O2
+LDFLAGS  = -s -lsimple
diff --git a/libparser-generate.c b/libparser-generate.c
new file mode 100644
index 0000000..f5f19ae
--- /dev/null
+++ b/libparser-generate.c
@@ -0,0 +1,664 @@
+/* See LICENSE file for copyright and license details. */
+#include <libsimple.h>
+#include <libsimple-arg.h>
+
+USAGE("main-rule");
+
+
+struct token {
+	size_t lineno;
+	size_t column;
+	size_t character;
+	char s[];
+};
+
+struct node {
+	struct token *token;
+	struct node *parent;
+	struct node *next;
+	struct node *data;
+	struct node **head;
+};
+
+
+static char **rule_names = NULL;
+static size_t nrule_names = 0;
+static size_t rule_names_size = 0;
+
+static char **want_rules = NULL;
+static size_t nwant_rules = 0;
+static size_t want_rules_size = 0;
+
+
+static int
+strpcmp(const void *av, const void *bv)
+{
+	const char *const *a = av;
+	const char *const *b = bv;
+	return strcmp(*a, *b);
+}
+
+
+static int
+isidentifier(char c)
+{
+	return isalpha(c) || isdigit(c) || !isascii(c) || c == '_';
+}
+
+
+static int
+check_utf8(char *buf, size_t *ip, size_t len)
+{
+	size_t req, i;
+	uint32_t cp;
+	if ((buf[*ip] & 0xE0) == 0xC0) {
+		cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xC0);
+		req = 2;
+	} else if ((buf[*ip] & 0xF0) == 0xE0) {
+		cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xE0);
+		req = 3;
+	} else if ((buf[*ip] & 0xF8) == 0xF0) {
+		cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xF0);
+		req = 4;
+	} else {
+		return 0;
+	}
+	if (req > len - *ip)
+		return 0;
+	for (i = 1; i < req; i++) {
+		cp <<= 6;
+		if ((buf[*ip + i] & 0xC0) != 0x80)
+			return 0;
+		cp |= (uint32_t)(unsigned char)(buf[*ip + i] ^ 0x80);
+	}
+	*ip += req;
+	if ((cp & UINT32_C(0xD8000)) == UINT32_C(0xD8000))
+		return 0;
+	if (cp < (uint32_t)1 << (7 + 0 * 6))
+		return 0;
+	if (cp < (uint32_t)1 << (5 + 1 * 6))
+		return req == 2;
+	if (cp < (uint32_t)1 << (4 + 2 * 6))
+		return req == 3;
+	if (cp <= UINT32_C(0x10FFFF))
+		return req == 4;
+	return 0;
+}
+
+
+static char *
+readall_and_validate(int fd, const char *fname)
+{
+	size_t lineno = 1, column = 0, character = 0;
+	size_t size = 0, len = 0, i;
+	char *buf = NULL;
+	ssize_t r;
+
+	for (;; len += (size_t)r) {
+		if (len == size)
+			buf = erealloc(buf, size += 1024);
+		r = read(fd, &buf[len], size - len);
+		if (r <= 0) {
+			if (!r)
+				break;
+			eprintf("read %s:", fname);
+		}
+	}
+
+	for (i = 0; i < len; i++) {
+		if (buf[i] == '\n') {
+			lineno += 1;
+			column = 0;
+			character = 0;
+		} else if (buf[i] == '\t') {
+			column += 8 - column % 8;
+			character += 1;
+		} else if (buf[i] == '\r') {
+			eprintf("%s contains a CR character on line %zu at column %zu (character %zu)\n",
+			        fname, lineno, column, character);
+		} else if ((0 < buf[i] && buf[i] < ' ') || buf[i] == 0x7F) {
+			eprintf("%s contains a illegal character on line %zu at column %zu (character %zu)\n",
+			        fname, lineno, column, character);
+		} else if (buf[i] == '\0') {
+			eprintf("%s contains a NUL byte on line %zu at column %zu (character %zu)\n",
+			        fname, lineno, column, character);
+		} else if (!(buf[i] & 0x80)) {
+			character += 1;
+			column += 1;
+		} else if ((buf[i] & 0xC0) == 0x80) {
+			eprintf("%s contains a illegal byte on line %zu at column %zu (character %zu)\n",
+			        fname, lineno, column, character);
+		} else {
+			if (!check_utf8(buf, &i, len)) {
+				eprintf("%s contains a illegal byte sequence on line %zu at column %zu (character %zu)\n",
+				        fname, lineno, column, character);
+			}
+			i--;
+			character += 1;
+			column += 1;
+		}
+	}
+
+	buf = erealloc(buf, len + 1);
+	buf[len] = '\0';
+
+	return buf;
+}
+
+
+static struct token **
+tokenise(const char *data)
+{
+	enum {
+		NEW_TOKEN,
+		IDENTIFIER,
+		STRING,
+		STRING_ESC,
+		SPACE
+	} state = NEW_TOKEN;
+	size_t lineno = 1, column = 0, character = 0;
+	size_t token_lineno = 0, token_column = 0, token_character = 0;
+	struct token **tokens = NULL;
+	char *token = NULL;
+	size_t i, ntokens = 0, tokens_size = 0;
+	size_t token_len = 0, token_size = 0;
+
+	for (i = 0; data[i]; i++) {
+	again:
+		switch (state) {
+		case NEW_TOKEN:
+			token_lineno = lineno;
+			token_column = column;
+			token_character = character;
+			if (token_len == token_size)
+				token = erealloc(token, token_size += 16);
+			token[token_len++] = data[i];
+			if (isidentifier(data[i])) {
+				state = IDENTIFIER;
+			} else if (isspace(data[i])) {
+				state = SPACE;
+			} else if (data[i] == '"') {
+				state = STRING;
+				if (data[i + 1] == '"') {
+					eprintf("empty string token on line %zu at column %zu (character %zu)\n",
+					        lineno, column, character);
+				}
+			} else {
+			add_token:
+				if (token_len == token_size)
+					token = erealloc(token, token_size += 16);
+				token[token_len++] = '\0';
+				if (ntokens == tokens_size)
+					tokens = ereallocn(tokens, tokens_size += 16, sizeof(*tokens), 0);
+				tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len);
+				tokens[ntokens]->lineno = token_lineno;
+				tokens[ntokens]->column = token_column;
+				tokens[ntokens]->character = token_character;
+				stpcpy(tokens[ntokens++]->s, token);
+				token_len = 0;
+				state = NEW_TOKEN;
+			}
+			break;
+
+		case IDENTIFIER:
+			if (isidentifier(data[i]) || data[i] == '-') {
+			add_char:
+				if (token_len == token_size)
+					token = erealloc(token, token_size += 16);
+				token[token_len++] = data[i];
+			} else {
+			add_token_and_do_again:
+				if (token_len == token_size)
+					token = erealloc(token, token_size += 16);
+				token[token_len++] = '\0';
+				if (ntokens == tokens_size)
+					tokens = ereallocn(tokens, tokens_size += 16, sizeof(*tokens), 0);
+				tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len);
+				tokens[ntokens]->lineno = token_lineno;
+				tokens[ntokens]->column = token_column;
+				tokens[ntokens]->character = token_character;
+				stpcpy(tokens[ntokens++]->s, token);
+				token_len = 0;
+				state = NEW_TOKEN;
+				goto again;
+			}
+			break;
+
+		case STRING:
+			if (data[i] == '\n' || data[i] == '\t') {
+				eprintf("illegal whitespace on line %zu at column %zu (character %zu)\n",
+				        lineno, column, character);
+			} else if (data[i] == '"') {
+				goto add_token;
+			} else if (data[i] == '\\') {
+				state = STRING_ESC;
+				goto add_char;
+			} else {
+				goto add_char;
+			}
+			break;
+
+		case STRING_ESC:
+			if (data[i] == '\n' || data[i] == '\t') {
+				eprintf("illegal whitespace on line %zu at column %zu (character %zu)\n",
+				        lineno, column, character);
+			}
+			if (token_len == token_size)
+				token = erealloc(token, token_size += 16);
+			token[token_len++] = data[i];
+			state = STRING;
+			break;
+
+		case SPACE:
+			if (isspace(data[i])) {
+				goto add_char;
+			} else {
+				goto add_token_and_do_again;
+			}
+			break;
+
+		default:
+			abort();
+		};
+
+		if (data[i] == '\n') {
+			lineno += 1;
+			column = 0;
+			character = 0;
+		} else if (data[i] == '\t') {
+			column += 8 - column % 8;
+			character += 1;
+		} else {
+			character += (data[i] & 0xC0) != 0x80;
+			column += 1;
+		}
+	}
+	if (state != NEW_TOKEN && state != SPACE)
+		eprintf("premature end of file\n");
+
+	tokens = ereallocn(tokens, ntokens + 1, sizeof(*tokens), 0);
+	tokens[ntokens] = NULL;
+	free(token);
+
+	return tokens;
+}
+
+
+static void
+emit_and_free_sentence(struct node *node, size_t *indexp)
+{
+	size_t index = (*indexp)++, left, right;
+	struct node *next;
+
+	for (; node->token->s[0] == '('; node = next) {
+		next = node->data;
+		free(node->token);
+		free(node);
+	}
+
+	if (node->token->s[0] == '[' || node->token->s[0] == '{') {
+		emit_and_free_sentence(node->data, indexp);
+		printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
+		           ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu"
+		       "}};\n",
+		       nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1);
+	} else if (node->token->s[0] == '|' || node->token->s[0] == ',') {
+		right = *indexp;
+		emit_and_free_sentence(node->data->next, indexp);
+		left = *indexp;
+		emit_and_free_sentence(node->data, indexp);
+		printf("static union libparser_sentence sentence_%zu_%zu = {.binary = {"
+		           ".type = LIBPARSER_SENTENCE_TYPE_%s, "
+		           ".left = &sentence_%zu_%zu, .right = &sentence_%zu_%zu"
+		       "}};\n",
+		       nrule_names, index, node->token->s[0] == '|' ? "ALTERNATION" : "CONCATENATION",
+		       nrule_names, left, nrule_names, right);
+	} else if (node->token->s[0] == '"') {
+		printf("static union libparser_sentence sentence_%zu_%zu = {.string = {"
+		           ".type = LIBPARSER_SENTENCE_TYPE_STRING, "
+		           ".string = %s\", .length = sizeof(%s\") - 1"
+		       "}};\n",
+		       nrule_names, index, node->token->s, node->token->s);
+	} else if (node->token->s[0] == '-') {
+		printf("static union libparser_sentence sentence_%zu_%zu = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n",
+		       nrule_names, index);
+	} else {
+		if (nwant_rules == want_rules_size)
+			want_rules = ereallocn(want_rules, want_rules_size += 16, sizeof(*want_rules), 0);
+		want_rules[nwant_rules++] = estrdup(node->token->s);
+		printf("static union libparser_sentence sentence_%zu_%zu = {.rule = {"
+		           ".type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\""
+		       "}};\n",
+		       nrule_names, index, node->token->s);
+	}
+
+	free(node->token);
+	free(node);
+}
+
+
+static struct node *
+order_sentences(struct node *node)
+{
+	struct node *tail = NULL, **head = &tail;
+	struct node *stack = NULL;
+	struct node *next, *prev;
+
+	for (; node; node = next) {
+		next = node->next;
+		if (node->token->s[0] == '(' || node->token->s[0] == '[' || node->token->s[0] == '{') {
+			node->data = order_sentences(node->data);
+			*head = node;
+			head = &node->next;
+		} else if (node->token->s[0] == '|' || node->token->s[0] == ',') {
+		again_operators:
+			if (!stack) {
+				node->next = stack;
+				stack = node;
+			} else if (node->token->s[0] == ',' && stack->token->s[0] == '|') {
+				node->next = stack;
+				stack = node;
+			} else if (node->token->s[0] == stack->token->s[0]) {
+				*head = stack;
+				head = &stack->next;
+				stack = stack->next;
+				node->next = stack;
+				stack = node;
+			} else {
+				*head = stack;
+				head = &stack->next;
+				stack = stack->next;
+				goto again_operators;
+			}
+		} else {
+			*head = node;
+			head = &node->next;
+		}
+	}
+
+	for (; stack; stack = next) {
+		next = stack->next;
+		*head = stack;
+		head = &stack->next;
+	}
+
+	*head = NULL;
+
+	for (stack = tail, prev = NULL; stack; prev = stack, stack = next) {
+		next = stack->next;
+		stack->next = prev;
+		if (stack->token->s[0] == '|' || stack->token->s[0] == ',') {
+			prev = stack->next->next->next;
+			stack->data = stack->next->next;
+			stack->data->next = stack->next;
+			stack->next = prev;
+		}
+	}
+
+	return prev;
+}
+
+
+static void
+emit_and_free_rule(struct node *rule)
+{
+	size_t index = 0;
+
+	rule->data = order_sentences(rule->data);
+	emit_and_free_sentence(rule->data, &index);
+
+	printf("static struct libparser_rule rule_%zu = {\"%s\", &sentence_%zu_0};\n", nrule_names, rule->token->s, nrule_names);
+
+	if (nrule_names == rule_names_size)
+		rule_names = ereallocn(rule_names, rule_names_size += 16, sizeof(*rule_names), 0);
+	rule_names[nrule_names++] = estrdup(rule->token->s);
+	free(rule->token);
+	free(rule);
+}
+
+
+int
+main(int argc, char *argv[])
+{
+	enum {
+		IDENTIFIER,
+		STRING,
+		SYMBOL,
+	} type;
+	enum {
+		NEW_RULE,
+		EXPECT_EQUALS,
+		EXPECT_OPERAND,
+		EXPECT_OPERATOR
+	} state = NEW_RULE;
+	struct node *stack = NULL, *parent_node, *node;
+	char *data;
+	struct token **tokens;
+	size_t i, j;
+	int cmp, err;
+
+	ARGBEGIN {
+	default:
+		usage();
+	} ARGEND;
+
+	if (argc != 1 || !isidentifier(argv[0][0]))
+		usage();
+	for (i = 0; argv[0][i]; i++)
+		if (!isidentifier(argv[0][i]) && argv[0][i] != '-')
+			usage();
+
+	data = readall_and_validate(STDIN_FILENO, "<stdin>");
+	tokens = tokenise(data);
+	free(data);
+
+	printf("#include <libparser.h>\n");
+
+	i = 0;
+again:
+	for (; tokens[i]; i++) {
+		if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') {
+			for (i += 2; tokens[i] && tokens[i + 1]; i++) {
+				if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') {
+					i += 2;
+					goto again;
+				}
+			}
+			eprintf("premature end of file\n");
+		}
+
+		if (tokens[i]->s[0] == '"') {
+			type = STRING;
+		} else if (isidentifier(tokens[i]->s[0])) {
+			type = IDENTIFIER;
+		} else if (isspace(tokens[i]->s[0])) {
+			free(tokens[i]);
+			continue;
+		} else {
+			type = SYMBOL;
+		}
+
+		switch (state) {
+		case NEW_RULE:
+			if (type != IDENTIFIER) {
+				eprintf("expected an identifier on line %zu at column %zu (character %zu)\n",
+				        tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
+			}
+			stack = calloc(1, sizeof(*stack));
+			stack->token = tokens[i];
+			stack->head = &stack->data;
+			state = EXPECT_EQUALS;
+			for (j = 0; j < nrule_names; j++) {
+				if (!strcmp(rule_names[j], tokens[i]->s)) {
+					eprintf("duplicate definition of \"%s\" on line %zu at column %zu (character %zu)\n",
+					        tokens[i]->s, tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
+				}
+			}
+			break;
+
+		case EXPECT_EQUALS:
+			if (type != SYMBOL || tokens[i]->s[0] != '=') {
+				eprintf("expected an '=' on line %zu at column %zu (character %zu)\n",
+				        tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
+			}
+			free(tokens[i]);
+			state = EXPECT_OPERAND;
+			break;
+
+		case EXPECT_OPERAND:
+			if (type == SYMBOL) {
+				if (tokens[i]->s[0] == '(' || tokens[i]->s[0] == '[' || tokens[i]->s[0] == '{') {
+					parent_node = stack;
+					stack = ecalloc(1, sizeof(*stack));
+					stack->parent = parent_node;
+					stack->token = tokens[i];
+					stack->head = &stack->data;
+				} else if (tokens[i]->s[0] == '-') {
+					goto add;
+				} else {
+				stray:
+					eprintf("stray '%c' on line %zu at column %zu (character %zu)\n",
+					        tokens[i]->s[0], tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
+				}
+			} else {
+			add:
+				state = EXPECT_OPERATOR;
+				goto add_singleton;
+			}
+			break;
+
+		case EXPECT_OPERATOR:
+			if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') {
+				state = EXPECT_OPERAND;
+			add_singleton:
+				node = calloc(1, sizeof(*node));
+				node->token = tokens[i];
+				*stack->head = node;
+				stack->head = &node->next;
+			} else if (tokens[i]->s[0] == ')') {
+				if (stack->token->s[0] != '(')
+					goto stray;
+				goto pop;
+			} else if (tokens[i]->s[0] == ']') {
+				if (stack->token->s[0] != '[')
+					goto stray;
+				goto pop;
+			} else if (tokens[i]->s[0] == '}') {
+				if (stack->token->s[0] != '{')
+					goto stray;
+			pop:
+				free(tokens[i]);
+				*stack->parent->head = stack;
+				stack->parent->head = &stack->next;
+				stack = stack->parent;
+			} else if (tokens[i]->s[0] == ';') {
+				if (stack->token->s[0] == ')' || stack->token->s[0] == ']' || stack->token->s[0] == '}')
+					eprintf("premature end of rule on line %zu at column %zu (character %zu): "
+					        "'%s' on line %zu at column %zu (character %zu) not closed\n",
+					        tokens[i]->lineno, tokens[i]->column, tokens[i]->character, stack->token->s,
+					        stack->token->lineno, stack->token->column, stack->token->character);
+				emit_and_free_rule(stack);
+				free(tokens[i]);
+				state = NEW_RULE;
+			} else {
+				eprintf("expected a '|', ',', or '%c' on line %zu at column %zu (character %zu)\n",
+				        stack->token->s[0] == '(' ? ')' :
+				        stack->token->s[0] == '[' ? ']' :
+				        stack->token->s[0] == '{' ? '}' : ';',
+				        tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
+			}
+			break;
+
+		default:
+			abort();
+		}
+	}
+	free(tokens);
+	if (state != NEW_RULE)
+		eprintf("premature end of file\n");
+
+	err = 0;
+	qsort(rule_names, nrule_names, sizeof(*rule_names), strpcmp);
+	qsort(want_rules, nwant_rules, sizeof(*want_rules), strpcmp);
+	for (i = j = 0; i < nrule_names && j < nwant_rules;) {
+		cmp = strcmp(rule_names[i], want_rules[j]);
+		if (!cmp) {
+			i++;
+			for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++);
+		} else if (!strcmp(rule_names[i], argv[0])) {
+			i++;
+		} else if (cmp < 0) {
+			weprintf("rule \"%s\" defined but not used\n", rule_names[i]);
+			i++;
+			err = 1;
+		} else {
+			weprintf("rule \"%s\" used but not defined\n", want_rules[j]);
+			for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++);
+			err = 1;
+		}
+	}
+	for (; i < nrule_names; i++) {
+		if (strcmp(rule_names[i], argv[0])) {
+			weprintf("rule \"%s\" defined but not used\n", rule_names[i]);
+			err = 1;
+		}
+	}
+	while (j < nwant_rules) {
+		weprintf("rule \"%s\" used but not defined\n", want_rules[j]);
+		for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++);
+		err = 1;
+	}
+	if (err)
+		exit(1);
+
+	for (i = 0; i < nrule_names; i++)
+		if (!strcmp(rule_names[i], argv[0]))
+			goto found_main;
+	eprintf("specified main rule (\"%s\") was not defined\n", argv[0]);
+
+found_main:
+	printf("static union libparser_sentence noeof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n");
+	printf("static struct libparser_rule noeof_rule = {\"@noeof\", &noeof_sentence};\n");
+	printf("static union libparser_sentence noeof_rule_sentence = {.rule = "
+	           "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@noeof\"}"
+	       "};\n");
+
+	printf("static union libparser_sentence eof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EOF};\n");
+	printf("static struct libparser_rule eof_rule = {\"@eof\", &eof_sentence};\n");
+	printf("static union libparser_sentence eof_rule_sentence = {.rule = "
+	           "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@eof\"}"
+	       "};\n");
+
+	printf("static union libparser_sentence end_sentence = {.binary = {"
+	           ".type = LIBPARSER_SENTENCE_TYPE_ALTERNATION, "
+	           ".left = &eof_rule_sentence, .right = &noeof_rule_sentence"
+	       "}};\n");
+
+	printf("static union libparser_sentence main_rule_sentence = {.rule = "
+	           "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\"}"
+	       "};\n", argv[0]);
+
+	printf("static union libparser_sentence main_sentence = {.binary = {"
+	           ".type = LIBPARSER_SENTENCE_TYPE_CONCATENATION, "
+	           ".left = &main_rule_sentence, .right = &end_sentence"
+	       "}};\n");
+	printf("static struct libparser_rule main_rule = {\"@start\", &main_sentence};\n");
+
+	printf("const struct libparser_rule *const libparser_rule_table[] = {\n");
+	for (i = 0; i < nrule_names; i++) {
+		printf("\t&rule_%zu,\n", i);
+		free(rule_names[i]);
+	}
+	printf("\t&eof_rule,\n");
+	printf("\t&noeof_rule,\n");
+	printf("\t&main_rule,\n");
+	printf("\tNULL\n};\n");
+	free(rule_names);
+	for (i = 0; i < nwant_rules; i++)
+		free(want_rules[i]);
+	free(want_rules);
+
+	if (ferror(stdout) || fflush(stdout) || fclose(stdout))
+		eprintf("printf:");
+	return 0;
+}
diff --git a/libparser.c b/libparser.c
new file mode 100644
index 0000000..9ee22a1
--- /dev/null
+++ b/libparser.c
@@ -0,0 +1,202 @@
+/* See LICENSE file for copyright and license details. */
+#include "libparser.h"
+#include <libsimple.h>
+
+
+struct context {
+	const struct libparser_rule *const *rules;
+	struct libparser_unit *cache;
+	const char *data;
+	size_t length;
+	size_t position;
+	int done;
+	int exception;
+};
+
+
+static void
+free_unit(struct libparser_unit *unit, struct context *ctx)
+{
+	struct libparser_unit *prev;
+	while (unit) {
+		free_unit(unit->in, ctx);
+		prev = unit;
+		unit = unit->next;
+		prev->next = ctx->cache;
+		ctx->cache = prev;
+	}
+}
+
+
+static struct libparser_unit *
+try_match(const char *rule, const union libparser_sentence *sentence, struct context *ctx)
+{
+	struct libparser_unit *unit, *next;
+	struct libparser_unit **head;
+	unsigned char c;
+	size_t i;
+
+	if (!ctx->cache) {
+		unit = ecalloc(1, sizeof(*unit));
+	} else {
+		unit = ctx->cache;
+		ctx->cache = unit->next;
+		unit->in = unit->next = NULL;
+	}
+
+	unit->rule = rule;
+	unit->start = ctx->position;
+
+	switch (sentence->type) {
+	case LIBPARSER_SENTENCE_TYPE_CONCATENATION:
+		unit->in = try_match(NULL, sentence->binary.left, ctx);
+		if (!unit->in)
+			goto mismatch;
+		if (ctx->done)
+			break;
+		unit->in->next = try_match(NULL, sentence->binary.right, ctx);
+		if (!unit->in->next) {
+			free_unit(unit->in, ctx);
+			goto mismatch;
+		}
+		if (!unit->in->next->rule || unit->in->next->rule[0] == '_') {
+			unit->in->next->next = ctx->cache;
+			ctx->cache = unit->in->next;
+			unit->in->next = unit->in->next->in;
+		}
+		if (!unit->in->rule || unit->in->rule[0] == '_') {
+			next = unit->in->next;
+			unit->in->next = ctx->cache;
+			ctx->cache = unit->in;
+			unit->in = unit->in->in;
+			if (unit->in) {
+				for (head = &unit->in->next; *head; head = &(*head)->next);
+				*head = next;
+			} else {
+				unit->in = next;
+			}
+		}
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
+		unit->in = try_match(NULL, sentence->binary.left, ctx);
+		if (!unit->in) {
+			unit->in = try_match(NULL, sentence->binary.right, ctx);
+			if (!unit->in)
+				goto mismatch;
+		}
+	prone:
+		if (unit->in && (!unit->in->rule || unit->in->rule[0] == '_')) {
+			unit->in->next = ctx->cache;
+			ctx->cache = unit->in;
+			unit->in = unit->in->in;
+		}
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
+		unit->in = try_match(NULL, sentence->unary.sentence, ctx);
+		goto prone;
+
+	case LIBPARSER_SENTENCE_TYPE_REPEATED:
+		head = &unit->in;
+		while ((*head = try_match(NULL, sentence->unary.sentence, ctx))) {
+			if (!(*head)->rule || (*head)->rule[0] == '_') {
+				(*head)->next = ctx->cache;
+				ctx->cache = *head;
+				*head = (*head)->in;
+				while (*head)
+					head = &(*head)->next;
+			} else {
+				head = &(*head)->next;
+			}
+			if (ctx->done)
+				break;
+		}
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_STRING:
+		if (sentence->string.length > ctx->length - ctx->position)
+			goto mismatch;
+		if (memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length))
+			goto mismatch;
+		ctx->position += sentence->string.length;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE:
+		if (ctx->position == ctx->length)
+			goto mismatch;
+		c = ((const unsigned char *)ctx->data)[ctx->position];
+		if (sentence->char_range.low > c || c > sentence->char_range.high)
+			goto mismatch;
+		ctx->position += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_RULE:
+		for (i = 0; ctx->rules[i]; i++)
+			if (!strcmp(ctx->rules[i]->name, sentence->rule.rule))
+				break;
+		if (!ctx->rules[i])
+			abort();
+		unit->in = try_match(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx);
+		if (!unit->in)
+			goto mismatch;
+		goto prone;
+
+	case LIBPARSER_SENTENCE_TYPE_EXCEPTION:
+		ctx->done = 1;
+		ctx->exception = 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_EOF:
+		if (ctx->position != ctx->length)
+			goto mismatch;
+		ctx->done = 1;
+		break;
+
+	default:
+		abort();
+	}
+
+	unit->end = ctx->position;
+	return unit;
+
+mismatch:
+	ctx->position = unit->start;
+	unit->next = ctx->cache;
+	ctx->cache = unit;
+	return NULL;
+}
+
+
+struct libparser_unit *
+libparser_parse_file(const struct libparser_rule *const rules[], const char *data, size_t length, int *exceptionp)
+{
+	struct libparser_unit *ret, *t;
+	struct context ctx;
+	size_t i;
+
+	ctx.rules = rules;
+	ctx.cache = NULL;
+	ctx.data = data;
+	ctx.length = length;
+	ctx.position = 0;
+	ctx.done = 0;
+	ctx.exception = 0;
+
+	for (i = 0; rules[i]; i++)
+		if (!strcmp(rules[i]->name, "@start"))
+			break;
+	if (!rules[i])
+		abort();
+
+	ret = try_match(rules[i]->name, rules[i]->sentence, &ctx);
+	*exceptionp = ctx.exception;
+
+	while (ctx.cache) {
+		t = ctx.cache;
+		ctx.cache = t->next;
+		free(t);
+	}
+
+	return ret;
+}
diff --git a/libparser.h b/libparser.h
new file mode 100644
index 0000000..74ab319
--- /dev/null
+++ b/libparser.h
@@ -0,0 +1,73 @@
+/* See LICENSE file for copyright and license details. */
+#include <stddef.h>
+
+union libparser_sentence;
+
+enum libparser_sentence_type {
+	LIBPARSER_SENTENCE_TYPE_CONCATENATION, /* .binary */
+	LIBPARSER_SENTENCE_TYPE_ALTERNATION,   /* .binary */
+	LIBPARSER_SENTENCE_TYPE_OPTIONAL,      /* .unary */
+	LIBPARSER_SENTENCE_TYPE_REPEATED,      /* .unary */
+	LIBPARSER_SENTENCE_TYPE_STRING,        /* .string */
+	LIBPARSER_SENTENCE_TYPE_CHAR_RANGE,    /* .char_range */ /* TODO not supported yet: <low, high> */
+	LIBPARSER_SENTENCE_TYPE_RULE,          /* .rule */
+	LIBPARSER_SENTENCE_TYPE_EXCEPTION,     /* (none) */
+	LIBPARSER_SENTENCE_TYPE_EOF            /* (none) */
+};
+
+struct libparser_sentence_binary {
+	enum libparser_sentence_type type;
+	const union libparser_sentence *left;
+	const union libparser_sentence *right;
+};
+
+struct libparser_sentence_unary {
+	enum libparser_sentence_type type;
+	const union libparser_sentence *sentence;
+};
+
+struct libparser_sentence_string {
+	enum libparser_sentence_type type;
+	const char *string;
+	size_t length;
+};
+
+struct libparser_sentence_char_range {
+	enum libparser_sentence_type type;
+	unsigned char low;
+	unsigned char high;
+};
+
+struct libparser_sentence_rule {
+	enum libparser_sentence_type type;
+	const char *rule;
+};
+
+union libparser_sentence { 
+	enum libparser_sentence_type type;
+	struct libparser_sentence_binary binary;
+	struct libparser_sentence_unary unary;
+	struct libparser_sentence_string string;
+	struct libparser_sentence_char_range char_range;
+	struct libparser_sentence_rule rule;
+};
+
+struct libparser_rule {
+	const char *name;
+	union libparser_sentence *sentence;
+};
+
+struct libparser_unit {
+	const char *rule;
+	struct libparser_unit *in;
+	struct libparser_unit *next;
+	size_t start;
+	size_t end;
+};
+
+
+extern const struct libparser_rule *const libparser_rule_table[];
+
+
+struct libparser_unit *libparser_parse_file(const struct libparser_rule *const rules[],
+                                            const char *data, size_t length, int *exceptionp);
author	Mattias Andrée <maandree@kth.se>	2021-04-17 12:09:12 +0200
committer	Mattias Andrée <maandree@kth.se>	2021-04-17 12:09:12 +0200
commit	a5218fce6b19591c9412a84a5088c260c0676f61 (patch)
tree	038f8b8e1064b4f61c831907ac4f20b77711f289
download	libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.gz libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.bz2 libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.xz