diff options
| author | Mattias Andrée <maandree@kth.se> | 2021-04-17 12:09:12 +0200 | 
|---|---|---|
| committer | Mattias Andrée <maandree@kth.se> | 2021-04-17 12:09:12 +0200 | 
| commit | a5218fce6b19591c9412a84a5088c260c0676f61 (patch) | |
| tree | 038f8b8e1064b4f61c831907ac4f20b77711f289 | |
| download | libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.gz libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.bz2 libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.xz | |
First commit
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to '')
| -rw-r--r-- | .gitignore | 10 | ||||
| -rw-r--r-- | LICENSE | 15 | ||||
| -rw-r--r-- | Makefile | 37 | ||||
| -rw-r--r-- | TODO | 1 | ||||
| -rw-r--r-- | calc-example/calc.c | 142 | ||||
| -rw-r--r-- | calc-example/calc.syntax | 27 | ||||
| -rw-r--r-- | config.mk | 8 | ||||
| -rw-r--r-- | libparser-generate.c | 664 | ||||
| -rw-r--r-- | libparser.c | 202 | ||||
| -rw-r--r-- | libparser.h | 73 | 
10 files changed, 1179 insertions, 0 deletions
| diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9988a52 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +*\#* +*~ +*.o +*.a +*.lo +*.so +*.su +/libparser-generate +/calc-example/calc +/calc-example/calc-syntax.c @@ -0,0 +1,15 @@ +ISC License + +© 2021 Mattias Andrée <maandree@kth.se> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c8628b0 --- /dev/null +++ b/Makefile @@ -0,0 +1,37 @@ +.POSIX: + +CONFIGFILE = config.mk +include $(CONFIGFILE) + + +all: libparser.a libparser-generate calc-example/calc +libparser.o: libparser.c libparser.h +calc-example/calc-syntax.o: calc-example/calc-syntax.c libparser.h + +.c.o: +	$(CC) -c -o $@ $< $(CPPFLAGS) $(CFLAGS) + +.c.lo: +	$(CC) -fPIC -c -o $@ $< $(CPPFLAGS) $(CFLAGS) + +libparser-generate: libparser-generate.o +	$(CC) -o $@ libparser-generate.o $(LDFLAGS) + +libparser.a: libparser.o +	$(AR) rc $@ libparser.o +	$(AR) -s $@ + +calc-example/calc: calc-example/calc.o calc-example/calc-syntax.o libparser.a +	$(CC) -o $@ calc-example/calc.o calc-example/calc-syntax.o libparser.a $(LDFLAGS) + +calc-example/calc-syntax.c: libparser-generate calc-example/calc.syntax +	./libparser-generate _expr < calc-example/calc.syntax > $@ + +clean: +	-rm -f -- *.o *.lo *.a *.so *.su *-example/*.o *-example/*.su *-example/*-syntax.c libparser-generate +	-rm -f -- calc-example/calc + +.SUFFIXES: +.SUFFIXES: .c .o .lo + +.PHONY: all clean @@ -0,0 +1 @@ +Add support for prelexed. diff --git a/calc-example/calc.c b/calc-example/calc.c new file mode 100644 index 0000000..adda2ba --- /dev/null +++ b/calc-example/calc.c @@ -0,0 +1,142 @@ +/* See LICENSE file for copyright and license details. */ +#include <libparser.h> +#include <libsimple.h> +#include <libsimple-arg.h> + +USAGE(""); + + +static void +free_input(struct libparser_unit *node) +{ +	struct libparser_unit *next; +	for (; node; next = node->next, free(node), node = next) +		free_input(node->in); +} + + +static intmax_t +calculate(struct libparser_unit *node, const char *line) +{ +	struct libparser_unit *next; +	intmax_t value = 0; +	int op; +	if (!node->rule) { +		next = node->in->next; +		value = calculate(node->in, line); +		free_input(next); +	} else if (!strcmp(node->rule, "DIGIT")) { +		value = (intmax_t)(line[node->start] - '0'); +	} else if (!strcmp(node->rule, "sign")) { +		value = !strcmp(node->in->rule, "SUB") ? -1 : +1; +		free(node->in); +	} else if (!strcmp(node->rule, "unsigned")) { +		value = 0; +		next = node->in; +		free(node); +		for (node = next; node; node = next) { +			next = node->next; +			value *= 10; +			value += calculate(node, line); +		} +	} else if (!strcmp(node->rule, "number")) { +		next = node->in->next; +		value = calculate(node->in, line); +		free(node); +		for (node = next; node; node = next) { +			next = node->next; +			value *= calculate(node, line); +		} +	} else if (!strcmp(node->rule, "value")) { +		next = node->in->next; +		value = calculate(node->in, line); +		if (next) +			value *= calculate(next, line); +	} else if (!strcmp(node->rule, "hyper1")) { +		next = node->in->next; +		value = calculate(node->in, line); +		free(node); +		node = next; +		while (node) { +			next = node->next; +			op = !strcmp(node->rule, "SUB") ? -1 : +1; +			free(node); +			node = next; +			next = node->next; +			if (op < 0) +				value -= calculate(node, line); +			else +				value += calculate(node, line); +			node = next; +		} +	} else if (!strcmp(node->rule, "hyper2")) { +		next = node->in->next; +		value = calculate(node->in, line); +		free(node); +		node = next; +		while (node) { +			next = node->next; +			op = !strcmp(node->rule, "DIV") ? -1 : +1; +			free(node); +			node = next; +			next = node->next; +			if (op < 0) +				value /= calculate(node, line); +			else +				value *= calculate(node, line); +			node = next; +		} +	} else if (node->rule[0] != '@') { +		abort(); +	} else if (node->in) { +		next = node->in->next; +		value = calculate(node->in, line); +		if (next) +			free_input(next); +	} +	free(node); +	return value; +} + + +int +main(int argc, char *argv[]) +{ +	struct libparser_unit *input; +	char *line = NULL; +	size_t size = 0; +	ssize_t len; +	intmax_t res; +	int exception; + +	ARGBEGIN { +	default: +		usage(); +	} ARGEND; +	if (argc) +		usage(); + +	while ((len = getline(&line, &size, stdin)) >= 0) { +		if (len && line[len - 1] == '\n') +			line[--len] = '\0'; +		input = libparser_parse_file(libparser_rule_table, line, (size_t)len, &exception); +		if (!input) { +			weprintf("didn't find anything to parse\n"); +			free_input(input); +			continue; +		} else if (input->end != (size_t)len) { +			weprintf("line could not be parsed, stopped at column %zu\n", input->end); +			free_input(input); +			continue; +		} else if (exception) { +			weprintf("premature end of line\n"); +			free_input(input); +			continue; +		} +		res = calculate(input, line); +		printf("%ji\n", res); +	} + +	free(line); +	return 0; +} diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax new file mode 100644 index 0000000..ef7a9f4 --- /dev/null +++ b/calc-example/calc.syntax @@ -0,0 +1,27 @@ +_WHITESPACE = " " | "\t" | " "; +_ = {_WHITESPACE}; + + +DIGIT = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"; + +ADD = _, ("+"),             _; +SUB = _, ("-" | "−"),       _; +MUL = _, ("*" | "⋅" | "×"), _; +DIV = _, ("/" | "∕" | "÷"), _; + + +sign = ADD | SUB; + +unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"}; + +_number = unsigned | "(", _expr, (")" | -); + +number = _number, {_, _number}; (* optionally with implicit multiplication *) + +value = [sign], number; + +_expr = hyper1; + + +hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _; +hyper2 = _, value, {(MUL | DIV), (value | -)}, _; diff --git a/config.mk b/config.mk new file mode 100644 index 0000000..ac056a6 --- /dev/null +++ b/config.mk @@ -0,0 +1,8 @@ +PREFIX    = /usr +MANPREFIX = $(PREFIX)/share/man + +CC = cc + +CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)" +CFLAGS   = -std=c99 -Wall -O2 +LDFLAGS  = -s -lsimple diff --git a/libparser-generate.c b/libparser-generate.c new file mode 100644 index 0000000..f5f19ae --- /dev/null +++ b/libparser-generate.c @@ -0,0 +1,664 @@ +/* See LICENSE file for copyright and license details. */ +#include <libsimple.h> +#include <libsimple-arg.h> + +USAGE("main-rule"); + + +struct token { +	size_t lineno; +	size_t column; +	size_t character; +	char s[]; +}; + +struct node { +	struct token *token; +	struct node *parent; +	struct node *next; +	struct node *data; +	struct node **head; +}; + + +static char **rule_names = NULL; +static size_t nrule_names = 0; +static size_t rule_names_size = 0; + +static char **want_rules = NULL; +static size_t nwant_rules = 0; +static size_t want_rules_size = 0; + + +static int +strpcmp(const void *av, const void *bv) +{ +	const char *const *a = av; +	const char *const *b = bv; +	return strcmp(*a, *b); +} + + +static int +isidentifier(char c) +{ +	return isalpha(c) || isdigit(c) || !isascii(c) || c == '_'; +} + + +static int +check_utf8(char *buf, size_t *ip, size_t len) +{ +	size_t req, i; +	uint32_t cp; +	if ((buf[*ip] & 0xE0) == 0xC0) { +		cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xC0); +		req = 2; +	} else if ((buf[*ip] & 0xF0) == 0xE0) { +		cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xE0); +		req = 3; +	} else if ((buf[*ip] & 0xF8) == 0xF0) { +		cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xF0); +		req = 4; +	} else { +		return 0; +	} +	if (req > len - *ip) +		return 0; +	for (i = 1; i < req; i++) { +		cp <<= 6; +		if ((buf[*ip + i] & 0xC0) != 0x80) +			return 0; +		cp |= (uint32_t)(unsigned char)(buf[*ip + i] ^ 0x80); +	} +	*ip += req; +	if ((cp & UINT32_C(0xD8000)) == UINT32_C(0xD8000)) +		return 0; +	if (cp < (uint32_t)1 << (7 + 0 * 6)) +		return 0; +	if (cp < (uint32_t)1 << (5 + 1 * 6)) +		return req == 2; +	if (cp < (uint32_t)1 << (4 + 2 * 6)) +		return req == 3; +	if (cp <= UINT32_C(0x10FFFF)) +		return req == 4; +	return 0; +} + + +static char * +readall_and_validate(int fd, const char *fname) +{ +	size_t lineno = 1, column = 0, character = 0; +	size_t size = 0, len = 0, i; +	char *buf = NULL; +	ssize_t r; + +	for (;; len += (size_t)r) { +		if (len == size) +			buf = erealloc(buf, size += 1024); +		r = read(fd, &buf[len], size - len); +		if (r <= 0) { +			if (!r) +				break; +			eprintf("read %s:", fname); +		} +	} + +	for (i = 0; i < len; i++) { +		if (buf[i] == '\n') { +			lineno += 1; +			column = 0; +			character = 0; +		} else if (buf[i] == '\t') { +			column += 8 - column % 8; +			character += 1; +		} else if (buf[i] == '\r') { +			eprintf("%s contains a CR character on line %zu at column %zu (character %zu)\n", +			        fname, lineno, column, character); +		} else if ((0 < buf[i] && buf[i] < ' ') || buf[i] == 0x7F) { +			eprintf("%s contains a illegal character on line %zu at column %zu (character %zu)\n", +			        fname, lineno, column, character); +		} else if (buf[i] == '\0') { +			eprintf("%s contains a NUL byte on line %zu at column %zu (character %zu)\n", +			        fname, lineno, column, character); +		} else if (!(buf[i] & 0x80)) { +			character += 1; +			column += 1; +		} else if ((buf[i] & 0xC0) == 0x80) { +			eprintf("%s contains a illegal byte on line %zu at column %zu (character %zu)\n", +			        fname, lineno, column, character); +		} else { +			if (!check_utf8(buf, &i, len)) { +				eprintf("%s contains a illegal byte sequence on line %zu at column %zu (character %zu)\n", +				        fname, lineno, column, character); +			} +			i--; +			character += 1; +			column += 1; +		} +	} + +	buf = erealloc(buf, len + 1); +	buf[len] = '\0'; + +	return buf; +} + + +static struct token ** +tokenise(const char *data) +{ +	enum { +		NEW_TOKEN, +		IDENTIFIER, +		STRING, +		STRING_ESC, +		SPACE +	} state = NEW_TOKEN; +	size_t lineno = 1, column = 0, character = 0; +	size_t token_lineno = 0, token_column = 0, token_character = 0; +	struct token **tokens = NULL; +	char *token = NULL; +	size_t i, ntokens = 0, tokens_size = 0; +	size_t token_len = 0, token_size = 0; + +	for (i = 0; data[i]; i++) { +	again: +		switch (state) { +		case NEW_TOKEN: +			token_lineno = lineno; +			token_column = column; +			token_character = character; +			if (token_len == token_size) +				token = erealloc(token, token_size += 16); +			token[token_len++] = data[i]; +			if (isidentifier(data[i])) { +				state = IDENTIFIER; +			} else if (isspace(data[i])) { +				state = SPACE; +			} else if (data[i] == '"') { +				state = STRING; +				if (data[i + 1] == '"') { +					eprintf("empty string token on line %zu at column %zu (character %zu)\n", +					        lineno, column, character); +				} +			} else { +			add_token: +				if (token_len == token_size) +					token = erealloc(token, token_size += 16); +				token[token_len++] = '\0'; +				if (ntokens == tokens_size) +					tokens = ereallocn(tokens, tokens_size += 16, sizeof(*tokens), 0); +				tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len); +				tokens[ntokens]->lineno = token_lineno; +				tokens[ntokens]->column = token_column; +				tokens[ntokens]->character = token_character; +				stpcpy(tokens[ntokens++]->s, token); +				token_len = 0; +				state = NEW_TOKEN; +			} +			break; + +		case IDENTIFIER: +			if (isidentifier(data[i]) || data[i] == '-') { +			add_char: +				if (token_len == token_size) +					token = erealloc(token, token_size += 16); +				token[token_len++] = data[i]; +			} else { +			add_token_and_do_again: +				if (token_len == token_size) +					token = erealloc(token, token_size += 16); +				token[token_len++] = '\0'; +				if (ntokens == tokens_size) +					tokens = ereallocn(tokens, tokens_size += 16, sizeof(*tokens), 0); +				tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len); +				tokens[ntokens]->lineno = token_lineno; +				tokens[ntokens]->column = token_column; +				tokens[ntokens]->character = token_character; +				stpcpy(tokens[ntokens++]->s, token); +				token_len = 0; +				state = NEW_TOKEN; +				goto again; +			} +			break; + +		case STRING: +			if (data[i] == '\n' || data[i] == '\t') { +				eprintf("illegal whitespace on line %zu at column %zu (character %zu)\n", +				        lineno, column, character); +			} else if (data[i] == '"') { +				goto add_token; +			} else if (data[i] == '\\') { +				state = STRING_ESC; +				goto add_char; +			} else { +				goto add_char; +			} +			break; + +		case STRING_ESC: +			if (data[i] == '\n' || data[i] == '\t') { +				eprintf("illegal whitespace on line %zu at column %zu (character %zu)\n", +				        lineno, column, character); +			} +			if (token_len == token_size) +				token = erealloc(token, token_size += 16); +			token[token_len++] = data[i]; +			state = STRING; +			break; + +		case SPACE: +			if (isspace(data[i])) { +				goto add_char; +			} else { +				goto add_token_and_do_again; +			} +			break; + +		default: +			abort(); +		}; + +		if (data[i] == '\n') { +			lineno += 1; +			column = 0; +			character = 0; +		} else if (data[i] == '\t') { +			column += 8 - column % 8; +			character += 1; +		} else { +			character += (data[i] & 0xC0) != 0x80; +			column += 1; +		} +	} +	if (state != NEW_TOKEN && state != SPACE) +		eprintf("premature end of file\n"); + +	tokens = ereallocn(tokens, ntokens + 1, sizeof(*tokens), 0); +	tokens[ntokens] = NULL; +	free(token); + +	return tokens; +} + + +static void +emit_and_free_sentence(struct node *node, size_t *indexp) +{ +	size_t index = (*indexp)++, left, right; +	struct node *next; + +	for (; node->token->s[0] == '('; node = next) { +		next = node->data; +		free(node->token); +		free(node); +	} + +	if (node->token->s[0] == '[' || node->token->s[0] == '{') { +		emit_and_free_sentence(node->data, indexp); +		printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {" +		           ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu" +		       "}};\n", +		       nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1); +	} else if (node->token->s[0] == '|' || node->token->s[0] == ',') { +		right = *indexp; +		emit_and_free_sentence(node->data->next, indexp); +		left = *indexp; +		emit_and_free_sentence(node->data, indexp); +		printf("static union libparser_sentence sentence_%zu_%zu = {.binary = {" +		           ".type = LIBPARSER_SENTENCE_TYPE_%s, " +		           ".left = &sentence_%zu_%zu, .right = &sentence_%zu_%zu" +		       "}};\n", +		       nrule_names, index, node->token->s[0] == '|' ? "ALTERNATION" : "CONCATENATION", +		       nrule_names, left, nrule_names, right); +	} else if (node->token->s[0] == '"') { +		printf("static union libparser_sentence sentence_%zu_%zu = {.string = {" +		           ".type = LIBPARSER_SENTENCE_TYPE_STRING, " +		           ".string = %s\", .length = sizeof(%s\") - 1" +		       "}};\n", +		       nrule_names, index, node->token->s, node->token->s); +	} else if (node->token->s[0] == '-') { +		printf("static union libparser_sentence sentence_%zu_%zu = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n", +		       nrule_names, index); +	} else { +		if (nwant_rules == want_rules_size) +			want_rules = ereallocn(want_rules, want_rules_size += 16, sizeof(*want_rules), 0); +		want_rules[nwant_rules++] = estrdup(node->token->s); +		printf("static union libparser_sentence sentence_%zu_%zu = {.rule = {" +		           ".type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\"" +		       "}};\n", +		       nrule_names, index, node->token->s); +	} + +	free(node->token); +	free(node); +} + + +static struct node * +order_sentences(struct node *node) +{ +	struct node *tail = NULL, **head = &tail; +	struct node *stack = NULL; +	struct node *next, *prev; + +	for (; node; node = next) { +		next = node->next; +		if (node->token->s[0] == '(' || node->token->s[0] == '[' || node->token->s[0] == '{') { +			node->data = order_sentences(node->data); +			*head = node; +			head = &node->next; +		} else if (node->token->s[0] == '|' || node->token->s[0] == ',') { +		again_operators: +			if (!stack) { +				node->next = stack; +				stack = node; +			} else if (node->token->s[0] == ',' && stack->token->s[0] == '|') { +				node->next = stack; +				stack = node; +			} else if (node->token->s[0] == stack->token->s[0]) { +				*head = stack; +				head = &stack->next; +				stack = stack->next; +				node->next = stack; +				stack = node; +			} else { +				*head = stack; +				head = &stack->next; +				stack = stack->next; +				goto again_operators; +			} +		} else { +			*head = node; +			head = &node->next; +		} +	} + +	for (; stack; stack = next) { +		next = stack->next; +		*head = stack; +		head = &stack->next; +	} + +	*head = NULL; + +	for (stack = tail, prev = NULL; stack; prev = stack, stack = next) { +		next = stack->next; +		stack->next = prev; +		if (stack->token->s[0] == '|' || stack->token->s[0] == ',') { +			prev = stack->next->next->next; +			stack->data = stack->next->next; +			stack->data->next = stack->next; +			stack->next = prev; +		} +	} + +	return prev; +} + + +static void +emit_and_free_rule(struct node *rule) +{ +	size_t index = 0; + +	rule->data = order_sentences(rule->data); +	emit_and_free_sentence(rule->data, &index); + +	printf("static struct libparser_rule rule_%zu = {\"%s\", &sentence_%zu_0};\n", nrule_names, rule->token->s, nrule_names); + +	if (nrule_names == rule_names_size) +		rule_names = ereallocn(rule_names, rule_names_size += 16, sizeof(*rule_names), 0); +	rule_names[nrule_names++] = estrdup(rule->token->s); +	free(rule->token); +	free(rule); +} + + +int +main(int argc, char *argv[]) +{ +	enum { +		IDENTIFIER, +		STRING, +		SYMBOL, +	} type; +	enum { +		NEW_RULE, +		EXPECT_EQUALS, +		EXPECT_OPERAND, +		EXPECT_OPERATOR +	} state = NEW_RULE; +	struct node *stack = NULL, *parent_node, *node; +	char *data; +	struct token **tokens; +	size_t i, j; +	int cmp, err; + +	ARGBEGIN { +	default: +		usage(); +	} ARGEND; + +	if (argc != 1 || !isidentifier(argv[0][0])) +		usage(); +	for (i = 0; argv[0][i]; i++) +		if (!isidentifier(argv[0][i]) && argv[0][i] != '-') +			usage(); + +	data = readall_and_validate(STDIN_FILENO, "<stdin>"); +	tokens = tokenise(data); +	free(data); + +	printf("#include <libparser.h>\n"); + +	i = 0; +again: +	for (; tokens[i]; i++) { +		if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') { +			for (i += 2; tokens[i] && tokens[i + 1]; i++) { +				if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') { +					i += 2; +					goto again; +				} +			} +			eprintf("premature end of file\n"); +		} + +		if (tokens[i]->s[0] == '"') { +			type = STRING; +		} else if (isidentifier(tokens[i]->s[0])) { +			type = IDENTIFIER; +		} else if (isspace(tokens[i]->s[0])) { +			free(tokens[i]); +			continue; +		} else { +			type = SYMBOL; +		} + +		switch (state) { +		case NEW_RULE: +			if (type != IDENTIFIER) { +				eprintf("expected an identifier on line %zu at column %zu (character %zu)\n", +				        tokens[i]->lineno, tokens[i]->column, tokens[i]->character); +			} +			stack = calloc(1, sizeof(*stack)); +			stack->token = tokens[i]; +			stack->head = &stack->data; +			state = EXPECT_EQUALS; +			for (j = 0; j < nrule_names; j++) { +				if (!strcmp(rule_names[j], tokens[i]->s)) { +					eprintf("duplicate definition of \"%s\" on line %zu at column %zu (character %zu)\n", +					        tokens[i]->s, tokens[i]->lineno, tokens[i]->column, tokens[i]->character); +				} +			} +			break; + +		case EXPECT_EQUALS: +			if (type != SYMBOL || tokens[i]->s[0] != '=') { +				eprintf("expected an '=' on line %zu at column %zu (character %zu)\n", +				        tokens[i]->lineno, tokens[i]->column, tokens[i]->character); +			} +			free(tokens[i]); +			state = EXPECT_OPERAND; +			break; + +		case EXPECT_OPERAND: +			if (type == SYMBOL) { +				if (tokens[i]->s[0] == '(' || tokens[i]->s[0] == '[' || tokens[i]->s[0] == '{') { +					parent_node = stack; +					stack = ecalloc(1, sizeof(*stack)); +					stack->parent = parent_node; +					stack->token = tokens[i]; +					stack->head = &stack->data; +				} else if (tokens[i]->s[0] == '-') { +					goto add; +				} else { +				stray: +					eprintf("stray '%c' on line %zu at column %zu (character %zu)\n", +					        tokens[i]->s[0], tokens[i]->lineno, tokens[i]->column, tokens[i]->character); +				} +			} else { +			add: +				state = EXPECT_OPERATOR; +				goto add_singleton; +			} +			break; + +		case EXPECT_OPERATOR: +			if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') { +				state = EXPECT_OPERAND; +			add_singleton: +				node = calloc(1, sizeof(*node)); +				node->token = tokens[i]; +				*stack->head = node; +				stack->head = &node->next; +			} else if (tokens[i]->s[0] == ')') { +				if (stack->token->s[0] != '(') +					goto stray; +				goto pop; +			} else if (tokens[i]->s[0] == ']') { +				if (stack->token->s[0] != '[') +					goto stray; +				goto pop; +			} else if (tokens[i]->s[0] == '}') { +				if (stack->token->s[0] != '{') +					goto stray; +			pop: +				free(tokens[i]); +				*stack->parent->head = stack; +				stack->parent->head = &stack->next; +				stack = stack->parent; +			} else if (tokens[i]->s[0] == ';') { +				if (stack->token->s[0] == ')' || stack->token->s[0] == ']' || stack->token->s[0] == '}') +					eprintf("premature end of rule on line %zu at column %zu (character %zu): " +					        "'%s' on line %zu at column %zu (character %zu) not closed\n", +					        tokens[i]->lineno, tokens[i]->column, tokens[i]->character, stack->token->s, +					        stack->token->lineno, stack->token->column, stack->token->character); +				emit_and_free_rule(stack); +				free(tokens[i]); +				state = NEW_RULE; +			} else { +				eprintf("expected a '|', ',', or '%c' on line %zu at column %zu (character %zu)\n", +				        stack->token->s[0] == '(' ? ')' : +				        stack->token->s[0] == '[' ? ']' : +				        stack->token->s[0] == '{' ? '}' : ';', +				        tokens[i]->lineno, tokens[i]->column, tokens[i]->character); +			} +			break; + +		default: +			abort(); +		} +	} +	free(tokens); +	if (state != NEW_RULE) +		eprintf("premature end of file\n"); + +	err = 0; +	qsort(rule_names, nrule_names, sizeof(*rule_names), strpcmp); +	qsort(want_rules, nwant_rules, sizeof(*want_rules), strpcmp); +	for (i = j = 0; i < nrule_names && j < nwant_rules;) { +		cmp = strcmp(rule_names[i], want_rules[j]); +		if (!cmp) { +			i++; +			for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++); +		} else if (!strcmp(rule_names[i], argv[0])) { +			i++; +		} else if (cmp < 0) { +			weprintf("rule \"%s\" defined but not used\n", rule_names[i]); +			i++; +			err = 1; +		} else { +			weprintf("rule \"%s\" used but not defined\n", want_rules[j]); +			for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++); +			err = 1; +		} +	} +	for (; i < nrule_names; i++) { +		if (strcmp(rule_names[i], argv[0])) { +			weprintf("rule \"%s\" defined but not used\n", rule_names[i]); +			err = 1; +		} +	} +	while (j < nwant_rules) { +		weprintf("rule \"%s\" used but not defined\n", want_rules[j]); +		for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++); +		err = 1; +	} +	if (err) +		exit(1); + +	for (i = 0; i < nrule_names; i++) +		if (!strcmp(rule_names[i], argv[0])) +			goto found_main; +	eprintf("specified main rule (\"%s\") was not defined\n", argv[0]); + +found_main: +	printf("static union libparser_sentence noeof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n"); +	printf("static struct libparser_rule noeof_rule = {\"@noeof\", &noeof_sentence};\n"); +	printf("static union libparser_sentence noeof_rule_sentence = {.rule = " +	           "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@noeof\"}" +	       "};\n"); + +	printf("static union libparser_sentence eof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EOF};\n"); +	printf("static struct libparser_rule eof_rule = {\"@eof\", &eof_sentence};\n"); +	printf("static union libparser_sentence eof_rule_sentence = {.rule = " +	           "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@eof\"}" +	       "};\n"); + +	printf("static union libparser_sentence end_sentence = {.binary = {" +	           ".type = LIBPARSER_SENTENCE_TYPE_ALTERNATION, " +	           ".left = &eof_rule_sentence, .right = &noeof_rule_sentence" +	       "}};\n"); + +	printf("static union libparser_sentence main_rule_sentence = {.rule = " +	           "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\"}" +	       "};\n", argv[0]); + +	printf("static union libparser_sentence main_sentence = {.binary = {" +	           ".type = LIBPARSER_SENTENCE_TYPE_CONCATENATION, " +	           ".left = &main_rule_sentence, .right = &end_sentence" +	       "}};\n"); +	printf("static struct libparser_rule main_rule = {\"@start\", &main_sentence};\n"); + +	printf("const struct libparser_rule *const libparser_rule_table[] = {\n"); +	for (i = 0; i < nrule_names; i++) { +		printf("\t&rule_%zu,\n", i); +		free(rule_names[i]); +	} +	printf("\t&eof_rule,\n"); +	printf("\t&noeof_rule,\n"); +	printf("\t&main_rule,\n"); +	printf("\tNULL\n};\n"); +	free(rule_names); +	for (i = 0; i < nwant_rules; i++) +		free(want_rules[i]); +	free(want_rules); + +	if (ferror(stdout) || fflush(stdout) || fclose(stdout)) +		eprintf("printf:"); +	return 0; +} diff --git a/libparser.c b/libparser.c new file mode 100644 index 0000000..9ee22a1 --- /dev/null +++ b/libparser.c @@ -0,0 +1,202 @@ +/* See LICENSE file for copyright and license details. */ +#include "libparser.h" +#include <libsimple.h> + + +struct context { +	const struct libparser_rule *const *rules; +	struct libparser_unit *cache; +	const char *data; +	size_t length; +	size_t position; +	int done; +	int exception; +}; + + +static void +free_unit(struct libparser_unit *unit, struct context *ctx) +{ +	struct libparser_unit *prev; +	while (unit) { +		free_unit(unit->in, ctx); +		prev = unit; +		unit = unit->next; +		prev->next = ctx->cache; +		ctx->cache = prev; +	} +} + + +static struct libparser_unit * +try_match(const char *rule, const union libparser_sentence *sentence, struct context *ctx) +{ +	struct libparser_unit *unit, *next; +	struct libparser_unit **head; +	unsigned char c; +	size_t i; + +	if (!ctx->cache) { +		unit = ecalloc(1, sizeof(*unit)); +	} else { +		unit = ctx->cache; +		ctx->cache = unit->next; +		unit->in = unit->next = NULL; +	} + +	unit->rule = rule; +	unit->start = ctx->position; + +	switch (sentence->type) { +	case LIBPARSER_SENTENCE_TYPE_CONCATENATION: +		unit->in = try_match(NULL, sentence->binary.left, ctx); +		if (!unit->in) +			goto mismatch; +		if (ctx->done) +			break; +		unit->in->next = try_match(NULL, sentence->binary.right, ctx); +		if (!unit->in->next) { +			free_unit(unit->in, ctx); +			goto mismatch; +		} +		if (!unit->in->next->rule || unit->in->next->rule[0] == '_') { +			unit->in->next->next = ctx->cache; +			ctx->cache = unit->in->next; +			unit->in->next = unit->in->next->in; +		} +		if (!unit->in->rule || unit->in->rule[0] == '_') { +			next = unit->in->next; +			unit->in->next = ctx->cache; +			ctx->cache = unit->in; +			unit->in = unit->in->in; +			if (unit->in) { +				for (head = &unit->in->next; *head; head = &(*head)->next); +				*head = next; +			} else { +				unit->in = next; +			} +		} +		break; + +	case LIBPARSER_SENTENCE_TYPE_ALTERNATION: +		unit->in = try_match(NULL, sentence->binary.left, ctx); +		if (!unit->in) { +			unit->in = try_match(NULL, sentence->binary.right, ctx); +			if (!unit->in) +				goto mismatch; +		} +	prone: +		if (unit->in && (!unit->in->rule || unit->in->rule[0] == '_')) { +			unit->in->next = ctx->cache; +			ctx->cache = unit->in; +			unit->in = unit->in->in; +		} +		break; + +	case LIBPARSER_SENTENCE_TYPE_OPTIONAL: +		unit->in = try_match(NULL, sentence->unary.sentence, ctx); +		goto prone; + +	case LIBPARSER_SENTENCE_TYPE_REPEATED: +		head = &unit->in; +		while ((*head = try_match(NULL, sentence->unary.sentence, ctx))) { +			if (!(*head)->rule || (*head)->rule[0] == '_') { +				(*head)->next = ctx->cache; +				ctx->cache = *head; +				*head = (*head)->in; +				while (*head) +					head = &(*head)->next; +			} else { +				head = &(*head)->next; +			} +			if (ctx->done) +				break; +		} +		break; + +	case LIBPARSER_SENTENCE_TYPE_STRING: +		if (sentence->string.length > ctx->length - ctx->position) +			goto mismatch; +		if (memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length)) +			goto mismatch; +		ctx->position += sentence->string.length; +		break; + +	case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: +		if (ctx->position == ctx->length) +			goto mismatch; +		c = ((const unsigned char *)ctx->data)[ctx->position]; +		if (sentence->char_range.low > c || c > sentence->char_range.high) +			goto mismatch; +		ctx->position += 1; +		break; + +	case LIBPARSER_SENTENCE_TYPE_RULE: +		for (i = 0; ctx->rules[i]; i++) +			if (!strcmp(ctx->rules[i]->name, sentence->rule.rule)) +				break; +		if (!ctx->rules[i]) +			abort(); +		unit->in = try_match(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx); +		if (!unit->in) +			goto mismatch; +		goto prone; + +	case LIBPARSER_SENTENCE_TYPE_EXCEPTION: +		ctx->done = 1; +		ctx->exception = 1; +		break; + +	case LIBPARSER_SENTENCE_TYPE_EOF: +		if (ctx->position != ctx->length) +			goto mismatch; +		ctx->done = 1; +		break; + +	default: +		abort(); +	} + +	unit->end = ctx->position; +	return unit; + +mismatch: +	ctx->position = unit->start; +	unit->next = ctx->cache; +	ctx->cache = unit; +	return NULL; +} + + +struct libparser_unit * +libparser_parse_file(const struct libparser_rule *const rules[], const char *data, size_t length, int *exceptionp) +{ +	struct libparser_unit *ret, *t; +	struct context ctx; +	size_t i; + +	ctx.rules = rules; +	ctx.cache = NULL; +	ctx.data = data; +	ctx.length = length; +	ctx.position = 0; +	ctx.done = 0; +	ctx.exception = 0; + +	for (i = 0; rules[i]; i++) +		if (!strcmp(rules[i]->name, "@start")) +			break; +	if (!rules[i]) +		abort(); + +	ret = try_match(rules[i]->name, rules[i]->sentence, &ctx); +	*exceptionp = ctx.exception; + +	while (ctx.cache) { +		t = ctx.cache; +		ctx.cache = t->next; +		free(t); +	} + +	return ret; +} diff --git a/libparser.h b/libparser.h new file mode 100644 index 0000000..74ab319 --- /dev/null +++ b/libparser.h @@ -0,0 +1,73 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> + +union libparser_sentence; + +enum libparser_sentence_type { +	LIBPARSER_SENTENCE_TYPE_CONCATENATION, /* .binary */ +	LIBPARSER_SENTENCE_TYPE_ALTERNATION,   /* .binary */ +	LIBPARSER_SENTENCE_TYPE_OPTIONAL,      /* .unary */ +	LIBPARSER_SENTENCE_TYPE_REPEATED,      /* .unary */ +	LIBPARSER_SENTENCE_TYPE_STRING,        /* .string */ +	LIBPARSER_SENTENCE_TYPE_CHAR_RANGE,    /* .char_range */ /* TODO not supported yet: <low, high> */ +	LIBPARSER_SENTENCE_TYPE_RULE,          /* .rule */ +	LIBPARSER_SENTENCE_TYPE_EXCEPTION,     /* (none) */ +	LIBPARSER_SENTENCE_TYPE_EOF            /* (none) */ +}; + +struct libparser_sentence_binary { +	enum libparser_sentence_type type; +	const union libparser_sentence *left; +	const union libparser_sentence *right; +}; + +struct libparser_sentence_unary { +	enum libparser_sentence_type type; +	const union libparser_sentence *sentence; +}; + +struct libparser_sentence_string { +	enum libparser_sentence_type type; +	const char *string; +	size_t length; +}; + +struct libparser_sentence_char_range { +	enum libparser_sentence_type type; +	unsigned char low; +	unsigned char high; +}; + +struct libparser_sentence_rule { +	enum libparser_sentence_type type; +	const char *rule; +}; + +union libparser_sentence {  +	enum libparser_sentence_type type; +	struct libparser_sentence_binary binary; +	struct libparser_sentence_unary unary; +	struct libparser_sentence_string string; +	struct libparser_sentence_char_range char_range; +	struct libparser_sentence_rule rule; +}; + +struct libparser_rule { +	const char *name; +	union libparser_sentence *sentence; +}; + +struct libparser_unit { +	const char *rule; +	struct libparser_unit *in; +	struct libparser_unit *next; +	size_t start; +	size_t end; +}; + + +extern const struct libparser_rule *const libparser_rule_table[]; + + +struct libparser_unit *libparser_parse_file(const struct libparser_rule *const rules[], +                                            const char *data, size_t length, int *exceptionp); | 
