diff options
author | Mattias Andrée <maandree@kth.se> | 2021-04-17 12:09:12 +0200 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2021-04-17 12:09:12 +0200 |
commit | a5218fce6b19591c9412a84a5088c260c0676f61 (patch) | |
tree | 038f8b8e1064b4f61c831907ac4f20b77711f289 | |
download | libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.gz libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.bz2 libparser-a5218fce6b19591c9412a84a5088c260c0676f61.tar.xz |
First commit
Signed-off-by: Mattias Andrée <maandree@kth.se>
Diffstat (limited to '')
-rw-r--r-- | .gitignore | 10 | ||||
-rw-r--r-- | LICENSE | 15 | ||||
-rw-r--r-- | Makefile | 37 | ||||
-rw-r--r-- | TODO | 1 | ||||
-rw-r--r-- | calc-example/calc.c | 142 | ||||
-rw-r--r-- | calc-example/calc.syntax | 27 | ||||
-rw-r--r-- | config.mk | 8 | ||||
-rw-r--r-- | libparser-generate.c | 664 | ||||
-rw-r--r-- | libparser.c | 202 | ||||
-rw-r--r-- | libparser.h | 73 |
10 files changed, 1179 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9988a52 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +*\#* +*~ +*.o +*.a +*.lo +*.so +*.su +/libparser-generate +/calc-example/calc +/calc-example/calc-syntax.c @@ -0,0 +1,15 @@ +ISC License + +© 2021 Mattias Andrée <maandree@kth.se> + +Permission to use, copy, modify, and/or distribute this software for any +purpose with or without fee is hereby granted, provided that the above +copyright notice and this permission notice appear in all copies. + +THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c8628b0 --- /dev/null +++ b/Makefile @@ -0,0 +1,37 @@ +.POSIX: + +CONFIGFILE = config.mk +include $(CONFIGFILE) + + +all: libparser.a libparser-generate calc-example/calc +libparser.o: libparser.c libparser.h +calc-example/calc-syntax.o: calc-example/calc-syntax.c libparser.h + +.c.o: + $(CC) -c -o $@ $< $(CPPFLAGS) $(CFLAGS) + +.c.lo: + $(CC) -fPIC -c -o $@ $< $(CPPFLAGS) $(CFLAGS) + +libparser-generate: libparser-generate.o + $(CC) -o $@ libparser-generate.o $(LDFLAGS) + +libparser.a: libparser.o + $(AR) rc $@ libparser.o + $(AR) -s $@ + +calc-example/calc: calc-example/calc.o calc-example/calc-syntax.o libparser.a + $(CC) -o $@ calc-example/calc.o calc-example/calc-syntax.o libparser.a $(LDFLAGS) + +calc-example/calc-syntax.c: libparser-generate calc-example/calc.syntax + ./libparser-generate _expr < calc-example/calc.syntax > $@ + +clean: + -rm -f -- *.o *.lo *.a *.so *.su *-example/*.o *-example/*.su *-example/*-syntax.c libparser-generate + -rm -f -- calc-example/calc + +.SUFFIXES: +.SUFFIXES: .c .o .lo + +.PHONY: all clean @@ -0,0 +1 @@ +Add support for prelexed. diff --git a/calc-example/calc.c b/calc-example/calc.c new file mode 100644 index 0000000..adda2ba --- /dev/null +++ b/calc-example/calc.c @@ -0,0 +1,142 @@ +/* See LICENSE file for copyright and license details. */ +#include <libparser.h> +#include <libsimple.h> +#include <libsimple-arg.h> + +USAGE(""); + + +static void +free_input(struct libparser_unit *node) +{ + struct libparser_unit *next; + for (; node; next = node->next, free(node), node = next) + free_input(node->in); +} + + +static intmax_t +calculate(struct libparser_unit *node, const char *line) +{ + struct libparser_unit *next; + intmax_t value = 0; + int op; + if (!node->rule) { + next = node->in->next; + value = calculate(node->in, line); + free_input(next); + } else if (!strcmp(node->rule, "DIGIT")) { + value = (intmax_t)(line[node->start] - '0'); + } else if (!strcmp(node->rule, "sign")) { + value = !strcmp(node->in->rule, "SUB") ? -1 : +1; + free(node->in); + } else if (!strcmp(node->rule, "unsigned")) { + value = 0; + next = node->in; + free(node); + for (node = next; node; node = next) { + next = node->next; + value *= 10; + value += calculate(node, line); + } + } else if (!strcmp(node->rule, "number")) { + next = node->in->next; + value = calculate(node->in, line); + free(node); + for (node = next; node; node = next) { + next = node->next; + value *= calculate(node, line); + } + } else if (!strcmp(node->rule, "value")) { + next = node->in->next; + value = calculate(node->in, line); + if (next) + value *= calculate(next, line); + } else if (!strcmp(node->rule, "hyper1")) { + next = node->in->next; + value = calculate(node->in, line); + free(node); + node = next; + while (node) { + next = node->next; + op = !strcmp(node->rule, "SUB") ? -1 : +1; + free(node); + node = next; + next = node->next; + if (op < 0) + value -= calculate(node, line); + else + value += calculate(node, line); + node = next; + } + } else if (!strcmp(node->rule, "hyper2")) { + next = node->in->next; + value = calculate(node->in, line); + free(node); + node = next; + while (node) { + next = node->next; + op = !strcmp(node->rule, "DIV") ? -1 : +1; + free(node); + node = next; + next = node->next; + if (op < 0) + value /= calculate(node, line); + else + value *= calculate(node, line); + node = next; + } + } else if (node->rule[0] != '@') { + abort(); + } else if (node->in) { + next = node->in->next; + value = calculate(node->in, line); + if (next) + free_input(next); + } + free(node); + return value; +} + + +int +main(int argc, char *argv[]) +{ + struct libparser_unit *input; + char *line = NULL; + size_t size = 0; + ssize_t len; + intmax_t res; + int exception; + + ARGBEGIN { + default: + usage(); + } ARGEND; + if (argc) + usage(); + + while ((len = getline(&line, &size, stdin)) >= 0) { + if (len && line[len - 1] == '\n') + line[--len] = '\0'; + input = libparser_parse_file(libparser_rule_table, line, (size_t)len, &exception); + if (!input) { + weprintf("didn't find anything to parse\n"); + free_input(input); + continue; + } else if (input->end != (size_t)len) { + weprintf("line could not be parsed, stopped at column %zu\n", input->end); + free_input(input); + continue; + } else if (exception) { + weprintf("premature end of line\n"); + free_input(input); + continue; + } + res = calculate(input, line); + printf("%ji\n", res); + } + + free(line); + return 0; +} diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax new file mode 100644 index 0000000..ef7a9f4 --- /dev/null +++ b/calc-example/calc.syntax @@ -0,0 +1,27 @@ +_WHITESPACE = " " | "\t" | " "; +_ = {_WHITESPACE}; + + +DIGIT = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"; + +ADD = _, ("+"), _; +SUB = _, ("-" | "−"), _; +MUL = _, ("*" | "⋅" | "×"), _; +DIV = _, ("/" | "∕" | "÷"), _; + + +sign = ADD | SUB; + +unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"}; + +_number = unsigned | "(", _expr, (")" | -); + +number = _number, {_, _number}; (* optionally with implicit multiplication *) + +value = [sign], number; + +_expr = hyper1; + + +hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _; +hyper2 = _, value, {(MUL | DIV), (value | -)}, _; diff --git a/config.mk b/config.mk new file mode 100644 index 0000000..ac056a6 --- /dev/null +++ b/config.mk @@ -0,0 +1,8 @@ +PREFIX = /usr +MANPREFIX = $(PREFIX)/share/man + +CC = cc + +CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)" +CFLAGS = -std=c99 -Wall -O2 +LDFLAGS = -s -lsimple diff --git a/libparser-generate.c b/libparser-generate.c new file mode 100644 index 0000000..f5f19ae --- /dev/null +++ b/libparser-generate.c @@ -0,0 +1,664 @@ +/* See LICENSE file for copyright and license details. */ +#include <libsimple.h> +#include <libsimple-arg.h> + +USAGE("main-rule"); + + +struct token { + size_t lineno; + size_t column; + size_t character; + char s[]; +}; + +struct node { + struct token *token; + struct node *parent; + struct node *next; + struct node *data; + struct node **head; +}; + + +static char **rule_names = NULL; +static size_t nrule_names = 0; +static size_t rule_names_size = 0; + +static char **want_rules = NULL; +static size_t nwant_rules = 0; +static size_t want_rules_size = 0; + + +static int +strpcmp(const void *av, const void *bv) +{ + const char *const *a = av; + const char *const *b = bv; + return strcmp(*a, *b); +} + + +static int +isidentifier(char c) +{ + return isalpha(c) || isdigit(c) || !isascii(c) || c == '_'; +} + + +static int +check_utf8(char *buf, size_t *ip, size_t len) +{ + size_t req, i; + uint32_t cp; + if ((buf[*ip] & 0xE0) == 0xC0) { + cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xC0); + req = 2; + } else if ((buf[*ip] & 0xF0) == 0xE0) { + cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xE0); + req = 3; + } else if ((buf[*ip] & 0xF8) == 0xF0) { + cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xF0); + req = 4; + } else { + return 0; + } + if (req > len - *ip) + return 0; + for (i = 1; i < req; i++) { + cp <<= 6; + if ((buf[*ip + i] & 0xC0) != 0x80) + return 0; + cp |= (uint32_t)(unsigned char)(buf[*ip + i] ^ 0x80); + } + *ip += req; + if ((cp & UINT32_C(0xD8000)) == UINT32_C(0xD8000)) + return 0; + if (cp < (uint32_t)1 << (7 + 0 * 6)) + return 0; + if (cp < (uint32_t)1 << (5 + 1 * 6)) + return req == 2; + if (cp < (uint32_t)1 << (4 + 2 * 6)) + return req == 3; + if (cp <= UINT32_C(0x10FFFF)) + return req == 4; + return 0; +} + + +static char * +readall_and_validate(int fd, const char *fname) +{ + size_t lineno = 1, column = 0, character = 0; + size_t size = 0, len = 0, i; + char *buf = NULL; + ssize_t r; + + for (;; len += (size_t)r) { + if (len == size) + buf = erealloc(buf, size += 1024); + r = read(fd, &buf[len], size - len); + if (r <= 0) { + if (!r) + break; + eprintf("read %s:", fname); + } + } + + for (i = 0; i < len; i++) { + if (buf[i] == '\n') { + lineno += 1; + column = 0; + character = 0; + } else if (buf[i] == '\t') { + column += 8 - column % 8; + character += 1; + } else if (buf[i] == '\r') { + eprintf("%s contains a CR character on line %zu at column %zu (character %zu)\n", + fname, lineno, column, character); + } else if ((0 < buf[i] && buf[i] < ' ') || buf[i] == 0x7F) { + eprintf("%s contains a illegal character on line %zu at column %zu (character %zu)\n", + fname, lineno, column, character); + } else if (buf[i] == '\0') { + eprintf("%s contains a NUL byte on line %zu at column %zu (character %zu)\n", + fname, lineno, column, character); + } else if (!(buf[i] & 0x80)) { + character += 1; + column += 1; + } else if ((buf[i] & 0xC0) == 0x80) { + eprintf("%s contains a illegal byte on line %zu at column %zu (character %zu)\n", + fname, lineno, column, character); + } else { + if (!check_utf8(buf, &i, len)) { + eprintf("%s contains a illegal byte sequence on line %zu at column %zu (character %zu)\n", + fname, lineno, column, character); + } + i--; + character += 1; + column += 1; + } + } + + buf = erealloc(buf, len + 1); + buf[len] = '\0'; + + return buf; +} + + +static struct token ** +tokenise(const char *data) +{ + enum { + NEW_TOKEN, + IDENTIFIER, + STRING, + STRING_ESC, + SPACE + } state = NEW_TOKEN; + size_t lineno = 1, column = 0, character = 0; + size_t token_lineno = 0, token_column = 0, token_character = 0; + struct token **tokens = NULL; + char *token = NULL; + size_t i, ntokens = 0, tokens_size = 0; + size_t token_len = 0, token_size = 0; + + for (i = 0; data[i]; i++) { + again: + switch (state) { + case NEW_TOKEN: + token_lineno = lineno; + token_column = column; + token_character = character; + if (token_len == token_size) + token = erealloc(token, token_size += 16); + token[token_len++] = data[i]; + if (isidentifier(data[i])) { + state = IDENTIFIER; + } else if (isspace(data[i])) { + state = SPACE; + } else if (data[i] == '"') { + state = STRING; + if (data[i + 1] == '"') { + eprintf("empty string token on line %zu at column %zu (character %zu)\n", + lineno, column, character); + } + } else { + add_token: + if (token_len == token_size) + token = erealloc(token, token_size += 16); + token[token_len++] = '\0'; + if (ntokens == tokens_size) + tokens = ereallocn(tokens, tokens_size += 16, sizeof(*tokens), 0); + tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len); + tokens[ntokens]->lineno = token_lineno; + tokens[ntokens]->column = token_column; + tokens[ntokens]->character = token_character; + stpcpy(tokens[ntokens++]->s, token); + token_len = 0; + state = NEW_TOKEN; + } + break; + + case IDENTIFIER: + if (isidentifier(data[i]) || data[i] == '-') { + add_char: + if (token_len == token_size) + token = erealloc(token, token_size += 16); + token[token_len++] = data[i]; + } else { + add_token_and_do_again: + if (token_len == token_size) + token = erealloc(token, token_size += 16); + token[token_len++] = '\0'; + if (ntokens == tokens_size) + tokens = ereallocn(tokens, tokens_size += 16, sizeof(*tokens), 0); + tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len); + tokens[ntokens]->lineno = token_lineno; + tokens[ntokens]->column = token_column; + tokens[ntokens]->character = token_character; + stpcpy(tokens[ntokens++]->s, token); + token_len = 0; + state = NEW_TOKEN; + goto again; + } + break; + + case STRING: + if (data[i] == '\n' || data[i] == '\t') { + eprintf("illegal whitespace on line %zu at column %zu (character %zu)\n", + lineno, column, character); + } else if (data[i] == '"') { + goto add_token; + } else if (data[i] == '\\') { + state = STRING_ESC; + goto add_char; + } else { + goto add_char; + } + break; + + case STRING_ESC: + if (data[i] == '\n' || data[i] == '\t') { + eprintf("illegal whitespace on line %zu at column %zu (character %zu)\n", + lineno, column, character); + } + if (token_len == token_size) + token = erealloc(token, token_size += 16); + token[token_len++] = data[i]; + state = STRING; + break; + + case SPACE: + if (isspace(data[i])) { + goto add_char; + } else { + goto add_token_and_do_again; + } + break; + + default: + abort(); + }; + + if (data[i] == '\n') { + lineno += 1; + column = 0; + character = 0; + } else if (data[i] == '\t') { + column += 8 - column % 8; + character += 1; + } else { + character += (data[i] & 0xC0) != 0x80; + column += 1; + } + } + if (state != NEW_TOKEN && state != SPACE) + eprintf("premature end of file\n"); + + tokens = ereallocn(tokens, ntokens + 1, sizeof(*tokens), 0); + tokens[ntokens] = NULL; + free(token); + + return tokens; +} + + +static void +emit_and_free_sentence(struct node *node, size_t *indexp) +{ + size_t index = (*indexp)++, left, right; + struct node *next; + + for (; node->token->s[0] == '('; node = next) { + next = node->data; + free(node->token); + free(node); + } + + if (node->token->s[0] == '[' || node->token->s[0] == '{') { + emit_and_free_sentence(node->data, indexp); + printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {" + ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu" + "}};\n", + nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1); + } else if (node->token->s[0] == '|' || node->token->s[0] == ',') { + right = *indexp; + emit_and_free_sentence(node->data->next, indexp); + left = *indexp; + emit_and_free_sentence(node->data, indexp); + printf("static union libparser_sentence sentence_%zu_%zu = {.binary = {" + ".type = LIBPARSER_SENTENCE_TYPE_%s, " + ".left = &sentence_%zu_%zu, .right = &sentence_%zu_%zu" + "}};\n", + nrule_names, index, node->token->s[0] == '|' ? "ALTERNATION" : "CONCATENATION", + nrule_names, left, nrule_names, right); + } else if (node->token->s[0] == '"') { + printf("static union libparser_sentence sentence_%zu_%zu = {.string = {" + ".type = LIBPARSER_SENTENCE_TYPE_STRING, " + ".string = %s\", .length = sizeof(%s\") - 1" + "}};\n", + nrule_names, index, node->token->s, node->token->s); + } else if (node->token->s[0] == '-') { + printf("static union libparser_sentence sentence_%zu_%zu = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n", + nrule_names, index); + } else { + if (nwant_rules == want_rules_size) + want_rules = ereallocn(want_rules, want_rules_size += 16, sizeof(*want_rules), 0); + want_rules[nwant_rules++] = estrdup(node->token->s); + printf("static union libparser_sentence sentence_%zu_%zu = {.rule = {" + ".type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\"" + "}};\n", + nrule_names, index, node->token->s); + } + + free(node->token); + free(node); +} + + +static struct node * +order_sentences(struct node *node) +{ + struct node *tail = NULL, **head = &tail; + struct node *stack = NULL; + struct node *next, *prev; + + for (; node; node = next) { + next = node->next; + if (node->token->s[0] == '(' || node->token->s[0] == '[' || node->token->s[0] == '{') { + node->data = order_sentences(node->data); + *head = node; + head = &node->next; + } else if (node->token->s[0] == '|' || node->token->s[0] == ',') { + again_operators: + if (!stack) { + node->next = stack; + stack = node; + } else if (node->token->s[0] == ',' && stack->token->s[0] == '|') { + node->next = stack; + stack = node; + } else if (node->token->s[0] == stack->token->s[0]) { + *head = stack; + head = &stack->next; + stack = stack->next; + node->next = stack; + stack = node; + } else { + *head = stack; + head = &stack->next; + stack = stack->next; + goto again_operators; + } + } else { + *head = node; + head = &node->next; + } + } + + for (; stack; stack = next) { + next = stack->next; + *head = stack; + head = &stack->next; + } + + *head = NULL; + + for (stack = tail, prev = NULL; stack; prev = stack, stack = next) { + next = stack->next; + stack->next = prev; + if (stack->token->s[0] == '|' || stack->token->s[0] == ',') { + prev = stack->next->next->next; + stack->data = stack->next->next; + stack->data->next = stack->next; + stack->next = prev; + } + } + + return prev; +} + + +static void +emit_and_free_rule(struct node *rule) +{ + size_t index = 0; + + rule->data = order_sentences(rule->data); + emit_and_free_sentence(rule->data, &index); + + printf("static struct libparser_rule rule_%zu = {\"%s\", &sentence_%zu_0};\n", nrule_names, rule->token->s, nrule_names); + + if (nrule_names == rule_names_size) + rule_names = ereallocn(rule_names, rule_names_size += 16, sizeof(*rule_names), 0); + rule_names[nrule_names++] = estrdup(rule->token->s); + free(rule->token); + free(rule); +} + + +int +main(int argc, char *argv[]) +{ + enum { + IDENTIFIER, + STRING, + SYMBOL, + } type; + enum { + NEW_RULE, + EXPECT_EQUALS, + EXPECT_OPERAND, + EXPECT_OPERATOR + } state = NEW_RULE; + struct node *stack = NULL, *parent_node, *node; + char *data; + struct token **tokens; + size_t i, j; + int cmp, err; + + ARGBEGIN { + default: + usage(); + } ARGEND; + + if (argc != 1 || !isidentifier(argv[0][0])) + usage(); + for (i = 0; argv[0][i]; i++) + if (!isidentifier(argv[0][i]) && argv[0][i] != '-') + usage(); + + data = readall_and_validate(STDIN_FILENO, "<stdin>"); + tokens = tokenise(data); + free(data); + + printf("#include <libparser.h>\n"); + + i = 0; +again: + for (; tokens[i]; i++) { + if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') { + for (i += 2; tokens[i] && tokens[i + 1]; i++) { + if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') { + i += 2; + goto again; + } + } + eprintf("premature end of file\n"); + } + + if (tokens[i]->s[0] == '"') { + type = STRING; + } else if (isidentifier(tokens[i]->s[0])) { + type = IDENTIFIER; + } else if (isspace(tokens[i]->s[0])) { + free(tokens[i]); + continue; + } else { + type = SYMBOL; + } + + switch (state) { + case NEW_RULE: + if (type != IDENTIFIER) { + eprintf("expected an identifier on line %zu at column %zu (character %zu)\n", + tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + stack = calloc(1, sizeof(*stack)); + stack->token = tokens[i]; + stack->head = &stack->data; + state = EXPECT_EQUALS; + for (j = 0; j < nrule_names; j++) { + if (!strcmp(rule_names[j], tokens[i]->s)) { + eprintf("duplicate definition of \"%s\" on line %zu at column %zu (character %zu)\n", + tokens[i]->s, tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + } + break; + + case EXPECT_EQUALS: + if (type != SYMBOL || tokens[i]->s[0] != '=') { + eprintf("expected an '=' on line %zu at column %zu (character %zu)\n", + tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + free(tokens[i]); + state = EXPECT_OPERAND; + break; + + case EXPECT_OPERAND: + if (type == SYMBOL) { + if (tokens[i]->s[0] == '(' || tokens[i]->s[0] == '[' || tokens[i]->s[0] == '{') { + parent_node = stack; + stack = ecalloc(1, sizeof(*stack)); + stack->parent = parent_node; + stack->token = tokens[i]; + stack->head = &stack->data; + } else if (tokens[i]->s[0] == '-') { + goto add; + } else { + stray: + eprintf("stray '%c' on line %zu at column %zu (character %zu)\n", + tokens[i]->s[0], tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + } else { + add: + state = EXPECT_OPERATOR; + goto add_singleton; + } + break; + + case EXPECT_OPERATOR: + if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') { + state = EXPECT_OPERAND; + add_singleton: + node = calloc(1, sizeof(*node)); + node->token = tokens[i]; + *stack->head = node; + stack->head = &node->next; + } else if (tokens[i]->s[0] == ')') { + if (stack->token->s[0] != '(') + goto stray; + goto pop; + } else if (tokens[i]->s[0] == ']') { + if (stack->token->s[0] != '[') + goto stray; + goto pop; + } else if (tokens[i]->s[0] == '}') { + if (stack->token->s[0] != '{') + goto stray; + pop: + free(tokens[i]); + *stack->parent->head = stack; + stack->parent->head = &stack->next; + stack = stack->parent; + } else if (tokens[i]->s[0] == ';') { + if (stack->token->s[0] == ')' || stack->token->s[0] == ']' || stack->token->s[0] == '}') + eprintf("premature end of rule on line %zu at column %zu (character %zu): " + "'%s' on line %zu at column %zu (character %zu) not closed\n", + tokens[i]->lineno, tokens[i]->column, tokens[i]->character, stack->token->s, + stack->token->lineno, stack->token->column, stack->token->character); + emit_and_free_rule(stack); + free(tokens[i]); + state = NEW_RULE; + } else { + eprintf("expected a '|', ',', or '%c' on line %zu at column %zu (character %zu)\n", + stack->token->s[0] == '(' ? ')' : + stack->token->s[0] == '[' ? ']' : + stack->token->s[0] == '{' ? '}' : ';', + tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + break; + + default: + abort(); + } + } + free(tokens); + if (state != NEW_RULE) + eprintf("premature end of file\n"); + + err = 0; + qsort(rule_names, nrule_names, sizeof(*rule_names), strpcmp); + qsort(want_rules, nwant_rules, sizeof(*want_rules), strpcmp); + for (i = j = 0; i < nrule_names && j < nwant_rules;) { + cmp = strcmp(rule_names[i], want_rules[j]); + if (!cmp) { + i++; + for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++); + } else if (!strcmp(rule_names[i], argv[0])) { + i++; + } else if (cmp < 0) { + weprintf("rule \"%s\" defined but not used\n", rule_names[i]); + i++; + err = 1; + } else { + weprintf("rule \"%s\" used but not defined\n", want_rules[j]); + for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++); + err = 1; + } + } + for (; i < nrule_names; i++) { + if (strcmp(rule_names[i], argv[0])) { + weprintf("rule \"%s\" defined but not used\n", rule_names[i]); + err = 1; + } + } + while (j < nwant_rules) { + weprintf("rule \"%s\" used but not defined\n", want_rules[j]); + for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++); + err = 1; + } + if (err) + exit(1); + + for (i = 0; i < nrule_names; i++) + if (!strcmp(rule_names[i], argv[0])) + goto found_main; + eprintf("specified main rule (\"%s\") was not defined\n", argv[0]); + +found_main: + printf("static union libparser_sentence noeof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n"); + printf("static struct libparser_rule noeof_rule = {\"@noeof\", &noeof_sentence};\n"); + printf("static union libparser_sentence noeof_rule_sentence = {.rule = " + "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@noeof\"}" + "};\n"); + + printf("static union libparser_sentence eof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EOF};\n"); + printf("static struct libparser_rule eof_rule = {\"@eof\", &eof_sentence};\n"); + printf("static union libparser_sentence eof_rule_sentence = {.rule = " + "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@eof\"}" + "};\n"); + + printf("static union libparser_sentence end_sentence = {.binary = {" + ".type = LIBPARSER_SENTENCE_TYPE_ALTERNATION, " + ".left = &eof_rule_sentence, .right = &noeof_rule_sentence" + "}};\n"); + + printf("static union libparser_sentence main_rule_sentence = {.rule = " + "{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\"}" + "};\n", argv[0]); + + printf("static union libparser_sentence main_sentence = {.binary = {" + ".type = LIBPARSER_SENTENCE_TYPE_CONCATENATION, " + ".left = &main_rule_sentence, .right = &end_sentence" + "}};\n"); + printf("static struct libparser_rule main_rule = {\"@start\", &main_sentence};\n"); + + printf("const struct libparser_rule *const libparser_rule_table[] = {\n"); + for (i = 0; i < nrule_names; i++) { + printf("\t&rule_%zu,\n", i); + free(rule_names[i]); + } + printf("\t&eof_rule,\n"); + printf("\t&noeof_rule,\n"); + printf("\t&main_rule,\n"); + printf("\tNULL\n};\n"); + free(rule_names); + for (i = 0; i < nwant_rules; i++) + free(want_rules[i]); + free(want_rules); + + if (ferror(stdout) || fflush(stdout) || fclose(stdout)) + eprintf("printf:"); + return 0; +} diff --git a/libparser.c b/libparser.c new file mode 100644 index 0000000..9ee22a1 --- /dev/null +++ b/libparser.c @@ -0,0 +1,202 @@ +/* See LICENSE file for copyright and license details. */ +#include "libparser.h" +#include <libsimple.h> + + +struct context { + const struct libparser_rule *const *rules; + struct libparser_unit *cache; + const char *data; + size_t length; + size_t position; + int done; + int exception; +}; + + +static void +free_unit(struct libparser_unit *unit, struct context *ctx) +{ + struct libparser_unit *prev; + while (unit) { + free_unit(unit->in, ctx); + prev = unit; + unit = unit->next; + prev->next = ctx->cache; + ctx->cache = prev; + } +} + + +static struct libparser_unit * +try_match(const char *rule, const union libparser_sentence *sentence, struct context *ctx) +{ + struct libparser_unit *unit, *next; + struct libparser_unit **head; + unsigned char c; + size_t i; + + if (!ctx->cache) { + unit = ecalloc(1, sizeof(*unit)); + } else { + unit = ctx->cache; + ctx->cache = unit->next; + unit->in = unit->next = NULL; + } + + unit->rule = rule; + unit->start = ctx->position; + + switch (sentence->type) { + case LIBPARSER_SENTENCE_TYPE_CONCATENATION: + unit->in = try_match(NULL, sentence->binary.left, ctx); + if (!unit->in) + goto mismatch; + if (ctx->done) + break; + unit->in->next = try_match(NULL, sentence->binary.right, ctx); + if (!unit->in->next) { + free_unit(unit->in, ctx); + goto mismatch; + } + if (!unit->in->next->rule || unit->in->next->rule[0] == '_') { + unit->in->next->next = ctx->cache; + ctx->cache = unit->in->next; + unit->in->next = unit->in->next->in; + } + if (!unit->in->rule || unit->in->rule[0] == '_') { + next = unit->in->next; + unit->in->next = ctx->cache; + ctx->cache = unit->in; + unit->in = unit->in->in; + if (unit->in) { + for (head = &unit->in->next; *head; head = &(*head)->next); + *head = next; + } else { + unit->in = next; + } + } + break; + + case LIBPARSER_SENTENCE_TYPE_ALTERNATION: + unit->in = try_match(NULL, sentence->binary.left, ctx); + if (!unit->in) { + unit->in = try_match(NULL, sentence->binary.right, ctx); + if (!unit->in) + goto mismatch; + } + prone: + if (unit->in && (!unit->in->rule || unit->in->rule[0] == '_')) { + unit->in->next = ctx->cache; + ctx->cache = unit->in; + unit->in = unit->in->in; + } + break; + + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: + unit->in = try_match(NULL, sentence->unary.sentence, ctx); + goto prone; + + case LIBPARSER_SENTENCE_TYPE_REPEATED: + head = &unit->in; + while ((*head = try_match(NULL, sentence->unary.sentence, ctx))) { + if (!(*head)->rule || (*head)->rule[0] == '_') { + (*head)->next = ctx->cache; + ctx->cache = *head; + *head = (*head)->in; + while (*head) + head = &(*head)->next; + } else { + head = &(*head)->next; + } + if (ctx->done) + break; + } + break; + + case LIBPARSER_SENTENCE_TYPE_STRING: + if (sentence->string.length > ctx->length - ctx->position) + goto mismatch; + if (memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length)) + goto mismatch; + ctx->position += sentence->string.length; + break; + + case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: + if (ctx->position == ctx->length) + goto mismatch; + c = ((const unsigned char *)ctx->data)[ctx->position]; + if (sentence->char_range.low > c || c > sentence->char_range.high) + goto mismatch; + ctx->position += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_RULE: + for (i = 0; ctx->rules[i]; i++) + if (!strcmp(ctx->rules[i]->name, sentence->rule.rule)) + break; + if (!ctx->rules[i]) + abort(); + unit->in = try_match(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx); + if (!unit->in) + goto mismatch; + goto prone; + + case LIBPARSER_SENTENCE_TYPE_EXCEPTION: + ctx->done = 1; + ctx->exception = 1; + break; + + case LIBPARSER_SENTENCE_TYPE_EOF: + if (ctx->position != ctx->length) + goto mismatch; + ctx->done = 1; + break; + + default: + abort(); + } + + unit->end = ctx->position; + return unit; + +mismatch: + ctx->position = unit->start; + unit->next = ctx->cache; + ctx->cache = unit; + return NULL; +} + + +struct libparser_unit * +libparser_parse_file(const struct libparser_rule *const rules[], const char *data, size_t length, int *exceptionp) +{ + struct libparser_unit *ret, *t; + struct context ctx; + size_t i; + + ctx.rules = rules; + ctx.cache = NULL; + ctx.data = data; + ctx.length = length; + ctx.position = 0; + ctx.done = 0; + ctx.exception = 0; + + for (i = 0; rules[i]; i++) + if (!strcmp(rules[i]->name, "@start")) + break; + if (!rules[i]) + abort(); + + ret = try_match(rules[i]->name, rules[i]->sentence, &ctx); + *exceptionp = ctx.exception; + + while (ctx.cache) { + t = ctx.cache; + ctx.cache = t->next; + free(t); + } + + return ret; +} diff --git a/libparser.h b/libparser.h new file mode 100644 index 0000000..74ab319 --- /dev/null +++ b/libparser.h @@ -0,0 +1,73 @@ +/* See LICENSE file for copyright and license details. */ +#include <stddef.h> + +union libparser_sentence; + +enum libparser_sentence_type { + LIBPARSER_SENTENCE_TYPE_CONCATENATION, /* .binary */ + LIBPARSER_SENTENCE_TYPE_ALTERNATION, /* .binary */ + LIBPARSER_SENTENCE_TYPE_OPTIONAL, /* .unary */ + LIBPARSER_SENTENCE_TYPE_REPEATED, /* .unary */ + LIBPARSER_SENTENCE_TYPE_STRING, /* .string */ + LIBPARSER_SENTENCE_TYPE_CHAR_RANGE, /* .char_range */ /* TODO not supported yet: <low, high> */ + LIBPARSER_SENTENCE_TYPE_RULE, /* .rule */ + LIBPARSER_SENTENCE_TYPE_EXCEPTION, /* (none) */ + LIBPARSER_SENTENCE_TYPE_EOF /* (none) */ +}; + +struct libparser_sentence_binary { + enum libparser_sentence_type type; + const union libparser_sentence *left; + const union libparser_sentence *right; +}; + +struct libparser_sentence_unary { + enum libparser_sentence_type type; + const union libparser_sentence *sentence; +}; + +struct libparser_sentence_string { + enum libparser_sentence_type type; + const char *string; + size_t length; +}; + +struct libparser_sentence_char_range { + enum libparser_sentence_type type; + unsigned char low; + unsigned char high; +}; + +struct libparser_sentence_rule { + enum libparser_sentence_type type; + const char *rule; +}; + +union libparser_sentence { + enum libparser_sentence_type type; + struct libparser_sentence_binary binary; + struct libparser_sentence_unary unary; + struct libparser_sentence_string string; + struct libparser_sentence_char_range char_range; + struct libparser_sentence_rule rule; +}; + +struct libparser_rule { + const char *name; + union libparser_sentence *sentence; +}; + +struct libparser_unit { + const char *rule; + struct libparser_unit *in; + struct libparser_unit *next; + size_t start; + size_t end; +}; + + +extern const struct libparser_rule *const libparser_rule_table[]; + + +struct libparser_unit *libparser_parse_file(const struct libparser_rule *const rules[], + const char *data, size_t length, int *exceptionp); |