diff options
author | Mattias Andrée <maandree@kth.se> | 2021-04-17 16:46:27 +0200 |
---|---|---|
committer | Mattias Andrée <maandree@kth.se> | 2021-04-17 16:46:27 +0200 |
commit | f38ab6d87c82c2ffd88a23341689dddd62d36ea6 (patch) | |
tree | 1340d65c00234c70e768e400ebd95019e781bbbf | |
parent | Fix surrogate rejection (diff) | |
download | libparser-f38ab6d87c82c2ffd88a23341689dddd62d36ea6.tar.gz libparser-f38ab6d87c82c2ffd88a23341689dddd62d36ea6.tar.bz2 libparser-f38ab6d87c82c2ffd88a23341689dddd62d36ea6.tar.xz |
Add implement support for character ranges
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r-- | TODO | 1 | ||||
-rw-r--r-- | config.mk | 4 | ||||
-rw-r--r-- | libparser-generate.c | 127 | ||||
-rw-r--r-- | libparser.h | 2 |
4 files changed, 126 insertions, 8 deletions
@@ -1,3 +1,4 @@ Add support for prelexed Add README Add man pages +Add tests @@ -4,5 +4,5 @@ MANPREFIX = $(PREFIX)/share/man CC = cc CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)" -CFLAGS = -std=c99 -Wall -O2 -LDFLAGS = -s +CFLAGS = -std=c99 -Wall -Og -g +LDFLAGS = diff --git a/libparser-generate.c b/libparser-generate.c index ef2177c..a07ca4f 100644 --- a/libparser-generate.c +++ b/libparser-generate.c @@ -80,7 +80,7 @@ ereallocarray(void *ptr, size_t n, size_t m) void *ret; if (n && m > SIZE_MAX / n) eprintf("%s: realloc %p %zu*%zu: %s\n", argv0, ptr, n, m, strerror(EOVERFLOW)); - ret = realloc(ptr, n * n); + ret = realloc(ptr, n * m); if (!ret) eprintf("%s: realloc %p %zu*%zu: %s\n", argv0, ptr, n, m, strerror(errno)); return ret; @@ -354,7 +354,7 @@ static void emit_and_free_sentence(struct node *node, size_t *indexp) { size_t index = (*indexp)++, left, right; - struct node *next; + struct node *next, *low, *high; for (; node->token->s[0] == '('; node = next) { next = node->data; @@ -368,6 +368,23 @@ emit_and_free_sentence(struct node *node, size_t *indexp) ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu" "}};\n", nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1); + } else if (node->token->s[0] == '<') { + low = node->data; + high = node->data->next; + if ((unsigned char)low->token->s[0] > (unsigned char)high->token->s[0]) { + eprintf("%s: lower character range bound on line %zu at column %zu (character %zu) " + "is greater than upper bound on line %zu at column %zu (character %zu)\n", + argv0, low->token->lineno, low->token->column, low->token->character, + high->token->lineno, high->token->column, high->token->character); + } + printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {" + ".type = LIBPARSER_SENTENCE_TYPE_CHAR_RANGE, .low = %hhu, .high = %hhu" + "}};\n", + nrule_names, index, (unsigned char)low->token->s[0], (unsigned char)high->token->s[0]); + free(low->token); + free(high->token); + free(low); + free(high); } else if (node->token->s[0] == '|' || node->token->s[0] == ',') { right = *indexp; emit_and_free_sentence(node->data->next, indexp); @@ -470,7 +487,8 @@ emit_and_free_rule(struct node *rule) { size_t index = 0; - rule->data = order_sentences(rule->data); + if (rule->data->token->s[0] != '<') + rule->data = order_sentences(rule->data); emit_and_free_sentence(rule->data, &index); printf("static struct libparser_rule rule_%zu = {\"%s\", &sentence_%zu_0};\n", nrule_names, rule->token->s, nrule_names); @@ -495,13 +513,17 @@ main(int argc, char *argv[]) NEW_RULE, EXPECT_EQUALS, EXPECT_OPERAND, - EXPECT_OPERATOR + EXPECT_OPERATOR, + EXPECT_RANGE_LOW, + EXPECT_RANGE_DELIM, + EXPECT_RANGE_HIGH, + EXPECT_RANGE_CLOSE } state = NEW_RULE; struct node *stack = NULL, *parent_node, *node; char *data; struct token **tokens; size_t i, j; - int cmp, err; + int cmp, err, val; if (argc) { argv0 = *argv++; @@ -580,6 +602,10 @@ again: case EXPECT_OPERAND: if (type == SYMBOL) { if (tokens[i]->s[0] == '(' || tokens[i]->s[0] == '[' || tokens[i]->s[0] == '{') { + goto push_stack; + } else if (tokens[i]->s[0] == '<') { + state = EXPECT_RANGE_LOW; + push_stack: parent_node = stack; stack = ecalloc(1, sizeof(*stack)); stack->parent = parent_node; @@ -642,6 +668,97 @@ again: } break; + case EXPECT_RANGE_LOW: + state = EXPECT_RANGE_DELIM; + goto add_range_bound; + + case EXPECT_RANGE_DELIM: + if (type != SYMBOL || tokens[i]->s[0] != ',') { + eprintf("%s: expected an ',' on line %zu at column %zu (character %zu)\n", + argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + free(tokens[i]); + state = EXPECT_RANGE_HIGH; + break; + + case EXPECT_RANGE_HIGH: + state = EXPECT_RANGE_CLOSE; + add_range_bound: + if (type == IDENTIFIER) { + val = 0; + if (tokens[i]->s[0] == '0' && (tokens[i]->s[1] == 'x' || tokens[i]->s[1] == 'X')) { + for (j = 2; isxdigit(tokens[i]->s[j]) && val < 255; j++) + val = (val * 16) | ((tokens[i]->s[j] & 15) + (tokens[i]->s[j] > '9' ? 9 : 0)); + } else { + for (j = 0; isdigit(tokens[i]->s[j]) && val < 255; j++) + val = val * 10 + (tokens[i]->s[j] & 15); + } + if (val > 255 || tokens[i]->s[j]) + goto invalid_range; + tokens[i]->s[0] = (char)val; + tokens[i]->s[1] = '\0'; + } else if (type == STRING) { + /* tokens[i]->s[0] is '"' */ + if (!tokens[i]->s[1]) { + goto invalid_range; + } else if (tokens[i]->s[1] == '\\') { + j = 3; + if (tokens[i]->s[2] == 'a') { + tokens[i]->s[1] = '\a'; + } else if (tokens[i]->s[2] == 'b') { + tokens[i]->s[1] = '\b'; + } else if (tokens[i]->s[2] == 'e') { + tokens[i]->s[1] = '\x1b'; + } else if (tokens[i]->s[2] == 'E') { + tokens[i]->s[1] = '\x1b'; + } else if (tokens[i]->s[2] == 'f') { + tokens[i]->s[1] = '\f'; + } else if (tokens[i]->s[2] == 'n') { + tokens[i]->s[1] = '\n'; + } else if (tokens[i]->s[2] == 'r') { + tokens[i]->s[1] = '\r'; + } else if (tokens[i]->s[2] == 'v') { + tokens[i]->s[1] = '\v'; + } else if (tokens[i]->s[2] == 'x' && isxdigit(tokens[i]->s[3]) && isxdigit(tokens[i]->s[4])) { + val = ((tokens[i]->s[3] & 15) + (tokens[i]->s[3] > '9' ? 9 : 0)) * 16; + val |= (tokens[i]->s[4] & 15) + (tokens[i]->s[4] > '9' ? 9 : 0); + tokens[i]->s[0] = (char)val; + j = 5; + } else if ('0' <= tokens[i]->s[2] && tokens[i]->s[2] <= '7') { + val = 0; + for (j = 2; '0' <= tokens[i]->s[j] && tokens[i]->s[j] <= '7' && val < 255; j++) + val = (val * 8) | (tokens[i]->s[j] & 15); + if (val > 255) + goto invalid_range; + tokens[i]->s[0] = (char)val; + } else { + goto invalid_range; + } + if (tokens[i]->s[j]) + goto invalid_range; + tokens[i]->s[1] = '\0'; + } else if (tokens[i]->s[2]) { + goto invalid_range; + } else { + tokens[i]->s[0] = tokens[i]->s[1]; + tokens[i]->s[1] = '\0'; + } + } else { + invalid_range: + eprintf("%s: expected a [0, 255] integer or single byte string " + "on line %zu at column %zu (character %zu)\n", + argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + goto add_singleton; + + case EXPECT_RANGE_CLOSE: + if (type != SYMBOL || tokens[i]->s[0] != '>') { + eprintf("%s: expected an '>' on line %zu at column %zu (character %zu)\n", + argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character); + } + state = EXPECT_OPERATOR; + goto pop; + default: abort(); } diff --git a/libparser.h b/libparser.h index 646c8df..67ba11a 100644 --- a/libparser.h +++ b/libparser.h @@ -12,7 +12,7 @@ enum libparser_sentence_type { LIBPARSER_SENTENCE_TYPE_OPTIONAL, /* .unary */ LIBPARSER_SENTENCE_TYPE_REPEATED, /* .unary */ LIBPARSER_SENTENCE_TYPE_STRING, /* .string */ - LIBPARSER_SENTENCE_TYPE_CHAR_RANGE, /* .char_range */ /* TODO not supported in libparser-generate yet: <low, high> */ + LIBPARSER_SENTENCE_TYPE_CHAR_RANGE, /* .char_range */ LIBPARSER_SENTENCE_TYPE_RULE, /* .rule */ LIBPARSER_SENTENCE_TYPE_EXCEPTION, /* (none) */ LIBPARSER_SENTENCE_TYPE_EOF /* (none) */ |