diff options
Diffstat (limited to 'libparser.c')
| -rw-r--r-- | libparser.c | 1293 |
1 files changed, 1157 insertions, 136 deletions
diff --git a/libparser.c b/libparser.c index 545aaee..56ce703 100644 --- a/libparser.c +++ b/libparser.c @@ -1,28 +1,480 @@ /* See LICENSE file for copyright and license details. */ #include "libparser.h" +#include <errno.h> +#include <stdint.h> #include <stdlib.h> #include <string.h> -#define IS_HIDDEN(RULE) (!(RULE) || (RULE)[0] == '_') +#define PRINT_ACTIONS 0 + +#if PRINT_ACTIONS +# include <ctype.h> +# include <stdio.h> +#endif + + +#define IS_ANONYMOUS(RULE) (!(RULE) || (RULE)[0] == '_') /* NULL is used when evaluating a subsentence, * and rule with the prefix "_" are embedded * as subsentences */ +#define IS_COMPLETION(FOLLOWUP) (!!(FOLLOWUP).unit) + +#define EXHAUSTED SIZE_MAX + + +/** + * Follow-up stack node + * + * `.unit` decides whether it is a completion + * marker or a subsequent sentence; it is a + * completion marker if and only iff `.unit` + * is non-`NULL`. + */ +struct followup { + /** + * The unit being completed; if `NULL` + * it is a subsequent sentence rather + * than a completion marker and, + * `.sentence` and `.rule` should be used + * instead + */ + struct libparser_unit *unit; + + /** + * Set if and only if `.unit` is non-`NULL`: + * the + * index of the node + */ + size_t unit_index; + + /* -- or -- */ + + /** + * Set if and only if `.unit` is `NULL`: + * the `rule` argument `add_unit` should + * be called with + */ + const char *rule; + + /** + * Set if and only if `.unit` is `NULL`: + * the `sentence` argument `add_unit` + * should be called with + */ + const union libparser_sentence *sentence; +}; + + +/** + * Branching choice + */ +struct choice { + /** + * The current choice value, `EXHAUSTED` + * when the choices are exhausted and the + * sentence cannot be matched + */ + size_t choice; + + /** + * The position in the input text + */ + size_t position; + + /** + * The `rule` argument `add_unit` was + * called with + */ + const char *rule; + + /** + * The `sentence` argument `add_unit` + * was called with + */ + const union libparser_sentence *sentence; + /** + * The value `.units.count` of the `ctx` + * argument `add_unit` was called with + */ + size_t unit_index; + + /** + * Save state of the follow-up stack + */ + struct { + /** + * The follow-ups + */ + struct followup *followups; + + /** + * The number of follow-ups; this is + * also the number of allocated elements + */ + size_t count; + } followups; +}; + + +/** + * Parsing state and context + */ struct context { - const struct libparser_rule *const *rules; /* rule table */ - struct libparser_unit *cache; /* memory allocation cache */ - const char *data; /* text being parsed */ - size_t length; /* length of text */ - size_t position; /* current position in the text */ - char done; /* end reached or .exception or .error set */ - char exception; /* exception-statement reached */ - char error; /* error encountered */ + /** + * The rule table + */ + const struct libparser_rule *const *rules; + + /** + * Memory pool for parsing units + */ + struct libparser_unit *unit_pool; + + /** + * Current choices + */ + struct { + /** + * The branching points and the choices made there + */ + struct choice *choices; + + /** + * Number of allocated choices + */ + size_t size; + + /** + * Number of made choices (it is be temporarily + * one more then .next when amending a choice) + */ + size_t count; + + /** + * The index of the next choice + */ + size_t next; + } choices; + + /** + * Follow-up stack; it is popped any time + * a sentence (including literals and + * subsentences) has ended + */ + struct { + /** + * The follow-ups + */ + struct followup *followups; + + /** + * The number of allocated slots + */ + size_t size; + + /** + * The number of used slots + */ + size_t count; + } followups; + + /** + * List of units leading up to the end + * of the input + */ + struct { + /** + * The units + */ + struct libparser_unit **units; + + /** + * The number of allocated slots + */ + size_t size; + + /** + * The number of used slots + */ + size_t count; + } units; + + /** + * The text being parsed + */ + const char *data; + + /** + * The length of the text + */ + size_t length; + + /** + * The current position in the text + */ + size_t position; + + /** + * Set when an exception-statement reached is encountered; + * cleared when rejected the exception is rejected + */ + char exception; }; /** - * Recursively place move a unit into + * Epsilon rule, used when en OPTIONAL is not take + */ +static union libparser_sentence epsilon = {.type = LIBPARSER_SENTENCE_TYPE_EPSILON}; + + +#if PRINT_ACTIONS + + +static const char * +sentence_type_string(enum libparser_sentence_type type) +{ + switch (type) { + case LIBPARSER_SENTENCE_TYPE_CONCATENATION: return "CONCATENATION"; + case LIBPARSER_SENTENCE_TYPE_ALTERNATION: return "ALTERNATION"; + case LIBPARSER_SENTENCE_TYPE_REJECTION: return "REJECTION"; + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: return "OPTIONAL"; + case LIBPARSER_SENTENCE_TYPE_REPEATED: return "REPEATED"; + case LIBPARSER_SENTENCE_TYPE_STRING: return "STRING"; + case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: return "CHAR_RANGE"; + case LIBPARSER_SENTENCE_TYPE_RULE: return "RULE"; + case LIBPARSER_SENTENCE_TYPE_EXCEPTION: return "EXCEPTION"; + case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF"; + case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON"; + default: return "<invalid>"; + } +} + + +static void +print_tree(struct libparser_unit *unit, int indent) +{ + for (; unit; unit = unit->next) { + fprintf(stderr, "%*.s", indent, ""); + fprintf(stderr, "%s: %s%s%s [%zu, %zu) {%p}\n", + unit->rule, sentence_type_string(unit->sentence->type), + unit->sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "", + unit->sentence->type == LIBPARSER_SENTENCE_TYPE_RULE + ? unit->sentence->rule.rule : "", + unit->start, unit->end, (void *)unit); + print_tree(unit->in, indent + 2); + } +} + + +static void +print_state(struct context *ctx) +{ + struct libparser_unit *unit; + const union libparser_sentence *sentence; + size_t i; + + fprintf(stderr, "Position in text: %zu of %zu\n", ctx->position, ctx->length); + fprintf(stderr, "Exception state: %i\n", (int)ctx->exception); + + fprintf(stderr, "Have %zu units:\n", ctx->units.count); + for (i = 0; i < ctx->units.count; i++) { + if (ctx->units.units[i]->next) + fprintf(stderr, " (\n"); + print_tree(ctx->units.units[i], 4); + if (ctx->units.units[i]->next) + fprintf(stderr, " )\n"); + } + + fprintf(stderr, "Have %zu choices:\n", ctx->choices.count); + for (i = 0; i < ctx->choices.count; i++) { + fprintf(stderr, " For unit %zu with %zu followups at %zu: %s: %s, ", + ctx->choices.choices[i].unit_index, + ctx->choices.choices[i].followups.count, + ctx->choices.choices[i].position, + sentence_type_string(ctx->choices.choices[i].sentence->type), + ctx->choices.choices[i].rule); + if (ctx->choices.choices[i].choice == EXHAUSTED) { + fprintf(stderr, "choices EXHAUSTED\n"); + } else { + fprintf(stderr, "using choice %zu\n", ctx->choices.choices[i].choice); + } + } + + fprintf(stderr, "Have %zu follow-ups pending (outermost at top):\n", + ctx->followups.count); + for (i = 0; i < ctx->followups.count; i++) { + if (IS_COMPLETION(ctx->followups.followups[i])) { + unit = ctx->followups.followups[i].unit; + sentence = unit->sentence; + fprintf(stderr, " Completion: unit %zu: %s: %s%s%s [%zu, %zu)\n", + ctx->followups.followups[i].unit_index, + unit->rule, sentence_type_string(sentence->type), + sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "", + sentence->type == LIBPARSER_SENTENCE_TYPE_RULE + ? sentence->rule.rule : "", + unit->start, unit->end); + } else { + sentence = ctx->followups.followups[i].sentence; + fprintf(stderr, " (%s, %s%s%s)\n", + ctx->followups.followups[i].rule, + sentence_type_string(sentence->type), + sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "", + sentence->type == LIBPARSER_SENTENCE_TYPE_RULE + ? sentence->rule.rule : ""); + } + } +} + + +static int +print_sentence(const union libparser_sentence *sentence, int indent) +{ + unsigned char low, high; + int len; + + switch (sentence->type) { + case LIBPARSER_SENTENCE_TYPE_CONCATENATION: + fprintf(stderr, "("); + indent = print_sentence(sentence->binary.left, indent + 1); + fprintf(stderr, ", "); + indent = print_sentence(sentence->binary.right, indent + 2); + fprintf(stderr, ")"); + indent += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_ALTERNATION: + fprintf(stderr, "("); + print_sentence(sentence->binary.left, indent + 1); + fprintf(stderr, " | \n%*.s", indent + 1, ""); + indent = print_sentence(sentence->binary.right, indent + 1); + fprintf(stderr, ")"); + indent += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_REJECTION: + fprintf(stderr, "!("); + indent = print_sentence(sentence->unary.sentence, indent + 2); + fprintf(stderr, ")"); + indent += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: + fprintf(stderr, "["); + indent = print_sentence(sentence->unary.sentence, indent + 1); + fprintf(stderr, "]"); + indent += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_REPEATED: + fprintf(stderr, "{"); + indent = print_sentence(sentence->unary.sentence, indent + 1); + fprintf(stderr, "}"); + indent += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_STRING: + fprintf(stderr, "\"%.*s\"%n", (int)sentence->string.length, sentence->string.string, &len); + indent += len; + break; + + case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: + low = sentence->char_range.low; + high = sentence->char_range.high; + if (isprint(low) && isprint(high)) + fprintf(stderr, "<\"%c\", \"%c\">%n", low, high, &len); + else if (isprint(sentence->char_range.low)) + fprintf(stderr, "<\"%c\", 0x%02x>%n", low, high, &len); + else if (isprint(sentence->char_range.high)) + fprintf(stderr, "<0x%02x, \"%c\">%n", low, high, &len); + else + fprintf(stderr, "<0x%02x, 0x%02x>%n", low, high, &len); + indent += len; + break; + + case LIBPARSER_SENTENCE_TYPE_RULE: + fprintf(stderr, "%s%n", sentence->rule.rule, &len); + indent += len; + break; + + case LIBPARSER_SENTENCE_TYPE_EXCEPTION: + fprintf(stderr, "-"); + indent += 1; + break; + + case LIBPARSER_SENTENCE_TYPE_EOF: + fprintf(stderr, "%s%n", "!<0x00, 0xFF>", &len); + indent += len; + break; + + case LIBPARSER_SENTENCE_TYPE_EPSILON: + fprintf(stderr, "@EPSILON"); + indent += 8; + break; + + default: + abort(); + } + + return indent; +} + + +static void +print_grammar(const struct libparser_rule *const *rules) +{ + size_t i; + int indent; + fprintf(stderr, "Rules:\n"); + for (i = 0; rules[i]; i++) { + fprintf(stderr, " %s = %n", rules[i]->name, &indent); + print_sentence(rules[i]->sentence, indent); + fprintf(stderr, ";\n"); + } +} + + +#endif + + +/** + * Embed a rule's or subsentence's matches + * into where the rule or subsentence was + * matched + * + * @param where The pointer to the match + * @param last Optional output paramter for the + * last embedded, top-level unit + */ +static void +embed_rule(struct libparser_unit **where, struct libparser_unit **last) +{ + struct libparser_unit *old = *where; + +#if PRINT_ACTIONS + fprintf(stderr, "Embedding %s: %s [%zu, %zu)\n", old->rule, + sentence_type_string(old->sentence->type), old->start, old->end); +#endif + + if (old->next) + abort(); + if (old->in == old) + abort(); + *where = old->in; + free(old); + if (last) { + *last = *where; + if (*last) + while ((*last)->next) + *last = (*last)->next; + } +} + + +/** + * Recursively move a unit into * the memory allocation cache * * @param unit Unit to place in the cache @@ -36,8 +488,8 @@ free_unit(struct libparser_unit *unit, struct context *ctx) free_unit(unit->in, ctx); prev = unit; unit = unit->next; - prev->next = ctx->cache; - ctx->cache = prev; + prev->next = ctx->unit_pool; + ctx->unit_pool = prev; } } @@ -64,9 +516,6 @@ dealloc_unit(struct libparser_unit *unit) * new unit, but first try to reuse * from the memory cache * - * On allocation failure, the the - * error marked in the parsing context - * * @param ctx Parsing context * @return Newly allocated (or polled from the * cache) unit, `NULL` on failure @@ -75,131 +524,341 @@ static struct libparser_unit * alloc_unit(struct context *ctx) { struct libparser_unit *unit; - if (!ctx->cache) { + if (!ctx->unit_pool) { unit = malloc(sizeof(*unit)); - if (!unit) { - ctx->done = 1; - ctx->error = 1; + if (!unit) return NULL; - } } else { - unit = ctx->cache; - ctx->cache = unit->next; + unit = ctx->unit_pool; + ctx->unit_pool = unit->next; } return unit; } /** - * Embed a rule's or subsentence's matches - * into where the rule or subsentence was - * matched + * Grow the unit list (if ncessary) and return a + * pointer the next empty slot * - * @param where The pointer to the match - * @param ctx Parsing context + * @param ctx Parsing context + * @return Pointer to the unit slot */ -static void -embed_rule(struct libparser_unit **where, struct context *ctx) +static struct libparser_unit ** +new_unit(struct context *ctx) { - /* remove matched unit */ - (*where)->next = ctx->cache; - ctx->cache = *where; - /* insert interior where matched */ - *where = (*where)->in; + void *new; + if (ctx->units.count == ctx->units.size) { + if (ctx->units.size > SIZE_MAX / sizeof(*ctx->units.units) - 64u) { + enomem: + errno = ENOMEM; + return NULL; + } + ctx->units.size += 64u; + new = realloc(ctx->units.units, ctx->units.size * sizeof(*ctx->units.units)); + if (!new) + goto enomem; + ctx->units.units = new; + } +#if PRINT_ACTIONS + fprintf(stderr, "New unit slot at %zu\n", ctx->units.count); +#endif + return &ctx->units.units[ctx->units.count++]; } -static struct libparser_unit * -try_match(const char *rule, const union libparser_sentence *sentence, struct context *ctx) +/** + * Create a new follow-up node and push it + * into the follow-up stack + * + * @param ctx Parsing context + * @return The follow-up node, `NULL` on failure + */ +static struct followup * +new_followup(struct context *ctx) { - struct libparser_unit *unit, *next; - struct libparser_unit **head; - unsigned char c; - size_t i; + void *new; + if (ctx->followups.count == ctx->followups.size) { + if (ctx->followups.size > SIZE_MAX / sizeof(*ctx->followups.followups) - 64u) { + enomem: + errno = ENOMEM; + return NULL; + } + ctx->followups.size += 64u; + new = realloc(ctx->followups.followups, + ctx->followups.size * sizeof(*ctx->followups.followups)); + if (!new) + goto enomem; + ctx->followups.followups = new; + } + +#if PRINT_ACTIONS + fprintf(stderr, "New follow-up slot at %zu\n", ctx->followups.count); +#endif + + return &ctx->followups.followups[ctx->followups.count++]; +} + + +/** + * Push sentence into follow-up stack + * + * @param rule The name of the rule, or `NULL` if it is an unnamed subsentence + * @param sentence The sentence to push + * @param ctx Parsing context + * @return 0 on success, -1 on failure + */ +static int +push(const char *rule, const union libparser_sentence *sentence, struct context *ctx) +{ + struct followup *followup = new_followup(ctx); + if (!followup) + return -1; + + followup->sentence = sentence; + followup->unit = NULL; + followup->rule = rule; + +#if PRINT_ACTIONS + fprintf(stderr, "Pushing follow-up sentence: %s: %s\n", + sentence_type_string(sentence->type), rule); +#endif + + return 0; +} + + +/** + * Create a unit for a sentence and push it into + * the follow-up stack to mark the sentence as + * completed when the follow-up node is processed + * + * @param rule The name of the rule, or `NULL` if it is an unnamed subsentence + * @param sentence The sentence to push + * @param start The position in the input text where the sentence begins + * @param ctx Parsing context + * @return 0 on success, -1 on failure + */ +static int +incomplete(const char *rule, const union libparser_sentence *sentence, size_t start, struct context *ctx) +{ + struct followup *followup; + struct libparser_unit *unit; + struct libparser_unit **unit_slot; unit = alloc_unit(ctx); if (!unit) - return NULL; - unit->end = 0; + return -1; unit->in = unit->next = NULL; unit->rule = rule; - unit->start = ctx->position; + unit->start = start; + unit->end = start; + unit->sentence = sentence; - switch (sentence->type) { - case LIBPARSER_SENTENCE_TYPE_CONCATENATION: - unit->in = try_match(NULL, sentence->binary.left, ctx); - if (!unit->in) - goto mismatch; - if (ctx->done) - break; - unit->in->next = try_match(NULL, sentence->binary.right, ctx); - if (!unit->in->next) { - free_unit(unit->in, ctx); - goto mismatch; - } - if (IS_HIDDEN(unit->in->next->rule)) - embed_rule(&unit->in->next, ctx); - if (IS_HIDDEN(unit->in->rule)) { - next = unit->in->next; - embed_rule(&unit->in, ctx); - /* rejoin with right-hand */ - if (unit->in) { - for (head = &unit->in->next; *head; head = &(*head)->next); - *head = next; - } else { - unit->in = next; + followup = new_followup(ctx); + if (!followup) { + free_unit(unit, ctx); + return -1; + } + followup->unit = unit; + followup->unit_index = ctx->units.count; + +#if PRINT_ACTIONS + fprintf(stderr, "Pushing follow-up completion for unit %zu: %s: %s\n", + ctx->units.count, sentence_type_string(sentence->type), rule); +#endif + + unit_slot = new_unit(ctx); + if (!unit_slot) { + free_unit(unit, ctx); + return -1; + } + *unit_slot = unit; +#if PRINT_ACTIONS + fprintf(stderr, "Writing unit to slot: %s: %s, spanning [%zu, %zu)\n", + unit->rule, sentence_type_string(unit->sentence->type), unit->start, unit->end); +#endif + + return 0; +} + + +/** + * Add branching point to list of choices, + * or if already added, just the get the + * current choice + * + * @param rule The name of the rule that is a brachning + * point, `NULL` if a subsentence + * @param sentence The sentence that is brachning point + * @param start The position in the input text where the + * brachning begins + * @param choice_out Output parameter for the current choice value + * @return 0 on success, -1 on failure + */ +static int +braching(const char *rule, const union libparser_sentence *sentence, size_t start, struct context *ctx, size_t *choice_out) +{ + struct choice *choice; + size_t size; + void *new; + if (ctx->choices.next == ctx->choices.count) { + if (ctx->choices.count == ctx->choices.size) { + size = ctx->choices.size; + if (size > SIZE_MAX - 8u) { + enomem: + errno = ENOMEM; + return -1; } + size += 8u; + if (size > SIZE_MAX / sizeof(*ctx->choices.choices)) + goto enomem; + new = realloc(ctx->choices.choices, size * sizeof(*ctx->choices.choices)); + if (!new) + goto enomem; + ctx->choices.choices = new; + ctx->choices.size = size; } - break; + choice = &ctx->choices.choices[ctx->choices.next]; + choice->choice = 0; + choice->position = start; + choice->rule = rule; + choice->sentence = sentence; + choice->unit_index = ctx->units.count; + choice->followups.followups = NULL; + choice->followups.count = ctx->followups.count; + if (choice->followups.count) { + size = choice->followups.count * sizeof(*choice->followups.followups); + choice->followups.followups = malloc(size); + if (!choice->followups.followups) + goto enomem; + memcpy(choice->followups.followups, ctx->followups.followups, size); + } + ctx->choices.count++; + +#if PRINT_ACTIONS + fprintf(stderr, "Adding branching point at %zu for unit %zu: %s: %s, at %zu in text\n", + ctx->choices.next, ctx->units.count, + sentence_type_string(sentence->type), rule, start); + } else { + fprintf(stderr, "Revisiting branching point %zu at %zu in text for unit %zu: " + "%s: %s, at %zu in text; choice value: %zu\n", + ctx->choices.next, start, + ctx->choices.choices[ctx->choices.next].unit_index, + sentence_type_string(ctx->choices.choices[ctx->choices.next].sentence->type), + ctx->choices.choices[ctx->choices.next].rule, + ctx->choices.choices[ctx->choices.next].position, + ctx->choices.choices[ctx->choices.next].choice); +#endif + } + *choice_out = ctx->choices.choices[ctx->choices.next++].choice; + return 0; +} + +/** + * Add a sentence, as a unit, to the list of units, + * will adding to the follow-up stack, the subsentences + * the the completion of the sentence unless it is + * atomic, and for branching points, record the choice + * with backtracking data + * + * @param rule The name of the rule, `NULL` if it is a subsentence + * @param sentence The sentence to add + * @param ctx Parsing context + * @return 1 on matched or incomplete, 0 on mismatch, -1 on failure + */ +static int +add_unit(const char *rule, const union libparser_sentence *sentence, struct context *ctx) +{ + struct libparser_unit *unit; + struct libparser_unit **unit_slot; + size_t start = ctx->position; + unsigned char c; + size_t i, choice; + +#if PRINT_ACTIONS + fprintf(stderr, "Visiting %s: %s, at %zu in text\n", + sentence_type_string(sentence->type), rule, start); +#endif + + switch (sentence->type) { case LIBPARSER_SENTENCE_TYPE_ALTERNATION: - unit->in = try_match(NULL, sentence->binary.left, ctx); - if (!unit->in) { - unit->in = try_match(NULL, sentence->binary.right, ctx); - if (!unit->in) - goto mismatch; - } - prone: - if (unit->in && IS_HIDDEN(unit->in->rule)) - embed_rule(&unit->in, ctx); - break; + if (braching(rule, sentence, start, ctx, &choice)) + return -1; + if (choice > 1u) + goto exhausted; + if (incomplete(rule, sentence, start, ctx) || + push(NULL, choice ? sentence->binary.right : sentence->binary.left, ctx)) + return -1; + return 1; - case LIBPARSER_SENTENCE_TYPE_REJECTION: - unit->in = try_match(NULL, sentence->unary.sentence, ctx); - if (unit->in) { - free_unit(unit->in, ctx); - if (!ctx->exception) - goto mismatch; - ctx->exception = 0; + case LIBPARSER_SENTENCE_TYPE_REPEATED: + if (braching(rule, sentence, start, ctx, &choice)) + return -1; + if (choice == 0u) { + if (incomplete(rule, sentence, start, ctx) || + push(NULL, sentence, ctx) || + push(NULL, sentence->unary.sentence, ctx)) + return -1; + } else if (choice == 1u) { + if (incomplete(rule, sentence, start, ctx) || + push("@epsilon", &epsilon, ctx) || + push(NULL, sentence->unary.sentence, ctx)) + return -1; + } else if (choice == 2u) { + if (incomplete(rule, sentence, start, ctx) || + push("@epsilon", &epsilon, ctx) || + push("@epsilon", &epsilon, ctx)) + return -1; + } else { + goto exhausted; } - ctx->position = unit->start; - unit->rule = NULL; - break; + return 1; case LIBPARSER_SENTENCE_TYPE_OPTIONAL: - unit->in = try_match(NULL, sentence->unary.sentence, ctx); - goto prone; + if (braching(rule, sentence, start, ctx, &choice)) + return -1; + if (choice == 0u) { + if (incomplete(rule, sentence, start, ctx) || + push(NULL, sentence->unary.sentence, ctx)) + return -1; + } else if (choice == 1u) { + if (incomplete(rule, sentence, start, ctx) || + push("@epsilon", &epsilon, ctx)) + return -1; + } else { + goto exhausted; + } + return 1; - case LIBPARSER_SENTENCE_TYPE_REPEATED: - head = &unit->in; - while ((*head = try_match(NULL, sentence->unary.sentence, ctx))) { - if (IS_HIDDEN((*head)->rule)) { - embed_rule(head, ctx); - while (*head) - head = &(*head)->next; - } else { - head = &(*head)->next; - } - if (ctx->done) + case LIBPARSER_SENTENCE_TYPE_CONCATENATION: + if (incomplete(rule, sentence, start, ctx) || + push(NULL, sentence->binary.right, ctx) || + push(NULL, sentence->binary.left, ctx)) + return -1; + return 1; + + case LIBPARSER_SENTENCE_TYPE_REJECTION: + if (incomplete(NULL, sentence, start, ctx) || + push(NULL, sentence->unary.sentence, ctx)) + return -1; + return 1; + + case LIBPARSER_SENTENCE_TYPE_RULE: + for (i = 0; ctx->rules[i]; i++) + if (!strcmp(ctx->rules[i]->name, sentence->rule.rule)) break; - } - break; + if (!ctx->rules[i]) + abort(); + if (incomplete(rule, sentence, start, ctx) || + push(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx)) + return -1; + return 1; case LIBPARSER_SENTENCE_TYPE_STRING: - if (sentence->string.length > ctx->length - ctx->position) - goto mismatch; - if (memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length)) - goto mismatch; + if (sentence->string.length > ctx->length - ctx->position || + memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length)) + return 0; ctx->position += sentence->string.length; break; @@ -212,59 +871,249 @@ try_match(const char *rule, const union libparser_sentence *sentence, struct con ctx->position += 1u; break; - case LIBPARSER_SENTENCE_TYPE_RULE: - for (i = 0; ctx->rules[i]; i++) - if (!strcmp(ctx->rules[i]->name, sentence->rule.rule)) - break; - if (!ctx->rules[i]) - abort(); - unit->in = try_match(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx); - if (!unit->in) - goto mismatch; - goto prone; - case LIBPARSER_SENTENCE_TYPE_EXCEPTION: - ctx->done = 1; ctx->exception = 1; break; case LIBPARSER_SENTENCE_TYPE_EOF: if (ctx->position != ctx->length) goto mismatch; - ctx->done = 1; + break; + + case LIBPARSER_SENTENCE_TYPE_EPSILON: break; default: abort(); } + /* for atomic sentences */ + unit = alloc_unit(ctx); + if (!unit) + return -1; + unit->in = unit->next = NULL; + unit->rule = rule; + unit->start = start; unit->end = ctx->position; - return unit; + unit->sentence = sentence; + unit_slot = new_unit(ctx); + if (!unit_slot) { + free_unit(unit, ctx); + return -1; + } + *unit_slot = unit; +#if PRINT_ACTIONS + fprintf(stderr, "Writing atomic unit to slot: %s: %s, spanning [%zu, %zu)\n", + sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end); +#endif + return 1; +exhausted: + ctx->choices.choices[ctx->choices.next - 1U].choice = EXHAUSTED; mismatch: - /* On mismatch, restore position in text */ - ctx->position = unit->start; - /* and place the unit in the memory cache */ - unit->next = ctx->cache; - ctx->cache = unit; - return NULL; +#if PRINT_ACTIONS + fprintf(stderr, "Returning mismatch\n"); +#endif + return 0; +} + + +/** + * Convert list of units into a tree + * + * @param ctx Parsing context + * @param unit_i Reference to current position in the unit list + * @param last Output parameter for the last unit on the same + * level as the returned node (used to deal with + * embedding of anonymous sentences) + * @return The first top level node from the current + * position in the list of units + */ +static struct libparser_unit * +make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last) +{ + /* TODO try to flatten to call stack */ + struct libparser_unit *unit, *middle, **p; + size_t start; + +#if PRINT_ACTIONS + static int indent = 0; + fprintf(stderr, "%*.s", indent, ""); + fprintf(stderr, "{ at %zu of %zu\n", *unit_i, ctx->units.count); + indent += 4; + fprintf(stderr, "%*.s", indent, ""); +#endif + + if (*unit_i == ctx->units.count) + abort(); + + unit = ctx->units.units[(*unit_i)++]; + unit->in = NULL; + unit->next = NULL; + + switch (unit->sentence->type) { + case LIBPARSER_SENTENCE_TYPE_RULE: + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: + case LIBPARSER_SENTENCE_TYPE_ALTERNATION: +#if PRINT_ACTIONS + fprintf(stderr, "Unary unit: %s: %s [%zu, %zu)\n", + sentence_type_string(unit->sentence->type), + unit->rule, unit->start, unit->end); +#endif + unit->in = make_tree(ctx, unit_i, last); + break; + + case LIBPARSER_SENTENCE_TYPE_REJECTION: +#if PRINT_ACTIONS + fprintf(stderr, "Unary lookahead unit: %s: %s [%zu, %zu)\n", + sentence_type_string(unit->sentence->type), + unit->rule, unit->start, unit->end); +#endif + unit->in = make_tree(ctx, unit_i, last); + start = unit->start; + dealloc_unit(unit); + unit = malloc(sizeof(*unit)); + unit->rule = "@epsilon"; + unit->sentence = ε + unit->in = unit->next = NULL; + unit->start = unit->end = start; + *last = unit; + goto out; + + case LIBPARSER_SENTENCE_TYPE_REPEATED: + case LIBPARSER_SENTENCE_TYPE_CONCATENATION: +#if PRINT_ACTIONS + fprintf(stderr, "Binary unit: %s: %s [%zu, %zu)\n", + sentence_type_string(unit->sentence->type), + unit->rule, unit->start, unit->end); +#endif + unit->in = make_tree(ctx, unit_i, &middle); + if (IS_ANONYMOUS(unit->in->rule)) { + embed_rule(&unit->in, &middle); + if (!middle) { + p = &unit->in; + goto p_found; + } + } + p = &middle->next; + if (*p) + while ((*p)->next) + p = &(*p)->next; + p_found: + *p = make_tree(ctx, unit_i, last); + if (IS_ANONYMOUS((*p)->rule)) + embed_rule(&*p, last); + goto out; + + case LIBPARSER_SENTENCE_TYPE_STRING: + case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: + case LIBPARSER_SENTENCE_TYPE_EXCEPTION: + case LIBPARSER_SENTENCE_TYPE_EOF: + case LIBPARSER_SENTENCE_TYPE_EPSILON: + *last = unit; +#if PRINT_ACTIONS + fprintf(stderr, "Atomic unit: %s: %s [%zu, %zu)\n", + sentence_type_string(unit->sentence->type), + unit->rule, unit->start, unit->end); +#endif + goto out; + default: + abort(); + } + + if (IS_ANONYMOUS(unit->in->rule)) + embed_rule(&unit->in, last); + +out: +#if PRINT_ACTIONS + indent -= 4; + fprintf(stderr, "%*.s", indent, ""); + fprintf(stderr, "} at %zu of %zu\n", *unit_i, ctx->units.count); +#endif + if (!unit) + abort(); + return unit; +} + + +/** + * Remove all epsilons from the tree + * + * @param unit The tree + * @return The tree + */ +static struct libparser_unit * +remove_epsilons(struct libparser_unit *unit) +{ + struct libparser_unit **node, *deleted; + + if (!unit) + return NULL; + + node = &unit; + while (*node) { + if ((*node)->sentence->type == LIBPARSER_SENTENCE_TYPE_EPSILON) { + deleted = *node; + *node = (*node)->next; + free(deleted); + } else { + (*node)->in = remove_epsilons((*node)->in); + node = &(*node)->next; + } + } + + return unit; +} + + +/** + * Check whether the inner most branching-point + * contains a rejection + * + * @param ctx Parsing context + * @reutrn 1 if branching-point contains rejection, + * 0 otherwise + */ +#ifdef __GNUC__ +__attribute__((__pure__)) +#endif +static int +choice_contains_rejection(struct context *ctx) +{ + /* TODO we should keep track of our rejections so we can do this in constant time */ + size_t i = ctx->choices.choices[ctx->choices.count - 1u].unit_index; + for (i += 1u; i < ctx->units.count; i++) + if (ctx->units.units[i]->sentence->type == LIBPARSER_SENTENCE_TYPE_REJECTION) + return 1; + return 0; } int libparser_parse_file(const struct libparser_rule *const rules[], const char *data, size_t length, struct libparser_unit **rootp) { - struct libparser_unit *ret, *t; + const char *rule; + const union libparser_sentence *sentence; + struct libparser_unit *ret = NULL, *t, *unit; struct context ctx; - size_t i; + size_t i, j, n; + int match; ctx.rules = rules; - ctx.cache = NULL; + ctx.choices.choices = NULL; + ctx.choices.count = 0; + ctx.choices.size = 0; + ctx.choices.next = 0; + ctx.followups.followups = NULL; + ctx.followups.size = 0; + ctx.followups.count = 0; + ctx.units.units = NULL; + ctx.units.size = 0; + ctx.units.count = 0; + ctx.unit_pool = NULL; ctx.data = data; ctx.length = length; ctx.position = 0; - ctx.done = 0; - ctx.error = 0; ctx.exception = 0; for (i = 0; rules[i]; i++) @@ -272,16 +1121,188 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat break; if (!rules[i]) abort(); + rule = rules[i]->name; + sentence = rules[i]->sentence; + +#if PRINT_ACTIONS + print_grammar(rules); +#endif - ret = try_match(rules[i]->name, rules[i]->sentence, &ctx); + /* TODO guard against left-side recursion */ + /* TODO make branching opt-in ("?"), and add commit statements ("+") */ + /* TODO break REPEATED on empty match */ +followup: +#if PRINT_ACTIONS + print_state(&ctx); +#endif + match = add_unit(rule, sentence, &ctx); + if (match < 0) { +#if PRINT_ACTIONS + fprintf(stderr, "Error encountered\n"); +#endif + goto out; + } + while (ctx.followups.count) { + i = --ctx.followups.count; + if (IS_COMPLETION(ctx.followups.followups[i])) { + unit = ctx.followups.followups[i].unit; + sentence = unit->sentence; +#if PRINT_ACTIONS + fprintf(stderr, "Popping completion of sentence: %s: %s, " + "for unit %zu, at %zu, consuming %zu subunits\n", + sentence_type_string(sentence->type), unit->rule, + ctx.followups.followups[i].unit_index, ctx.position, + ctx.units.count - ctx.followups.followups[i].unit_index - 1U); +#endif + if (sentence->type == LIBPARSER_SENTENCE_TYPE_REJECTION) { +#if PRINT_ACTIONS + fprintf(stderr, "Checking REJECTION\n"); +#endif + if (!match) { +#if PRINT_ACTIONS + fprintf(stderr, "Match set, replacing REJECTION with EPSILON\n"); + fprintf(stderr, "Before:\n"); + print_state(&ctx); +#endif + match = 1; + for (j = ctx.followups.followups[i].unit_index + 1u; j < ctx.units.count; j++) + free_unit(ctx.units.units[j], &ctx); + j = ctx.followups.followups[i].unit_index; + ctx.units.count = j + 1u; + if (unit != ctx.units.units[j]) + abort(); + if (unit->in) + abort(); + if (unit->next) + abort(); + unit->rule = "@epsilon"; + unit->sentence = ε + while (ctx.choices.count && ctx.choices.choices[ctx.choices.count - 1u].unit_index >= j) { + ctx.choices.count--; + free(ctx.choices.choices[ctx.choices.count].followups.followups); + } +#if PRINT_ACTIONS + fprintf(stderr, "After:\n"); + print_state(&ctx); + } else if (ctx.exception) { + fprintf(stderr, "Match and exception cleared\n"); + match = 0; +#endif + } else { +#if PRINT_ACTIONS + fprintf(stderr, "Match cleared\n"); +#endif + match = 0; + } + ctx.exception = 0; + ctx.position = unit->start; + } + if (!match) + goto mismatch; +#if PRINT_ACTIONS + fprintf(stderr, "Setting end of popped unit to %zu\n", ctx.position); +#endif + unit->end = ctx.position; + } else if (match) { + rule = ctx.followups.followups[i].rule; + sentence = ctx.followups.followups[i].sentence; +#if PRINT_ACTIONS + fprintf(stderr, "Popping follow-up sentence: %s: %s\n", + sentence_type_string(sentence->type), rule); +#endif + goto followup; + } else mismatch: if (ctx.choices.count && !choice_contains_rejection(&ctx)) { + i = ctx.choices.count - 1u; + ctx.choices.next = i; + n = ctx.units.count; + ctx.units.count = ctx.choices.choices[i].unit_index; + for (j = ctx.units.count; j < n; j++) + free_unit(ctx.units.units[j], &ctx); + if (ctx.choices.choices[i].followups.count > ctx.followups.size) + abort(); + ctx.followups.count = ctx.choices.choices[i].followups.count; + memcpy(ctx.followups.followups, + ctx.choices.choices[i].followups.followups, + ctx.followups.count * sizeof(*ctx.followups.followups)); + if (ctx.choices.choices[i].choice == EXHAUSTED) { + free(ctx.choices.choices[i].followups.followups); + ctx.choices.count--; + goto mismatch; + } + ctx.choices.choices[i].choice++; + ctx.position = ctx.choices.choices[i].position; + rule = ctx.choices.choices[i].rule; + sentence = ctx.choices.choices[i].sentence; +#if PRINT_ACTIONS + fprintf(stderr, "Mismatch: retrying from branching point %zu (%s, %s), " + "and unit %zu, from %zu, now with choice %zu\n", + i, sentence_type_string(sentence->type), rule, ctx.position, + ctx.units.count, ctx.choices.choices[i].choice); +#endif + goto followup; +#if PRINT_ACTIONS + } else { + fprintf(stderr, "Mismatch with %zu pending follow-ups and no choices, " + "continuing polling follow-ups\n", + ctx.followups.count); + print_state(&ctx); +#endif + } + } + +#if PRINT_ACTIONS + fprintf(stderr, "Result determined: %s\n", match ? "match" : "mismatch"); + fprintf(stderr, "Due to exception?: %s\n", ctx.exception ? "yes" : "no"); + print_state(&ctx); +#endif + + if (match) { + if (!ctx.units.count) + abort(); +#if PRINT_ACTIONS + fprintf(stderr, "Converting into tree\n"); +#endif + i = 0; + ret = make_tree(&ctx, &i, &(struct libparser_unit *){NULL}); + if (ret && IS_ANONYMOUS(ret->rule)) { +#if PRINT_ACTIONS + fprintf(stderr, "Embedding at top level:\n "); +#endif + embed_rule(&ret, NULL); + } + if (i != ctx.units.count) + abort(); + ctx.units.count = 0; + + ret = remove_epsilons(ret); + +#if PRINT_ACTIONS + fprintf(stderr, "Match tree:\n"); + print_tree(ret, 4); +#endif + } else if (ctx.choices.count) { +#if PRINT_ACTIONS + fprintf(stderr, "Result with mismatch but remaining choices, trying next alternative\n"); +#endif + ctx.exception = 0; + goto mismatch; + } - while (ctx.cache) { - t = ctx.cache; - ctx.cache = t->next; +out: + for (i = 0; i < ctx.choices.count; i++) + free(ctx.choices.choices[i].followups.followups); + free(ctx.choices.choices); + free(ctx.followups.followups); + for (i = 0; i < ctx.units.count; i++) + dealloc_unit(ctx.units.units[i]); + free(ctx.units.units); + while (ctx.unit_pool) { + t = ctx.unit_pool; + ctx.unit_pool = t->next; free(t); } - if (ctx.error) { + if (match < 0) { dealloc_unit(ret); *rootp = NULL; return -1; |
