/* See LICENSE file for copyright and license details. */ #include "libparser.h" #include #include #include #include #define PRINT_ACTIONS 0 #if PRINT_ACTIONS # include # include #endif #define IS_ANONYMOUS(RULE) (!(RULE) || (RULE)[0] == '_') /* NULL is used when evaluating a subsentence, * and rule with the prefix "_" are embedded * as subsentences */ #define IS_COMPLETION(FOLLOWUP) (!!(FOLLOWUP).unit) #define EXHAUSTED SIZE_MAX /** * Follow-up stack node * * `.unit` decides whether it is a completion * marker or a subsequent sentence; it is a * completion marker if and only iff `.unit` * is non-`NULL`. */ struct followup { /** * The unit being completed; if `NULL` * it is a subsequent sentence rather * than a completion marker and, * `.sentence` and `.rule` should be used * instead */ struct libparser_unit *unit; /** * Set if and only if `.unit` is non-`NULL`: * the index of the node */ size_t unit_index; /** * Set if and only if `.unit` is non-`NULL`: * the index of the node of the previous * rejection, `SIZE_MAX` if none */ size_t previous_rejection; /* -- or -- */ /** * Set if and only if `.unit` is `NULL`: * the `rule` argument `add_unit` should * be called with */ const char *rule; /** * Set if and only if `.unit` is `NULL`: * the `sentence` argument `add_unit` * should be called with */ const union libparser_sentence *sentence; }; /** * Branching choice */ struct choice { /** * The current choice value, `EXHAUSTED` * when the choices are exhausted and the * sentence cannot be matched */ size_t choice; /** * The position in the input text */ size_t position; /** * The `rule` argument `add_unit` was * called with */ const char *rule; /** * The `sentence` argument `add_unit` * was called with */ const union libparser_sentence *sentence; /** * The value `.units.count` of the `ctx` * argument `add_unit` was called with */ size_t unit_index; /** * The value `.previous_rejection` of the `ctx` * argument `add_unit` was called with */ size_t previous_rejection; /** * Save state of the follow-up stack */ struct { /** * The follow-ups */ struct followup *followups; /* TODO add memory pool, maybe even use a memory stack */ /** * The number of follow-ups; this is * also the number of allocated elements */ size_t count; } followups; }; /** * Parsing state and context */ struct context { /** * The rule table */ const struct libparser_rule *const *rules; /** * Memory pool for parsing units */ struct libparser_unit *unit_pool; /** * Current choices */ struct { /** * The branching points and the choices made there */ struct choice *choices; /** * Number of allocated choices */ size_t size; /** * Number of made choices (it is be temporarily * one more then .next when amending a choice) */ size_t count; /** * The index of the next choice */ size_t next; } choices; /** * Follow-up stack; it is popped any time * a sentence (including literals and * subsentences) has ended */ struct { /** * The follow-ups */ struct followup *followups; /** * The number of allocated slots */ size_t size; /** * The number of used slots */ size_t count; } followups; /** * List of units leading up to the end * of the input */ struct { /** * The units */ struct libparser_unit **units; /** * The number of allocated slots */ size_t size; /** * The number of used slots */ size_t count; } units; /** * The index of the node of the previous * rejection, `SIZE_MAX` if none */ size_t previous_rejection; /** * The text being parsed */ const char *data; /** * The length of the text */ size_t length; /** * The current position in the text */ size_t position; /** * Set when an exception-statement reached is encountered; * cleared when rejected the exception is rejected */ char exception; }; /** * Epsilon rule, used when en OPTIONAL is not take */ static union libparser_sentence epsilon = {.type = LIBPARSER_SENTENCE_TYPE_EPSILON}; #if PRINT_ACTIONS static const char * sentence_type_string(enum libparser_sentence_type type) { switch (type) { case LIBPARSER_SENTENCE_TYPE_CONCATENATION: return "CONCATENATION"; case LIBPARSER_SENTENCE_TYPE_ALTERNATION: return "ALTERNATION"; case LIBPARSER_SENTENCE_TYPE_REJECTION: return "REJECTION"; case LIBPARSER_SENTENCE_TYPE_OPTIONAL: return "OPTIONAL"; case LIBPARSER_SENTENCE_TYPE_REPEATED: return "REPEATED"; case LIBPARSER_SENTENCE_TYPE_STRING: return "STRING"; case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: return "CHAR_RANGE"; case LIBPARSER_SENTENCE_TYPE_RULE: return "RULE"; case LIBPARSER_SENTENCE_TYPE_EXCEPTION: return "EXCEPTION"; case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF"; case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON"; case LIBPARSER_SENTENCE_TYPE_COMMITTED: return "COMMITTED"; case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: return "ND_OPTIONAL"; case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: return "ND_REPEATED"; case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: return "ND_ALTERNATION"; default: return ""; } } static void print_tree(struct libparser_unit *unit, int indent) { for (; unit; unit = unit->next) { fprintf(stderr, "%*.s", indent, ""); fprintf(stderr, "%s: %s%s%s [%zu, %zu) {%p}\n", unit->rule, sentence_type_string(unit->sentence->type), unit->sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "", unit->sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? unit->sentence->rule.rule : "", unit->start, unit->end, (void *)unit); print_tree(unit->in, indent + 2); } } static void print_state(struct context *ctx) { struct libparser_unit *unit; const union libparser_sentence *sentence; size_t i; fprintf(stderr, "Position in text: %zu of %zu\n", ctx->position, ctx->length); fprintf(stderr, "Exception state: %i\n", (int)ctx->exception); fprintf(stderr, "Have %zu units:\n", ctx->units.count); for (i = 0; i < ctx->units.count; i++) { if (ctx->units.units[i]->next) fprintf(stderr, " (\n"); print_tree(ctx->units.units[i], 4); if (ctx->units.units[i]->next) fprintf(stderr, " )\n"); } fprintf(stderr, "Have %zu choices:\n", ctx->choices.count); for (i = 0; i < ctx->choices.count; i++) { fprintf(stderr, " For unit %zu with %zu followups at %zu: %s: %s, ", ctx->choices.choices[i].unit_index, ctx->choices.choices[i].followups.count, ctx->choices.choices[i].position, sentence_type_string(ctx->choices.choices[i].sentence->type), ctx->choices.choices[i].rule); if (ctx->choices.choices[i].choice == EXHAUSTED) fprintf(stderr, "choices EXHAUSTED; "); else fprintf(stderr, "using choice %zu; ", ctx->choices.choices[i].choice); if (ctx->choices.choices[i].previous_rejection == SIZE_MAX) { fprintf(stderr, "no rejection stored\n"); } else { fprintf(stderr, "previous rejection: %zu\n", ctx->choices.choices[i].previous_rejection); } } fprintf(stderr, "Have %zu follow-ups pending (outermost at top):\n", ctx->followups.count); for (i = 0; i < ctx->followups.count; i++) { if (IS_COMPLETION(ctx->followups.followups[i])) { unit = ctx->followups.followups[i].unit; sentence = unit->sentence; fprintf(stderr, " Completion: unit %zu: %s: %s%s%s [%zu, %zu); ", ctx->followups.followups[i].unit_index, unit->rule, sentence_type_string(sentence->type), sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "", sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? sentence->rule.rule : "", unit->start, unit->end); if (ctx->followups.followups[i].previous_rejection == SIZE_MAX) { fprintf(stderr, "no rejection stored\n"); } else { fprintf(stderr, "previous rejection: %zu\n", ctx->followups.followups[i].previous_rejection); } } else { sentence = ctx->followups.followups[i].sentence; fprintf(stderr, " (%s, %s%s%s)\n", ctx->followups.followups[i].rule, sentence_type_string(sentence->type), sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "", sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? sentence->rule.rule : ""); } } } static int print_text(const char *text, size_t textlen) { size_t off = 0, n = 0; int len, ret = 0; while (off + n < textlen) { if (!text[off + n]) break; if (text[off + n] < ' ' || text[off + n] >= 0x7F || text[off + n] == '"' || text[off + n] == '\\') { fprintf(stderr, "%.*s%n", (int)n, &text[off], &len); ret += len; off += n; n = 0; switch (text[off]) { case '\\': fprintf(stderr, "\\\\%n", &len); break; case '\"': fprintf(stderr, "\\\"%n", &len); break; case '\a': fprintf(stderr, "\\a%n", &len); break; case '\b': fprintf(stderr, "\\b%n", &len); break; case '\f': fprintf(stderr, "\\f%n", &len); break; case '\n': fprintf(stderr, "\\n%n", &len); break; case '\r': fprintf(stderr, "\\r%n", &len); break; case '\t': fprintf(stderr, "\\t%n", &len); break; case '\v': fprintf(stderr, "\\v%n", &len); break; default: fprintf(stderr, "\\x%02X%n", +(unsigned char)text[off], &len); break; } off++; } else if (n == 4096U) { fprintf(stderr, "%.*s%n", (int)n, &text[off], &len); ret += len; off += n; n = 0; } else { n++; } } if (n) { fprintf(stderr, "%.*s%n", (int)n, &text[off], &len); ret += len; } return ret; } static int print_sentence(const union libparser_sentence *sentence, int indent) { unsigned char low, high; int len; switch (sentence->type) { case LIBPARSER_SENTENCE_TYPE_CONCATENATION: fprintf(stderr, "("); indent = print_sentence(sentence->binary.left, indent + 1); fprintf(stderr, ", "); indent = print_sentence(sentence->binary.right, indent + 2); fprintf(stderr, ")"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: fprintf(stderr, "("); print_sentence(sentence->binary.left, indent + 1); fprintf(stderr, " ?| \n%*.s", indent + 1, ""); indent = print_sentence(sentence->binary.right, indent + 1); fprintf(stderr, ")"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_ALTERNATION: fprintf(stderr, "("); print_sentence(sentence->binary.left, indent + 1); fprintf(stderr, " | \n%*.s", indent + 1, ""); indent = print_sentence(sentence->binary.right, indent + 1); fprintf(stderr, ")"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_REJECTION: fprintf(stderr, "!("); indent = print_sentence(sentence->unary.sentence, indent + 2); fprintf(stderr, ")"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_COMMITTED: fprintf(stderr, "+("); indent = print_sentence(sentence->unary.sentence, indent + 2); fprintf(stderr, ")"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: fprintf(stderr, "?"); /* fall through */ case LIBPARSER_SENTENCE_TYPE_OPTIONAL: fprintf(stderr, "["); indent = print_sentence(sentence->unary.sentence, indent + 1); fprintf(stderr, "]"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: fprintf(stderr, "?"); /* fall through */ case LIBPARSER_SENTENCE_TYPE_REPEATED: fprintf(stderr, "{"); indent = print_sentence(sentence->unary.sentence, indent + 1); fprintf(stderr, "}"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_STRING: fprintf(stderr, "\"%n", &len); indent += len + print_text(sentence->string.string, sentence->string.length); fprintf(stderr, "\"%n", &len); indent += len; break; case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: low = sentence->char_range.low; high = sentence->char_range.high; if (isprint(low) && isprint(high)) fprintf(stderr, "<\"%c\", \"%c\">%n", low, high, &len); else if (isprint(sentence->char_range.low)) fprintf(stderr, "<\"%c\", 0x%02x>%n", low, high, &len); else if (isprint(sentence->char_range.high)) fprintf(stderr, "<0x%02x, \"%c\">%n", low, high, &len); else fprintf(stderr, "<0x%02x, 0x%02x>%n", low, high, &len); indent += len; break; case LIBPARSER_SENTENCE_TYPE_RULE: fprintf(stderr, "%s%n", sentence->rule.rule, &len); indent += len; break; case LIBPARSER_SENTENCE_TYPE_EXCEPTION: fprintf(stderr, "-"); indent += 1; break; case LIBPARSER_SENTENCE_TYPE_EOF: fprintf(stderr, "%s%n", "!<0x00, 0xFF>", &len); indent += len; break; case LIBPARSER_SENTENCE_TYPE_EPSILON: fprintf(stderr, "@EPSILON"); indent += 8; break; default: abort(); } return indent; } static void print_grammar(const struct libparser_rule *const *rules) { size_t i; int indent; fprintf(stderr, "Rules:\n"); for (i = 0; rules[i]; i++) { fprintf(stderr, " %s = %n", rules[i]->name, &indent); print_sentence(rules[i]->sentence, indent); fprintf(stderr, ";\n"); } } #endif /** * Embed a rule's or subsentence's matches * into where the rule or subsentence was * matched * * @param where The pointer to the match * @param last Optional output paramter for the * last embedded, top-level unit */ static void embed_rule(struct libparser_unit **where, struct libparser_unit **last) { struct libparser_unit *old = *where; #if PRINT_ACTIONS fprintf(stderr, "Embedding %s: %s [%zu, %zu)\n", old->rule, sentence_type_string(old->sentence->type), old->start, old->end); #endif if (old->next) abort(); if (old->in == old) abort(); *where = old->in; free(old); if (last) { *last = *where; if (*last) while ((*last)->next) *last = (*last)->next; } } /** * Recursively move a unit into * the memory allocation cache * * @param unit Unit to place in the cache * @param ctx Parsing context (holds the cache) */ static void free_unit(struct libparser_unit *unit, struct context *ctx) { struct libparser_unit *prev; while (unit) { free_unit(unit->in, ctx); prev = unit; unit = unit->next; prev->next = ctx->unit_pool; ctx->unit_pool = prev; } } /** * Recursively deallocate a unit * * @param unit Unit to deallocate */ static void dealloc_unit(struct libparser_unit *unit) { struct libparser_unit *next; for (; unit; unit = next) { dealloc_unit(unit->in); next = unit->next; free(unit); } } /** * Allocate, without initialising * new unit, but first try to reuse * from the memory cache * * @param ctx Parsing context * @return Newly allocated (or polled from the * cache) unit, `NULL` on failure */ static struct libparser_unit * alloc_unit(struct context *ctx) { struct libparser_unit *unit; if (!ctx->unit_pool) { unit = malloc(sizeof(*unit)); if (!unit) return NULL; } else { unit = ctx->unit_pool; ctx->unit_pool = unit->next; } return unit; } /** * Grow the unit list (if ncessary) and return a * pointer the next empty slot * * @param ctx Parsing context * @return Pointer to the unit slot */ static struct libparser_unit ** new_unit(struct context *ctx) { void *new; if (ctx->units.count == ctx->units.size) { if (ctx->units.size > SIZE_MAX / sizeof(*ctx->units.units) - 64u) { enomem: errno = ENOMEM; return NULL; } ctx->units.size += 64u; new = realloc(ctx->units.units, ctx->units.size * sizeof(*ctx->units.units)); if (!new) goto enomem; ctx->units.units = new; } #if PRINT_ACTIONS fprintf(stderr, "New unit slot at %zu\n", ctx->units.count); #endif return &ctx->units.units[ctx->units.count++]; } /** * Create a new follow-up node and push it * into the follow-up stack * * @param ctx Parsing context * @return The follow-up node, `NULL` on failure */ static struct followup * new_followup(struct context *ctx) { void *new; if (ctx->followups.count == ctx->followups.size) { if (ctx->followups.size > SIZE_MAX / sizeof(*ctx->followups.followups) - 64u) { enomem: errno = ENOMEM; return NULL; } ctx->followups.size += 64u; new = realloc(ctx->followups.followups, ctx->followups.size * sizeof(*ctx->followups.followups)); if (!new) goto enomem; ctx->followups.followups = new; } #if PRINT_ACTIONS fprintf(stderr, "New follow-up slot at %zu\n", ctx->followups.count); #endif return &ctx->followups.followups[ctx->followups.count++]; } /** * Push sentence into follow-up stack * * @param rule The name of the rule, or `NULL` if it is an unnamed subsentence * @param sentence The sentence to push * @param ctx Parsing context * @return 0 on success, -1 on failure */ static int push(const char *rule, const union libparser_sentence *sentence, struct context *ctx) { struct followup *followup = new_followup(ctx); if (!followup) return -1; followup->sentence = sentence; followup->unit = NULL; followup->rule = rule; #if PRINT_ACTIONS fprintf(stderr, "Pushing follow-up sentence: %s: %s\n", sentence_type_string(sentence->type), rule); #endif return 0; } /** * Create a unit for a sentence and push it into * the follow-up stack to mark the sentence as * completed when the follow-up node is processed * * @param rule The name of the rule, or `NULL` if it is an unnamed subsentence * @param sentence The sentence to push * @param start The position in the input text where the sentence begins * @param ctx Parsing context * @return 0 on success, -1 on failure */ static int incomplete(const char *rule, const union libparser_sentence *sentence, size_t start, struct context *ctx) { struct followup *followup; struct libparser_unit *unit; struct libparser_unit **unit_slot; unit = alloc_unit(ctx); if (!unit) return -1; unit->in = unit->next = NULL; unit->rule = rule; unit->start = start; unit->end = start; unit->sentence = sentence; followup = new_followup(ctx); if (!followup) { free_unit(unit, ctx); return -1; } followup->unit = unit; followup->unit_index = ctx->units.count; followup->previous_rejection = ctx->previous_rejection; #if PRINT_ACTIONS fprintf(stderr, "Pushing follow-up completion for unit %zu: %s: %s\n", ctx->units.count, sentence_type_string(sentence->type), rule); #endif unit_slot = new_unit(ctx); if (!unit_slot) { free_unit(unit, ctx); return -1; } *unit_slot = unit; #if PRINT_ACTIONS fprintf(stderr, "Writing unit to slot: %s: %s, spanning [%zu, %zu)\n", unit->rule, sentence_type_string(unit->sentence->type), unit->start, unit->end); #endif return 0; } /** * Add branching point to list of choices, * or if already added, just the get the * current choice * * @param rule The name of the rule that is a brachning * point, `NULL` if a subsentence * @param sentence The sentence that is brachning point * @param start The position in the input text where the * brachning begins * @param choice_out Output parameter for the current choice value * @return 0 on success, -1 on failure */ static int braching(const char *rule, const union libparser_sentence *sentence, size_t start, struct context *ctx, size_t *choice_out) { struct choice *choice; size_t size; void *new; if (ctx->choices.next == ctx->choices.count) { if (ctx->choices.count == ctx->choices.size) { size = ctx->choices.size; if (size > SIZE_MAX - 8u) { enomem: errno = ENOMEM; return -1; } size += 8u; if (size > SIZE_MAX / sizeof(*ctx->choices.choices)) goto enomem; new = realloc(ctx->choices.choices, size * sizeof(*ctx->choices.choices)); if (!new) goto enomem; ctx->choices.choices = new; ctx->choices.size = size; } choice = &ctx->choices.choices[ctx->choices.next]; choice->choice = 0; choice->position = start; choice->rule = rule; choice->sentence = sentence; choice->unit_index = ctx->units.count; choice->previous_rejection = ctx->previous_rejection; choice->followups.followups = NULL; choice->followups.count = ctx->followups.count; if (choice->followups.count) { size = choice->followups.count * sizeof(*choice->followups.followups); choice->followups.followups = malloc(size); if (!choice->followups.followups) goto enomem; memcpy(choice->followups.followups, ctx->followups.followups, size); } ctx->choices.count++; #if PRINT_ACTIONS fprintf(stderr, "Adding branching point at %zu for unit %zu: %s: %s, at %zu in text\n", ctx->choices.next, ctx->units.count, sentence_type_string(sentence->type), rule, start); } else { fprintf(stderr, "Revisiting branching point %zu at %zu in text for unit %zu: " "%s: %s, at %zu in text; choice value: %zu; ", ctx->choices.next, start, ctx->choices.choices[ctx->choices.next].unit_index, sentence_type_string(ctx->choices.choices[ctx->choices.next].sentence->type), ctx->choices.choices[ctx->choices.next].rule, ctx->choices.choices[ctx->choices.next].position, ctx->choices.choices[ctx->choices.next].choice); if (ctx->choices.choices[ctx->choices.next].previous_rejection == SIZE_MAX) { fprintf(stderr, "no rejection stored\n"); } else { fprintf(stderr, "previous rejection: %zu\n", ctx->choices.choices[ctx->choices.next].previous_rejection); } #endif } *choice_out = ctx->choices.choices[ctx->choices.next++].choice; return 0; } /** * Add a sentence, as a unit, to the list of units, * will adding to the follow-up stack, the subsentences * the the completion of the sentence unless it is * atomic, and for branching points, record the choice * with backtracking data * * @param rule The name of the rule, `NULL` if it is a subsentence * @param sentence The sentence to add * @param ctx Parsing context * @return 1 on matched or incomplete, 0 on mismatch, -1 on failure */ static int add_unit(const char *rule, const union libparser_sentence *sentence, struct context *ctx) { struct libparser_unit *unit; struct libparser_unit **unit_slot; size_t start = ctx->position; unsigned char c; size_t i, choice; #if PRINT_ACTIONS fprintf(stderr, "Visiting %s: %s, at %zu in text\n", sentence_type_string(sentence->type), rule, start); #endif switch (sentence->type) { case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: case LIBPARSER_SENTENCE_TYPE_ALTERNATION: if (braching(rule, sentence, start, ctx, &choice)) return -1; if (choice > 1u) goto exhausted; if (incomplete(rule, sentence, start, ctx) || push(NULL, choice ? sentence->binary.right : sentence->binary.left, ctx)) return -1; return 1; case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: case LIBPARSER_SENTENCE_TYPE_REPEATED: if (braching(rule, sentence, start, ctx, &choice)) return -1; if (choice == 0u) { if (incomplete(rule, sentence, start, ctx) || push(NULL, sentence, ctx) || push(NULL, sentence->unary.sentence, ctx)) return -1; } else if (choice == 1u) { if (incomplete(rule, sentence, start, ctx) || push("@epsilon", &epsilon, ctx) || push(NULL, sentence->unary.sentence, ctx)) return -1; } else if (choice == 2u) { if (incomplete(rule, sentence, start, ctx) || push("@epsilon", &epsilon, ctx) || push("@epsilon", &epsilon, ctx)) return -1; } else { goto exhausted; } return 1; case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: case LIBPARSER_SENTENCE_TYPE_OPTIONAL: if (braching(rule, sentence, start, ctx, &choice)) return -1; if (choice == 0u) { if (incomplete(rule, sentence, start, ctx) || push(NULL, sentence->unary.sentence, ctx)) return -1; } else if (choice == 1u) { if (incomplete(rule, sentence, start, ctx) || push("@epsilon", &epsilon, ctx)) return -1; } else { goto exhausted; } return 1; case LIBPARSER_SENTENCE_TYPE_CONCATENATION: if (incomplete(rule, sentence, start, ctx) || push(NULL, sentence->binary.right, ctx) || push(NULL, sentence->binary.left, ctx)) return -1; return 1; case LIBPARSER_SENTENCE_TYPE_REJECTION: if (incomplete(NULL, sentence, start, ctx) || push(NULL, sentence->unary.sentence, ctx)) return -1; ctx->previous_rejection = ctx->units.count - 1u; return 1; case LIBPARSER_SENTENCE_TYPE_COMMITTED: if (incomplete(NULL, sentence, start, ctx) || push(NULL, sentence->unary.sentence, ctx)) return -1; return 1; case LIBPARSER_SENTENCE_TYPE_RULE: for (i = 0; ctx->rules[i]; i++) if (!strcmp(ctx->rules[i]->name, sentence->rule.rule)) break; if (!ctx->rules[i]) abort(); if (incomplete(rule, sentence, start, ctx) || push(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx)) return -1; return 1; case LIBPARSER_SENTENCE_TYPE_STRING: if (sentence->string.length > ctx->length - ctx->position || memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length)) return 0; ctx->position += sentence->string.length; break; case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: if (ctx->position == ctx->length) goto mismatch; c = ((const unsigned char *)ctx->data)[ctx->position]; if (sentence->char_range.low > c || c > sentence->char_range.high) goto mismatch; ctx->position += 1u; break; case LIBPARSER_SENTENCE_TYPE_EXCEPTION: ctx->exception = 1; break; case LIBPARSER_SENTENCE_TYPE_EOF: if (ctx->position != ctx->length) goto mismatch; break; case LIBPARSER_SENTENCE_TYPE_EPSILON: break; default: abort(); } /* for atomic sentences */ unit = alloc_unit(ctx); if (!unit) return -1; unit->in = unit->next = NULL; unit->rule = rule; unit->start = start; unit->end = ctx->position; unit->sentence = sentence; unit_slot = new_unit(ctx); if (!unit_slot) { free_unit(unit, ctx); return -1; } *unit_slot = unit; #if PRINT_ACTIONS fprintf(stderr, "Writing atomic unit to slot: %s: %s, spanning [%zu, %zu)\n", sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end); #endif return 1; exhausted: ctx->choices.choices[ctx->choices.next - 1U].choice = EXHAUSTED; mismatch: #if PRINT_ACTIONS fprintf(stderr, "Returning mismatch\n"); #endif return 0; } /** * Convert list of units into a tree * * @param ctx Parsing context * @param unit_i Reference to current position in the unit list * @param last Output parameter for the last unit on the same * level as the returned node (used to deal with * embedding of anonymous sentences) * @return The first top level node from the current * position in the list of units */ static struct libparser_unit * make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last) { /* TODO try to flatten to call stack */ struct libparser_unit *unit, *middle, **p; size_t start; #if PRINT_ACTIONS static int indent = 0; fprintf(stderr, "%*.s", indent, ""); fprintf(stderr, "{ at %zu of %zu\n", *unit_i, ctx->units.count); indent += 4; fprintf(stderr, "%*.s", indent, ""); #endif if (*unit_i == ctx->units.count) abort(); unit = ctx->units.units[(*unit_i)++]; unit->in = NULL; unit->next = NULL; switch (unit->sentence->type) { case LIBPARSER_SENTENCE_TYPE_RULE: case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: case LIBPARSER_SENTENCE_TYPE_OPTIONAL: case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: case LIBPARSER_SENTENCE_TYPE_ALTERNATION: case LIBPARSER_SENTENCE_TYPE_COMMITTED: #if PRINT_ACTIONS fprintf(stderr, "Unary unit: %s: %s [%zu, %zu)\n", sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end); #endif unit->in = make_tree(ctx, unit_i, last); break; case LIBPARSER_SENTENCE_TYPE_REJECTION: #if PRINT_ACTIONS fprintf(stderr, "Unary lookahead unit: %s: %s [%zu, %zu)\n", sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end); #endif unit->in = make_tree(ctx, unit_i, last); start = unit->start; dealloc_unit(unit); unit = malloc(sizeof(*unit)); unit->rule = "@epsilon"; unit->sentence = ε unit->in = unit->next = NULL; unit->start = unit->end = start; *last = unit; goto out; case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: case LIBPARSER_SENTENCE_TYPE_REPEATED: case LIBPARSER_SENTENCE_TYPE_CONCATENATION: #if PRINT_ACTIONS fprintf(stderr, "Binary unit: %s: %s [%zu, %zu)\n", sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end); #endif unit->in = make_tree(ctx, unit_i, &middle); if (IS_ANONYMOUS(unit->in->rule)) { embed_rule(&unit->in, &middle); if (!middle) { p = &unit->in; goto p_found; } } p = &middle->next; if (*p) while ((*p)->next) p = &(*p)->next; p_found: *p = make_tree(ctx, unit_i, last); if (IS_ANONYMOUS((*p)->rule)) embed_rule(&*p, last); goto out; case LIBPARSER_SENTENCE_TYPE_STRING: case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: case LIBPARSER_SENTENCE_TYPE_EXCEPTION: case LIBPARSER_SENTENCE_TYPE_EOF: case LIBPARSER_SENTENCE_TYPE_EPSILON: *last = unit; #if PRINT_ACTIONS fprintf(stderr, "Atomic unit: %s: %s [%zu, %zu)\n", sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end); #endif goto out; default: abort(); } if (IS_ANONYMOUS(unit->in->rule)) embed_rule(&unit->in, last); out: #if PRINT_ACTIONS indent -= 4; fprintf(stderr, "%*.s", indent, ""); fprintf(stderr, "} at %zu of %zu\n", *unit_i, ctx->units.count); #endif if (!unit) abort(); return unit; } /** * Remove all epsilons from the tree * * @param unit The tree * @return The tree */ static struct libparser_unit * remove_epsilons(struct libparser_unit *unit) { struct libparser_unit **node, *deleted; if (!unit) return NULL; node = &unit; while (*node) { if ((*node)->sentence->type == LIBPARSER_SENTENCE_TYPE_EPSILON) { deleted = *node; *node = (*node)->next; free(deleted); } else { (*node)->in = remove_epsilons((*node)->in); node = &(*node)->next; } } return unit; } /** * Check whether the inner most branching-point * contains a rejection * * @param ctx Parsing context * @reutrn 1 if branching-point contains rejection, * 0 otherwise */ #ifdef __GNUC__ __attribute__((__pure__)) #endif static int choice_contains_rejection(struct context *ctx) { size_t i = ctx->choices.choices[ctx->choices.count - 1u].previous_rejection; return ctx->previous_rejection != i; } int libparser_parse_file(const struct libparser_rule *const rules[], const char *data, size_t length, struct libparser_unit **rootp) { const char *rule; const union libparser_sentence *sentence; struct libparser_unit *ret = NULL, *t, *unit; struct context ctx; size_t i, j, n; int match; ctx.rules = rules; ctx.choices.choices = NULL; ctx.choices.count = 0; ctx.choices.size = 0; ctx.choices.next = 0; ctx.followups.followups = NULL; ctx.followups.size = 0; ctx.followups.count = 0; ctx.units.units = NULL; ctx.units.size = 0; ctx.units.count = 0; ctx.unit_pool = NULL; ctx.previous_rejection = SIZE_MAX; ctx.data = data; ctx.length = length; ctx.position = 0; ctx.exception = 0; for (i = 0; rules[i]; i++) if (!strcmp(rules[i]->name, "@start")) break; if (!rules[i]) abort(); rule = rules[i]->name; sentence = rules[i]->sentence; #if PRINT_ACTIONS print_grammar(rules); fprintf(stderr, "Input text: \""); print_text(ctx.data, ctx.length); fprintf(stderr, "\"\n"); #endif /* TODO guard against left-side recursion */ /* TODO break REPEATED on empty match */ followup: #if PRINT_ACTIONS print_state(&ctx); #endif match = add_unit(rule, sentence, &ctx); if (match < 0) { #if PRINT_ACTIONS fprintf(stderr, "Error encountered\n"); #endif goto out; } while (ctx.followups.count) { i = --ctx.followups.count; if (IS_COMPLETION(ctx.followups.followups[i])) { unit = ctx.followups.followups[i].unit; sentence = unit->sentence; #if PRINT_ACTIONS fprintf(stderr, "Popping completion of sentence: %s: %s, " "for unit %zu, at %zu, consuming %zu subunits\n", sentence_type_string(sentence->type), unit->rule, ctx.followups.followups[i].unit_index, ctx.position, ctx.units.count - ctx.followups.followups[i].unit_index - 1U); #endif if (sentence->type == LIBPARSER_SENTENCE_TYPE_REJECTION) { #if PRINT_ACTIONS fprintf(stderr, "Checking REJECTION\n"); #endif if (!match) { #if PRINT_ACTIONS fprintf(stderr, "Match set, replacing REJECTION with EPSILON\n"); fprintf(stderr, "Before:\n"); print_state(&ctx); #endif match = 1; for (j = ctx.followups.followups[i].unit_index + 1u; j < ctx.units.count; j++) free_unit(ctx.units.units[j], &ctx); j = ctx.followups.followups[i].unit_index; ctx.units.count = j + 1u; if (unit != ctx.units.units[j]) abort(); if (unit->in) abort(); if (unit->next) abort(); unit->rule = "@epsilon"; unit->sentence = ε while (ctx.choices.count && ctx.choices.choices[ctx.choices.count - 1u].unit_index >= j) { ctx.choices.count--; free(ctx.choices.choices[ctx.choices.count].followups.followups); } #if PRINT_ACTIONS fprintf(stderr, "After:\n"); print_state(&ctx); } else if (ctx.exception) { fprintf(stderr, "Match and exception cleared\n"); match = 0; #endif } else { #if PRINT_ACTIONS fprintf(stderr, "Match cleared\n"); #endif match = 0; } ctx.exception = 0; ctx.position = unit->start; } if (!match) goto mismatch; if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED || sentence->type == LIBPARSER_SENTENCE_TYPE_ALTERNATION || sentence->type == LIBPARSER_SENTENCE_TYPE_OPTIONAL || sentence->type == LIBPARSER_SENTENCE_TYPE_REPEATED) { #if PRINT_ACTIONS fprintf(stderr, "Removing interior choices for matched %s\n", sentence_type_string(sentence->type)); #endif j = ctx.followups.followups[i].unit_index; while (ctx.choices.count && ctx.choices.choices[ctx.choices.count - 1u].unit_index > j) { ctx.choices.next = --ctx.choices.count; free(ctx.choices.choices[ctx.choices.count].followups.followups); } } #if PRINT_ACTIONS fprintf(stderr, "Setting end of popped unit to %zu\n", ctx.position); #endif unit->end = ctx.position; ctx.previous_rejection = ctx.followups.followups[i].previous_rejection; } else if (match) { rule = ctx.followups.followups[i].rule; sentence = ctx.followups.followups[i].sentence; #if PRINT_ACTIONS fprintf(stderr, "Popping follow-up sentence: %s: %s\n", sentence_type_string(sentence->type), rule); #endif goto followup; } else mismatch: if (ctx.choices.count && !choice_contains_rejection(&ctx)) { i = ctx.choices.count - 1u; ctx.choices.next = i; n = ctx.units.count; ctx.units.count = ctx.choices.choices[i].unit_index; for (j = ctx.units.count; j < n; j++) free_unit(ctx.units.units[j], &ctx); if (ctx.choices.choices[i].followups.count > ctx.followups.size) abort(); ctx.followups.count = ctx.choices.choices[i].followups.count; memcpy(ctx.followups.followups, ctx.choices.choices[i].followups.followups, ctx.followups.count * sizeof(*ctx.followups.followups)); if (ctx.choices.choices[i].choice == EXHAUSTED) { free(ctx.choices.choices[i].followups.followups); ctx.choices.count--; goto mismatch; } ctx.choices.choices[i].choice++; ctx.position = ctx.choices.choices[i].position; ctx.previous_rejection = ctx.choices.choices[i].previous_rejection; rule = ctx.choices.choices[i].rule; sentence = ctx.choices.choices[i].sentence; #if PRINT_ACTIONS fprintf(stderr, "Mismatch: retrying from branching point %zu (%s, %s), " "and unit %zu, from %zu, now with choice %zu\n", i, sentence_type_string(sentence->type), rule, ctx.position, ctx.units.count, ctx.choices.choices[i].choice); #endif goto followup; #if PRINT_ACTIONS } else { fprintf(stderr, "Mismatch with %zu pending follow-ups and no choices, " "continuing polling follow-ups\n", ctx.followups.count); print_state(&ctx); #endif } } #if PRINT_ACTIONS fprintf(stderr, "Result determined: %s\n", match ? "match" : "mismatch"); fprintf(stderr, "Due to exception?: %s\n", ctx.exception ? "yes" : "no"); print_state(&ctx); #endif if (match) { if (!ctx.units.count) abort(); #if PRINT_ACTIONS fprintf(stderr, "Converting into tree\n"); #endif i = 0; ret = make_tree(&ctx, &i, &(struct libparser_unit *){NULL}); if (ret && IS_ANONYMOUS(ret->rule)) { #if PRINT_ACTIONS fprintf(stderr, "Embedding at top level:\n "); #endif embed_rule(&ret, NULL); } if (i != ctx.units.count) abort(); ctx.units.count = 0; ret = remove_epsilons(ret); #if PRINT_ACTIONS fprintf(stderr, "Match tree:\n"); print_tree(ret, 4); #endif } else if (ctx.choices.count) { #if PRINT_ACTIONS fprintf(stderr, "Result with mismatch but remaining choices, trying next alternative\n"); #endif ctx.exception = 0; goto mismatch; } out: for (i = 0; i < ctx.choices.count; i++) free(ctx.choices.choices[i].followups.followups); free(ctx.choices.choices); free(ctx.followups.followups); for (i = 0; i < ctx.units.count; i++) dealloc_unit(ctx.units.units[i]); free(ctx.units.units); while (ctx.unit_pool) { t = ctx.unit_pool; ctx.unit_pool = t->next; free(t); } if (match < 0) { dealloc_unit(ret); *rootp = NULL; return -1; } *rootp = ret; return !ctx.exception; }