diff options
| author | Mattias Andrée <m@maandree.se> | 2026-01-05 14:35:26 +0100 |
|---|---|---|
| committer | Mattias Andrée <m@maandree.se> | 2026-02-23 07:53:08 +0100 |
| commit | 208594ca9f95a87f60ff052490a4d5824dc23801 (patch) | |
| tree | 915a17b51f62b289ba4115214a057610416d755e | |
| parent | Add committed-operator (diff) | |
| download | libparser-208594ca9f95a87f60ff052490a4d5824dc23801.tar.gz libparser-208594ca9f95a87f60ff052490a4d5824dc23801.tar.bz2 libparser-208594ca9f95a87f60ff052490a4d5824dc23801.tar.xz | |
Make deterministic the default
Signed-off-by: Mattias Andrée <m@maandree.se>
| -rw-r--r-- | README | 37 | ||||
| -rw-r--r-- | calc-example/calc.syntax | 8 | ||||
| -rw-r--r-- | libparser-generate.c | 58 | ||||
| -rw-r--r-- | libparser.7 | 30 | ||||
| -rw-r--r-- | libparser.c | 33 | ||||
| -rw-r--r-- | libparser.h | 5 | ||||
| -rw-r--r-- | print-syntax.c | 15 |
7 files changed, 136 insertions, 50 deletions
@@ -76,11 +76,14 @@ EXTENDED DESCRIPTION _low = character | integer; _high = character | integer; + nondeterministic = "?"; + + committed = "+", _, _operand; rejection = "!", _, _operand; concatenation = _operand, {_, ",", _, _operand}; - alternation = concatenation, {_, "|", _, concatenation}; - optional = "[", _, _expression, _, "]"; - repeated = "{", _, _expression, _, "}"; + alternation = concatenation, {_, [nondeterministic], "|", _, concatenation}; + optional = [nondeterministic], "[", _, _expression, _, "]"; + repeated = [nondeterministic], "{", _, _expression, _, "}"; group = "(", _, _expression, _, ")"; char-range = "<", _, _low, _, ",", _, _high, "_", ">"; exception = "-"; @@ -88,7 +91,7 @@ EXTENDED DESCRIPTION _literal = char-range | exception | string; _group = optional | repeated | group | embedded-rule; - _operand = _group | _literal | rejection; + _operand = _group | _literal | rejection | committed; _expression = alternation; @@ -109,12 +112,24 @@ EXTENDED DESCRIPTION reached, the parser will terminate there. Repeated symbols may occur any number of times, including - zero. The compiler is able to backtrack if it takes too much. + zero. The parser will try to take as much as as possible. + + Optional symbols are taken whenever possible. + + Concatenation has higher precedence than alternation, groups + ("(", ..., ")") have no semantic meaning and are useful only + for including alternations inside concatenations or put + alternations or concatenations inside a commitment or + rejection. - Concatenation has higher precedence than alternation, - groups ("(", ..., ")") have no semantic meaning and are useful - only to put a alternation inside a concatenation without - creating a new rule for that. + The parser has the ability to be non-deterministic, which + can make it really slow. The speed up parsing, you can add + commits. Once the committed sentence has been matched, the + branching-points inside it are unrecorded, so when the parser + backtracks, it will not try different choices inside the + committed sentence. Commit sentences are undone by the parser + when it backtracks to a branching-point outside the committed + sentence. In character ranges, the _high and _low values must be at least 0 and at most 255, and _high must be greater than _low. @@ -125,7 +140,9 @@ EXTENDED DESCRIPTION Left recursion is illegal (it will cause stack overflow at runtime as the empty condition before the recursion is always - met). + met). Likewise, repeated optional sentences are illegal; a + repeated sentence must always consume input, otherwise it + gets stuck. Right-context-sensitive grammar libparser originally used context-free grammar, but with diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax index d76b68b..fb5a871 100644 --- a/calc-example/calc.syntax +++ b/calc-example/calc.syntax @@ -15,16 +15,16 @@ DIV = _, ("/" | "∕" | "÷"), _; sign = ADD | SUB; _digit = DIGIT | _WHITESPACE | "_" | "'"; -unsigned = DIGIT, +({_digit}, !_digit); +unsigned = DIGIT, {_digit}, !_digit; _number = unsigned | "(", _expr, (")" | -); -number = _number, +{_, _number}; (* optionally with implicit multiplication *) +number = _number, {_, _number}; (* optionally with implicit multiplication *) value = [sign], number; _expr = hyper1; -hyper1 = _, hyper2, {+(ADD | SUB), +(hyper2 | -)}, _; -hyper2 = _, value, {+(MUL | DIV), +(value | -)}, _; +hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _; +hyper2 = _, value, {(MUL | DIV), (value | -)}, _; diff --git a/libparser-generate.c b/libparser-generate.c index a8761f8..0a91835 100644 --- a/libparser-generate.c +++ b/libparser-generate.c @@ -38,6 +38,7 @@ struct node { struct node *next; /* next element in list */ struct node *data; /* beginning of subsentence */ struct node **head; /* end of subsentence */ + int nondeterministic; }; @@ -65,15 +66,6 @@ emalloc(size_t n) } static void * -ecalloc(size_t n, size_t m) -{ - void *ret = calloc(n, m); - if (!ret) - eprintf("%s: calloc %zu %zu: %s\n", argv0, n, m, strerror(errno)); - return ret; -} - -static void * erealloc(void *ptr, size_t n) { void *ret = realloc(ptr, n); @@ -104,6 +96,20 @@ estrdup(char *s) } +static struct node * +new_node(void) +{ + struct node *ret = emalloc(sizeof(*ret)); + ret->token = NULL; + ret->parent = NULL; + ret->next = NULL; + ret->data = NULL; + ret->head = NULL; + ret->nondeterministic = 0; + return ret; +} + + static int strpcmp(const void *av, const void *bv) { @@ -120,6 +126,7 @@ isidentifier(char c) } + static int check_utf8(char *buf, size_t *ip, size_t len) { @@ -379,10 +386,11 @@ emit_and_free_sentence(struct node *node, size_t *indexp) case '!': type = "REJECTION"; unary: emit_and_free_sentence(node->data, indexp); printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {" - ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu" + ".type = LIBPARSER_SENTENCE_TYPE_%s%s, .sentence = &sentence_%zu_%zu" "}};\n", nrule_names, index, - type, nrule_names, index + 1u); + node->nondeterministic ? "ND_" : "", type, + nrule_names, index + 1u); break; case '<': @@ -412,11 +420,11 @@ emit_and_free_sentence(struct node *node, size_t *indexp) left = *indexp; emit_and_free_sentence(node->data, indexp); printf("static union libparser_sentence sentence_%zu_%zu = {.binary = {" - ".type = LIBPARSER_SENTENCE_TYPE_%s, " + ".type = LIBPARSER_SENTENCE_TYPE_%s%s, " ".left = &sentence_%zu_%zu, .right = &sentence_%zu_%zu" "}};\n", nrule_names, index, - type, + node->nondeterministic ? "ND_" : "", type, nrule_names, left, nrule_names, right); break; @@ -587,7 +595,7 @@ main(int argc, char *argv[]) struct node *stack = NULL, *parent_node, *node; char *data; struct token **tokens; - size_t i, j; + size_t i, j, nondeterministic; int cmp, err, val; if (argc) { @@ -613,6 +621,7 @@ main(int argc, char *argv[]) printf("#include <libparser.h>\n"); + nondeterministic = SIZE_MAX; i = 0; again: for (; tokens[i]; i++) { @@ -635,7 +644,7 @@ again: } /* Also remove any whitespace (the tokeniser - * simple and does not recognise mulltisymbol + * simple and does not recognise multisymbol * tokens (that is apart form strings and * identifiers) so it cannot ignore whitespace. */ if (isspace(tokens[i]->s[0])) { @@ -643,6 +652,16 @@ again: continue; } + /* Check if next token is non-deterministic */ + if (tokens[i]->s[0] == '?') { + if (tokens[i + 1u]->s[0] == '[' || + tokens[i + 1u]->s[0] == '{' || + tokens[i + 1u]->s[0] == '|') { + nondeterministic = i + 1u; + continue; + } + } + /* For the sake of code readability, identify * the token type */ if (tokens[i]->s[0] == '"') { @@ -661,7 +680,8 @@ again: eprintf("%s: expected an identifier on line %zu at column %zu (character %zu)\n", argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character); } - stack = calloc(1, sizeof(*stack)); + stack = new_node(); + stack->nondeterministic = i == nondeterministic; stack->token = tokens[i]; stack->head = &stack->data; /* and then we expect an equals sign */ @@ -715,7 +735,8 @@ again: state = EXPECT_RANGE_LOW; push_stack: parent_node = stack; - stack = ecalloc(1, sizeof(*stack)); + stack = new_node(); + stack->nondeterministic = i == nondeterministic; stack->parent = parent_node; stack->token = tokens[i]; stack->head = &stack->data; @@ -756,7 +777,8 @@ again: * token to be an operand */ state = EXPECT_OPERAND; add_singleton: - node = calloc(1u, sizeof(*node)); + node = new_node(); + node->nondeterministic = i == nondeterministic; node->token = tokens[i]; *stack->head = node; stack->head = &node->next; diff --git a/libparser.7 b/libparser.7 index 2d921a7..c50b5c9 100644 --- a/libparser.7 +++ b/libparser.7 @@ -88,12 +88,14 @@ integer = _decimal | _hexadecimal; (* May not exceed 255. *) _low = character | integer; _high = character | integer; +nondeterministic = \(dq?\(dq; + committed = \(dq+\(dq, _, _operand; rejection = \(dq!\(dq, _, _operand; concatenation = _operand, {_, \(dq,\(dq, _, _operand}; -alternation = concatenation, {_, \(dq|\(dq, _, concatenation}; -optional = \(dq[\(dq, _, _expression, _, \(dq]\(dq; -repeated = \(dq{\(dq, _, _expression, _, \(dq}\(dq; +alternation = concatenation, {_, [nondeterministic], \(dq|\(dq, _, concatenation}; +optional = [nondeterministic], \(dq[\(dq, _, _expression, _, \(dq]\(dq; +repeated = [nondeterministic], \(dq{\(dq, _, _expression, _, \(dq}\(dq; group = \(dq(\(dq, _, _expression, _, \(dq)\(dq; char-range = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq; exception = \(dq-\(dq; @@ -132,18 +134,18 @@ Optional symbols are taken whenever possible. Concatenation has higher precedence than alternation, groups .RB (\(dq ( "\(dq, ..., \(dq" ) \(dq) -have no semantic meaning and are useful only to put a -alternation inside a concatenation without creating a -new rule for that. +have no semantic meaning and are useful only for including +alternations inside concatenations or put alternations +or concatenations inside a commitment or rejection. .PP -The parser is non-deterministic, which can make it really -slow. The speed up parsing, you can add commits. Once the -committed sentence has been matched, the branching-points -inside it are unrecorded, so when the parser backtracks, -it will not try different choices inside the committed -sentence. Commit sentences are undone by the parser when -it backtracks to a branching-point outside the committed -sentence. +The parser has the ability to be non-deterministic, which +can make it really slow. The speed up parsing, you can add +commits. Once the committed sentence has been matched, +the branching-points inside it are unrecorded, so when +the parser backtracks, it will not try different choices +inside the committed sentence. Commit sentences are undone +by the parser when it backtracks to a branching-point +outside the committed sentence. .PP In character ranges, the .B _high diff --git a/libparser.c b/libparser.c index 7ef7267..5ecb7ee 100644 --- a/libparser.c +++ b/libparser.c @@ -267,6 +267,9 @@ sentence_type_string(enum libparser_sentence_type type) case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF"; case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON"; case LIBPARSER_SENTENCE_TYPE_COMMITTED: return "COMMITTED"; + case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: return "ND_OPTIONAL"; + case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: return "ND_REPEATED"; + case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: return "ND_ALTERNATION"; default: return "<invalid>"; } } @@ -375,6 +378,15 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: + fprintf(stderr, "("); + print_sentence(sentence->binary.left, indent + 1); + fprintf(stderr, " ?| \n%*.s", indent + 1, ""); + indent = print_sentence(sentence->binary.right, indent + 1); + fprintf(stderr, ")"); + indent += 1; + break; + case LIBPARSER_SENTENCE_TYPE_ALTERNATION: fprintf(stderr, "("); print_sentence(sentence->binary.left, indent + 1); @@ -398,6 +410,9 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: + fprintf(stderr, "?"); + /* fall through */ case LIBPARSER_SENTENCE_TYPE_OPTIONAL: fprintf(stderr, "["); indent = print_sentence(sentence->unary.sentence, indent + 1); @@ -405,6 +420,9 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: + fprintf(stderr, "?"); + /* fall through */ case LIBPARSER_SENTENCE_TYPE_REPEATED: fprintf(stderr, "{"); indent = print_sentence(sentence->unary.sentence, indent + 1); @@ -827,6 +845,7 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont #endif switch (sentence->type) { + case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: case LIBPARSER_SENTENCE_TYPE_ALTERNATION: if (braching(rule, sentence, start, ctx, &choice)) return -1; @@ -837,6 +856,7 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont return -1; return 1; + case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: case LIBPARSER_SENTENCE_TYPE_REPEATED: if (braching(rule, sentence, start, ctx, &choice)) return -1; @@ -860,6 +880,7 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont } return 1; + case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: case LIBPARSER_SENTENCE_TYPE_OPTIONAL: if (braching(rule, sentence, start, ctx, &choice)) return -1; @@ -1005,7 +1026,9 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last) switch (unit->sentence->type) { case LIBPARSER_SENTENCE_TYPE_RULE: + case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: case LIBPARSER_SENTENCE_TYPE_OPTIONAL: + case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: case LIBPARSER_SENTENCE_TYPE_ALTERNATION: case LIBPARSER_SENTENCE_TYPE_COMMITTED: #if PRINT_ACTIONS @@ -1033,6 +1056,7 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last) *last = unit; goto out; + case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: case LIBPARSER_SENTENCE_TYPE_REPEATED: case LIBPARSER_SENTENCE_TYPE_CONCATENATION: #if PRINT_ACTIONS @@ -1179,7 +1203,6 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat #endif /* TODO guard against left-side recursion */ - /* TODO make branching opt-in ("?") */ /* TODO break REPEATED on empty match */ followup: #if PRINT_ACTIONS @@ -1249,9 +1272,13 @@ followup: } if (!match) goto mismatch; - if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED) { + if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED || + sentence->type == LIBPARSER_SENTENCE_TYPE_ALTERNATION || + sentence->type == LIBPARSER_SENTENCE_TYPE_OPTIONAL || + sentence->type == LIBPARSER_SENTENCE_TYPE_REPEATED) { #if PRINT_ACTIONS - fprintf(stderr, "Removing interior choices for matched COMMITTED\n"); + fprintf(stderr, "Removing interior choices for matched %s\n", + sentence_type_string(sentence->type)); #endif j = ctx.followups.followups[i].unit_index; while (ctx.choices.count && diff --git a/libparser.h b/libparser.h index 66bedea..54e3829 100644 --- a/libparser.h +++ b/libparser.h @@ -21,7 +21,10 @@ enum libparser_sentence_type { LIBPARSER_SENTENCE_TYPE_EXCEPTION, /* (none) */ LIBPARSER_SENTENCE_TYPE_EOF, /* (none) */ LIBPARSER_SENTENCE_TYPE_EPSILON, /* (none) */ - LIBPARSER_SENTENCE_TYPE_COMMITTED /* .unary */ + LIBPARSER_SENTENCE_TYPE_COMMITTED, /* .unary */ + LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL, /* .unary */ + LIBPARSER_SENTENCE_TYPE_ND_REPEATED, /* .unary */ + LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION /* .binary */ }; struct libparser_sentence_binary { diff --git a/print-syntax.c b/print-syntax.c index 3228401..07b3858 100644 --- a/print-syntax.c +++ b/print-syntax.c @@ -26,6 +26,15 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: + printf("("); + print_sentence(sentence->binary.left, indent + 1); + printf(" ?| \n%*.s", indent + 1, ""); + indent = print_sentence(sentence->binary.right, indent + 1); + printf(")"); + indent += 1; + break; + case LIBPARSER_SENTENCE_TYPE_ALTERNATION: printf("("); print_sentence(sentence->binary.left, indent + 1); @@ -49,6 +58,9 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: + printf("?"); + /* fall through */ case LIBPARSER_SENTENCE_TYPE_OPTIONAL: printf("["); indent = print_sentence(sentence->unary.sentence, indent + 1); @@ -56,6 +68,9 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: + printf("?"); + /* fall through */ case LIBPARSER_SENTENCE_TYPE_REPEATED: printf("{"); indent = print_sentence(sentence->unary.sentence, indent + 1); |
