From 7dd4970daafdc1992e1dba486a1477a17cb4c53e Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Mon, 5 Jan 2026 13:50:38 +0100 Subject: Add committed-operator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- Makefile | 2 +- calc-example/calc.syntax | 8 ++++---- libparser-generate.c | 15 +++++++++------ libparser.7 | 18 +++++++++++++++--- libparser.c | 30 ++++++++++++++++++++++++++++-- libparser.h | 3 ++- print-syntax.c | 7 +++++++ 7 files changed, 66 insertions(+), 17 deletions(-) diff --git a/Makefile b/Makefile index 543e021..a63b5bf 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ include mk/$(OS).mk LIB_MAJOR = 1 -LIB_MINOR = 1 +LIB_MINOR = 2 LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR) diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax index fb5a871..d76b68b 100644 --- a/calc-example/calc.syntax +++ b/calc-example/calc.syntax @@ -15,16 +15,16 @@ DIV = _, ("/" | "∕" | "÷"), _; sign = ADD | SUB; _digit = DIGIT | _WHITESPACE | "_" | "'"; -unsigned = DIGIT, {_digit}, !_digit; +unsigned = DIGIT, +({_digit}, !_digit); _number = unsigned | "(", _expr, (")" | -); -number = _number, {_, _number}; (* optionally with implicit multiplication *) +number = _number, +{_, _number}; (* optionally with implicit multiplication *) value = [sign], number; _expr = hyper1; -hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _; -hyper2 = _, value, {(MUL | DIV), (value | -)}, _; +hyper1 = _, hyper2, {+(ADD | SUB), +(hyper2 | -)}, _; +hyper2 = _, value, {+(MUL | DIV), +(value | -)}, _; diff --git a/libparser-generate.c b/libparser-generate.c index f72ed85..a8761f8 100644 --- a/libparser-generate.c +++ b/libparser-generate.c @@ -375,6 +375,7 @@ emit_and_free_sentence(struct node *node, size_t *indexp) switch (node->token->s[0]) { case '[': type = "OPTIONAL"; goto unary; case '{': type = "REPEATED"; goto unary; + case '+': type = "COMMITTED"; goto unary; case '!': type = "REJECTION"; unary: emit_and_free_sentence(node->data, indexp); printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {" @@ -501,6 +502,7 @@ order_sentences(struct node *node) case '[': case '{': case '!': + case '+': /* Everything else we immediately put into the queue, * but for brackets and unary operators, we simply * use recursion to order inner sentences */ @@ -699,10 +701,11 @@ again: * closing statements; and we still expect * the next token to be an operand */ goto push_stack; - } else if (tokens[i]->s[0] == '!') { - /* Likewise for rejections (it is added to - * the stack but it is an unary operator - * so no matching symbol will be expected) */ + } else if (tokens[i]->s[0] == '!' || tokens[i]->s[0] == '+') { + /* Likewise for rejections and commitments + * (it is added to the stack but they are + * unary operators so no matching symbol + * will be expected) */ goto push_stack; } else if (tokens[i]->s[0] == '<') { /* Likewise for value ranges, but we expect @@ -741,8 +744,8 @@ again: case EXPECT_OPERATOR: /* When we get an binary operator, or the end * of a sentence, we have to pop out all unary - * operators (rejects) from the stack */ - while (stack->token->s[0] == '!') { + * operators from the stack */ + while (stack->token->s[0] == '!' || stack->token->s[0] == '+') { *stack->parent->head = stack; stack->parent->head = &stack->next; stack = stack->parent; diff --git a/libparser.7 b/libparser.7 index 7157769..2d921a7 100644 --- a/libparser.7 +++ b/libparser.7 @@ -88,6 +88,7 @@ integer = _decimal | _hexadecimal; (* May not exceed 255. *) _low = character | integer; _high = character | integer; +committed = \(dq+\(dq, _, _operand; rejection = \(dq!\(dq, _, _operand; concatenation = _operand, {_, \(dq,\(dq, _, _operand}; alternation = concatenation, {_, \(dq|\(dq, _, concatenation}; @@ -100,7 +101,7 @@ embedded-rule = identifier; _literal = char-range | exception | string; _group = optional | repeated | group | embedded-rule; -_operand = _group | _literal | rejection; +_operand = _group | _literal | rejection | committed; _expression = alternation; @@ -135,6 +136,15 @@ have no semantic meaning and are useful only to put a alternation inside a concatenation without creating a new rule for that. .PP +The parser is non-deterministic, which can make it really +slow. The speed up parsing, you can add commits. Once the +committed sentence has been matched, the branching-points +inside it are unrecorded, so when the parser backtracks, +it will not try different choices inside the committed +sentence. Commit sentences are undone by the parser when +it backtracks to a branching-point outside the committed +sentence. +.PP In character ranges, the .B _high and @@ -150,12 +160,14 @@ the rules will appear in the tree-formatted result. .PP Left recursion is illegal (it will cause stack overflow at runtime as the empty condition before the -recursion is always met). +recursion is always met). Likewise, repeated optional +sentences are illegal; a repeated sentence must always +consume input, otherwise it gets stuck. .SS Right-context-sensitive grammar libparser originally used context-free grammar, but with introduction of the rejection rule, specifically the ability -to reject a rejection, it became a phrase for +to reject a rejection, it became a parser for right-context-sensitive grammar which is a grammar that is that can generate any context-sensitive language, it is however weakly equivalent to context-sensitive grammar. diff --git a/libparser.c b/libparser.c index eec9337..7ef7267 100644 --- a/libparser.c +++ b/libparser.c @@ -119,7 +119,7 @@ struct choice { /** * The follow-ups */ - struct followup *followups; + struct followup *followups; /* TODO add memory pool, maybe even use a memory stack */ /** * The number of follow-ups; this is @@ -266,6 +266,7 @@ sentence_type_string(enum libparser_sentence_type type) case LIBPARSER_SENTENCE_TYPE_EXCEPTION: return "EXCEPTION"; case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF"; case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON"; + case LIBPARSER_SENTENCE_TYPE_COMMITTED: return "COMMITTED"; default: return ""; } } @@ -390,6 +391,13 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_COMMITTED: + fprintf(stderr, "+("); + indent = print_sentence(sentence->unary.sentence, indent + 2); + fprintf(stderr, ")"); + indent += 1; + break; + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: fprintf(stderr, "["); indent = print_sentence(sentence->unary.sentence, indent + 1); @@ -882,6 +890,12 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont ctx->previous_rejection = ctx->units.count - 1u; return 1; + case LIBPARSER_SENTENCE_TYPE_COMMITTED: + if (incomplete(NULL, sentence, start, ctx) || + push(NULL, sentence->unary.sentence, ctx)) + return -1; + return 1; + case LIBPARSER_SENTENCE_TYPE_RULE: for (i = 0; ctx->rules[i]; i++) if (!strcmp(ctx->rules[i]->name, sentence->rule.rule)) @@ -993,6 +1007,7 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last) case LIBPARSER_SENTENCE_TYPE_RULE: case LIBPARSER_SENTENCE_TYPE_OPTIONAL: case LIBPARSER_SENTENCE_TYPE_ALTERNATION: + case LIBPARSER_SENTENCE_TYPE_COMMITTED: #if PRINT_ACTIONS fprintf(stderr, "Unary unit: %s: %s [%zu, %zu)\n", sentence_type_string(unit->sentence->type), @@ -1164,7 +1179,7 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat #endif /* TODO guard against left-side recursion */ - /* TODO make branching opt-in ("?"), and add commit statements ("+") */ + /* TODO make branching opt-in ("?") */ /* TODO break REPEATED on empty match */ followup: #if PRINT_ACTIONS @@ -1234,6 +1249,17 @@ followup: } if (!match) goto mismatch; + if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED) { +#if PRINT_ACTIONS + fprintf(stderr, "Removing interior choices for matched COMMITTED\n"); +#endif + j = ctx.followups.followups[i].unit_index; + while (ctx.choices.count && + ctx.choices.choices[ctx.choices.count - 1u].unit_index > j) { + ctx.choices.next = --ctx.choices.count; + free(ctx.choices.choices[ctx.choices.count].followups.followups); + } + } #if PRINT_ACTIONS fprintf(stderr, "Setting end of popped unit to %zu\n", ctx.position); #endif diff --git a/libparser.h b/libparser.h index 5f55b53..66bedea 100644 --- a/libparser.h +++ b/libparser.h @@ -20,7 +20,8 @@ enum libparser_sentence_type { LIBPARSER_SENTENCE_TYPE_RULE, /* .rule */ LIBPARSER_SENTENCE_TYPE_EXCEPTION, /* (none) */ LIBPARSER_SENTENCE_TYPE_EOF, /* (none) */ - LIBPARSER_SENTENCE_TYPE_EPSILON /* (none) */ + LIBPARSER_SENTENCE_TYPE_EPSILON, /* (none) */ + LIBPARSER_SENTENCE_TYPE_COMMITTED /* .unary */ }; struct libparser_sentence_binary { diff --git a/print-syntax.c b/print-syntax.c index 1584e7b..3228401 100644 --- a/print-syntax.c +++ b/print-syntax.c @@ -42,6 +42,13 @@ print_sentence(const union libparser_sentence *sentence, int indent) indent += 1; break; + case LIBPARSER_SENTENCE_TYPE_COMMITTED: + printf("+("); + indent = print_sentence(sentence->unary.sentence, indent + 2); + printf(")"); + indent += 1; + break; + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: printf("["); indent = print_sentence(sentence->unary.sentence, indent + 1); -- cgit v1.2.3-70-g09d2