aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <m@maandree.se>2026-01-05 13:50:38 +0100
committerMattias Andrée <m@maandree.se>2026-02-23 07:52:18 +0100
commit7dd4970daafdc1992e1dba486a1477a17cb4c53e (patch)
treee56e544ec55e68872397738e0e5064b000d31eba
parentOptimise detection of rejection (diff)
downloadlibparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.gz
libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.bz2
libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.xz
Add committed-operator
Signed-off-by: Mattias Andrée <m@maandree.se>
-rw-r--r--Makefile2
-rw-r--r--calc-example/calc.syntax8
-rw-r--r--libparser-generate.c15
-rw-r--r--libparser.718
-rw-r--r--libparser.c30
-rw-r--r--libparser.h3
-rw-r--r--print-syntax.c7
7 files changed, 66 insertions, 17 deletions
diff --git a/Makefile b/Makefile
index 543e021..a63b5bf 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ include mk/$(OS).mk
LIB_MAJOR = 1
-LIB_MINOR = 1
+LIB_MINOR = 2
LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR)
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax
index fb5a871..d76b68b 100644
--- a/calc-example/calc.syntax
+++ b/calc-example/calc.syntax
@@ -15,16 +15,16 @@ DIV = _, ("/" | "∕" | "÷"), _;
sign = ADD | SUB;
_digit = DIGIT | _WHITESPACE | "_" | "'";
-unsigned = DIGIT, {_digit}, !_digit;
+unsigned = DIGIT, +({_digit}, !_digit);
_number = unsigned | "(", _expr, (")" | -);
-number = _number, {_, _number}; (* optionally with implicit multiplication *)
+number = _number, +{_, _number}; (* optionally with implicit multiplication *)
value = [sign], number;
_expr = hyper1;
-hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
-hyper2 = _, value, {(MUL | DIV), (value | -)}, _;
+hyper1 = _, hyper2, {+(ADD | SUB), +(hyper2 | -)}, _;
+hyper2 = _, value, {+(MUL | DIV), +(value | -)}, _;
diff --git a/libparser-generate.c b/libparser-generate.c
index f72ed85..a8761f8 100644
--- a/libparser-generate.c
+++ b/libparser-generate.c
@@ -375,6 +375,7 @@ emit_and_free_sentence(struct node *node, size_t *indexp)
switch (node->token->s[0]) {
case '[': type = "OPTIONAL"; goto unary;
case '{': type = "REPEATED"; goto unary;
+ case '+': type = "COMMITTED"; goto unary;
case '!': type = "REJECTION"; unary:
emit_and_free_sentence(node->data, indexp);
printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
@@ -501,6 +502,7 @@ order_sentences(struct node *node)
case '[':
case '{':
case '!':
+ case '+':
/* Everything else we immediately put into the queue,
* but for brackets and unary operators, we simply
* use recursion to order inner sentences */
@@ -699,10 +701,11 @@ again:
* closing statements; and we still expect
* the next token to be an operand */
goto push_stack;
- } else if (tokens[i]->s[0] == '!') {
- /* Likewise for rejections (it is added to
- * the stack but it is an unary operator
- * so no matching symbol will be expected) */
+ } else if (tokens[i]->s[0] == '!' || tokens[i]->s[0] == '+') {
+ /* Likewise for rejections and commitments
+ * (it is added to the stack but they are
+ * unary operators so no matching symbol
+ * will be expected) */
goto push_stack;
} else if (tokens[i]->s[0] == '<') {
/* Likewise for value ranges, but we expect
@@ -741,8 +744,8 @@ again:
case EXPECT_OPERATOR:
/* When we get an binary operator, or the end
* of a sentence, we have to pop out all unary
- * operators (rejects) from the stack */
- while (stack->token->s[0] == '!') {
+ * operators from the stack */
+ while (stack->token->s[0] == '!' || stack->token->s[0] == '+') {
*stack->parent->head = stack;
stack->parent->head = &stack->next;
stack = stack->parent;
diff --git a/libparser.7 b/libparser.7
index 7157769..2d921a7 100644
--- a/libparser.7
+++ b/libparser.7
@@ -88,6 +88,7 @@ integer = _decimal | _hexadecimal; (* May not exceed 255. *)
_low = character | integer;
_high = character | integer;
+committed = \(dq+\(dq, _, _operand;
rejection = \(dq!\(dq, _, _operand;
concatenation = _operand, {_, \(dq,\(dq, _, _operand};
alternation = concatenation, {_, \(dq|\(dq, _, concatenation};
@@ -100,7 +101,7 @@ embedded-rule = identifier;
_literal = char-range | exception | string;
_group = optional | repeated | group | embedded-rule;
-_operand = _group | _literal | rejection;
+_operand = _group | _literal | rejection | committed;
_expression = alternation;
@@ -135,6 +136,15 @@ have no semantic meaning and are useful only to put a
alternation inside a concatenation without creating a
new rule for that.
.PP
+The parser is non-deterministic, which can make it really
+slow. The speed up parsing, you can add commits. Once the
+committed sentence has been matched, the branching-points
+inside it are unrecorded, so when the parser backtracks,
+it will not try different choices inside the committed
+sentence. Commit sentences are undone by the parser when
+it backtracks to a branching-point outside the committed
+sentence.
+.PP
In character ranges, the
.B _high
and
@@ -150,12 +160,14 @@ the rules will appear in the tree-formatted result.
.PP
Left recursion is illegal (it will cause stack
overflow at runtime as the empty condition before the
-recursion is always met).
+recursion is always met). Likewise, repeated optional
+sentences are illegal; a repeated sentence must always
+consume input, otherwise it gets stuck.
.SS Right-context-sensitive grammar
libparser originally used context-free grammar, but with
introduction of the rejection rule, specifically the ability
-to reject a rejection, it became a phrase for
+to reject a rejection, it became a parser for
right-context-sensitive grammar which is a grammar that is
that can generate any context-sensitive language, it is
however weakly equivalent to context-sensitive grammar.
diff --git a/libparser.c b/libparser.c
index eec9337..7ef7267 100644
--- a/libparser.c
+++ b/libparser.c
@@ -119,7 +119,7 @@ struct choice {
/**
* The follow-ups
*/
- struct followup *followups;
+ struct followup *followups; /* TODO add memory pool, maybe even use a memory stack */
/**
* The number of follow-ups; this is
@@ -266,6 +266,7 @@ sentence_type_string(enum libparser_sentence_type type)
case LIBPARSER_SENTENCE_TYPE_EXCEPTION: return "EXCEPTION";
case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF";
case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON";
+ case LIBPARSER_SENTENCE_TYPE_COMMITTED: return "COMMITTED";
default: return "<invalid>";
}
}
@@ -390,6 +391,13 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_COMMITTED:
+ fprintf(stderr, "+(");
+ indent = print_sentence(sentence->unary.sentence, indent + 2);
+ fprintf(stderr, ")");
+ indent += 1;
+ break;
+
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
fprintf(stderr, "[");
indent = print_sentence(sentence->unary.sentence, indent + 1);
@@ -882,6 +890,12 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont
ctx->previous_rejection = ctx->units.count - 1u;
return 1;
+ case LIBPARSER_SENTENCE_TYPE_COMMITTED:
+ if (incomplete(NULL, sentence, start, ctx) ||
+ push(NULL, sentence->unary.sentence, ctx))
+ return -1;
+ return 1;
+
case LIBPARSER_SENTENCE_TYPE_RULE:
for (i = 0; ctx->rules[i]; i++)
if (!strcmp(ctx->rules[i]->name, sentence->rule.rule))
@@ -993,6 +1007,7 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last)
case LIBPARSER_SENTENCE_TYPE_RULE:
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
+ case LIBPARSER_SENTENCE_TYPE_COMMITTED:
#if PRINT_ACTIONS
fprintf(stderr, "Unary unit: %s: %s [%zu, %zu)\n",
sentence_type_string(unit->sentence->type),
@@ -1164,7 +1179,7 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat
#endif
/* TODO guard against left-side recursion */
- /* TODO make branching opt-in ("?"), and add commit statements ("+") */
+ /* TODO make branching opt-in ("?") */
/* TODO break REPEATED on empty match */
followup:
#if PRINT_ACTIONS
@@ -1234,6 +1249,17 @@ followup:
}
if (!match)
goto mismatch;
+ if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED) {
+#if PRINT_ACTIONS
+ fprintf(stderr, "Removing interior choices for matched COMMITTED\n");
+#endif
+ j = ctx.followups.followups[i].unit_index;
+ while (ctx.choices.count &&
+ ctx.choices.choices[ctx.choices.count - 1u].unit_index > j) {
+ ctx.choices.next = --ctx.choices.count;
+ free(ctx.choices.choices[ctx.choices.count].followups.followups);
+ }
+ }
#if PRINT_ACTIONS
fprintf(stderr, "Setting end of popped unit to %zu\n", ctx.position);
#endif
diff --git a/libparser.h b/libparser.h
index 5f55b53..66bedea 100644
--- a/libparser.h
+++ b/libparser.h
@@ -20,7 +20,8 @@ enum libparser_sentence_type {
LIBPARSER_SENTENCE_TYPE_RULE, /* .rule */
LIBPARSER_SENTENCE_TYPE_EXCEPTION, /* (none) */
LIBPARSER_SENTENCE_TYPE_EOF, /* (none) */
- LIBPARSER_SENTENCE_TYPE_EPSILON /* (none) */
+ LIBPARSER_SENTENCE_TYPE_EPSILON, /* (none) */
+ LIBPARSER_SENTENCE_TYPE_COMMITTED /* .unary */
};
struct libparser_sentence_binary {
diff --git a/print-syntax.c b/print-syntax.c
index 1584e7b..3228401 100644
--- a/print-syntax.c
+++ b/print-syntax.c
@@ -42,6 +42,13 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_COMMITTED:
+ printf("+(");
+ indent = print_sentence(sentence->unary.sentence, indent + 2);
+ printf(")");
+ indent += 1;
+ break;
+
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
printf("[");
indent = print_sentence(sentence->unary.sentence, indent + 1);