Add committed-operator

Signed-off-by: Mattias Andrée <m@maandree.se>
author: Mattias Andrée <m@maandree.se> 2026-01-05 13:50:38 +0100
committer: Mattias Andrée <m@maandree.se> 2026-02-23 07:52:18 +0100
commit: 7dd4970daafdc1992e1dba486a1477a17cb4c53e (patch)
tree: e56e544ec55e68872397738e0e5064b000d31eba
parent: Optimise detection of rejection (diff)
download: libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.gz
libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.bz2
libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.xz
7 files changed, 66 insertions, 17 deletions
diff --git a/Makefile b/Makefile
index 543e021..a63b5bf 100644
--- a/Makefile
+++ b/Makefile
@@ -11,7 +11,7 @@ include mk/$(OS).mk
 
 
 LIB_MAJOR = 1
-LIB_MINOR = 1
+LIB_MINOR = 2
 LIB_VERSION = $(LIB_MAJOR).$(LIB_MINOR)
 
 
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax
index fb5a871..d76b68b 100644
--- a/calc-example/calc.syntax
+++ b/calc-example/calc.syntax
@@ -15,16 +15,16 @@ DIV         = _, ("/" | "∕" | "÷"), _;
 sign        = ADD | SUB;
 
 _digit      = DIGIT | _WHITESPACE | "_" | "'";
-unsigned    = DIGIT, {_digit}, !_digit;
+unsigned    = DIGIT, +({_digit}, !_digit);
 
 _number     = unsigned | "(", _expr, (")" | -);
 
-number      = _number, {_, _number}; (* optionally with implicit multiplication *)
+number      = _number, +{_, _number}; (* optionally with implicit multiplication *)
 
 value       = [sign], number;
 
 _expr       = hyper1;
 
 
-hyper1      = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
-hyper2      = _, value, {(MUL | DIV), (value | -)}, _;
+hyper1      = _, hyper2, {+(ADD | SUB), +(hyper2 | -)}, _;
+hyper2      = _, value, {+(MUL | DIV), +(value | -)}, _;
diff --git a/libparser-generate.c b/libparser-generate.c
index f72ed85..a8761f8 100644
--- a/libparser-generate.c
+++ b/libparser-generate.c
@@ -375,6 +375,7 @@ emit_and_free_sentence(struct node *node, size_t *indexp)
 	switch (node->token->s[0]) {
 	case '[': type = "OPTIONAL"; goto unary;
 	case '{': type = "REPEATED"; goto unary;
+	case '+': type = "COMMITTED"; goto unary;
 	case '!': type = "REJECTION"; unary:
 		emit_and_free_sentence(node->data, indexp);
 		printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
@@ -501,6 +502,7 @@ order_sentences(struct node *node)
 		case '[':
 		case '{':
 		case '!':
+		case '+':
 			/* Everything else we immediately put into the queue,
 			 * but for brackets and unary operators, we simply
 			 * use recursion to order inner sentences */
@@ -699,10 +701,11 @@ again:
 					 * closing statements; and we still expect
 					 * the next token to be an operand */
 					goto push_stack;
-				} else if (tokens[i]->s[0] == '!') {
-					/* Likewise for rejections (it is added to
-					 * the stack but it is an unary operator
-					 * so no matching symbol will be expected) */
+				} else if (tokens[i]->s[0] == '!' || tokens[i]->s[0] == '+') {
+					/* Likewise for rejections and commitments
+				         * (it is added to the stack but they are
+					 * unary operators so no matching symbol
+					 * will be expected) */
 					goto push_stack;
 				} else if (tokens[i]->s[0] == '<') {
 					/* Likewise for value ranges, but we expect
@@ -741,8 +744,8 @@ again:
 		case EXPECT_OPERATOR:
 			/* When we get an binary operator, or the end
 			 * of a sentence, we have to pop out all unary
-			 * operators (rejects) from the stack */
-			while (stack->token->s[0] == '!') {
+			 * operators from the stack */
+			while (stack->token->s[0] == '!' || stack->token->s[0] == '+') {
 				*stack->parent->head = stack;
 				stack->parent->head = &stack->next;
 				stack = stack->parent;
diff --git a/libparser.7 b/libparser.7
index 7157769..2d921a7 100644
--- a/libparser.7
+++ b/libparser.7
@@ -88,6 +88,7 @@ integer          = _decimal | _hexadecimal; (* May not exceed 255. *)
 _low             = character | integer;
 _high            = character | integer;
 
+committed        = \(dq+\(dq, _, _operand;
 rejection        = \(dq!\(dq, _, _operand;
 concatenation    = _operand, {_, \(dq,\(dq, _, _operand};
 alternation      = concatenation, {_, \(dq|\(dq, _, concatenation};
@@ -100,7 +101,7 @@ embedded-rule    = identifier;
 
 _literal         = char-range | exception | string;
 _group           = optional | repeated | group | embedded-rule;
-_operand         = _group | _literal | rejection;
+_operand         = _group | _literal | rejection | committed;
 
 _expression      = alternation;
 
@@ -135,6 +136,15 @@ have no semantic meaning and are useful only to put a
 alternation inside a concatenation without creating a
 new rule for that.
 .PP
+The parser is non-deterministic, which can make it really
+slow. The speed up parsing, you can add commits. Once the
+committed sentence has been matched, the branching-points
+inside it are unrecorded, so when the parser backtracks,
+it will not try different choices inside the committed
+sentence. Commit sentences are undone by the parser when
+it backtracks to a branching-point outside the committed
+sentence.
+.PP
 In character ranges, the
 .B _high
 and
@@ -150,12 +160,14 @@ the rules will appear in the tree-formatted result.
 .PP
 Left recursion is illegal (it will cause stack
 overflow at runtime as the empty condition before the
-recursion is always met).
+recursion is always met). Likewise, repeated optional
+sentences are illegal; a repeated sentence must always
+consume input, otherwise it gets stuck.
 
 .SS Right-context-sensitive grammar
 libparser originally used context-free grammar, but with
 introduction of the rejection rule, specifically the ability
-to reject a rejection, it became a phrase for
+to reject a rejection, it became a parser for
 right-context-sensitive grammar which is a grammar that is
 that can generate any context-sensitive language, it is
 however weakly equivalent to context-sensitive grammar.
diff --git a/libparser.c b/libparser.c
index eec9337..7ef7267 100644
--- a/libparser.c
+++ b/libparser.c
@@ -119,7 +119,7 @@ struct choice {
 		/**
 		 * The follow-ups
 		 */
-		struct followup *followups;
+		struct followup *followups; /* TODO add memory pool, maybe even use a memory stack */
 
 		/**
 		 * The number of follow-ups; this is
@@ -266,6 +266,7 @@ sentence_type_string(enum libparser_sentence_type type)
 	case LIBPARSER_SENTENCE_TYPE_EXCEPTION: return "EXCEPTION";
 	case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF";
 	case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON";
+	case LIBPARSER_SENTENCE_TYPE_COMMITTED: return "COMMITTED";
 	default: return "<invalid>";
 	}
 }
@@ -390,6 +391,13 @@ print_sentence(const union libparser_sentence *sentence, int indent)
 		indent += 1;
 		break;
 
+	case LIBPARSER_SENTENCE_TYPE_COMMITTED:
+		fprintf(stderr, "+(");
+		indent = print_sentence(sentence->unary.sentence, indent + 2);
+		fprintf(stderr, ")");
+		indent += 1;
+		break;
+
 	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
 		fprintf(stderr, "[");
 		indent = print_sentence(sentence->unary.sentence, indent + 1);
@@ -882,6 +890,12 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont
 		ctx->previous_rejection = ctx->units.count - 1u;
 		return 1;
 
+	case LIBPARSER_SENTENCE_TYPE_COMMITTED:
+		if (incomplete(NULL, sentence, start, ctx) ||
+		    push(NULL, sentence->unary.sentence, ctx))
+			return -1;
+		return 1;
+
 	case LIBPARSER_SENTENCE_TYPE_RULE:
 		for (i = 0; ctx->rules[i]; i++)
 			if (!strcmp(ctx->rules[i]->name, sentence->rule.rule))
@@ -993,6 +1007,7 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last)
 	case LIBPARSER_SENTENCE_TYPE_RULE:
 	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
 	case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
+	case LIBPARSER_SENTENCE_TYPE_COMMITTED:
 #if PRINT_ACTIONS
 		fprintf(stderr, "Unary unit: %s: %s [%zu, %zu)\n",
 		        sentence_type_string(unit->sentence->type),
@@ -1164,7 +1179,7 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat
 #endif
 
 	/* TODO guard against left-side recursion */
-	/* TODO make branching opt-in ("?"), and add commit statements ("+") */
+	/* TODO make branching opt-in ("?") */
 	/* TODO break REPEATED on empty match */
 followup:
 #if PRINT_ACTIONS
@@ -1234,6 +1249,17 @@ followup:
 			}
 			if (!match) 
 				goto mismatch;
+			if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED) {
+#if PRINT_ACTIONS
+				fprintf(stderr, "Removing interior choices for matched COMMITTED\n");
+#endif
+				j = ctx.followups.followups[i].unit_index;
+				while (ctx.choices.count &&
+				       ctx.choices.choices[ctx.choices.count - 1u].unit_index > j) {
+					ctx.choices.next = --ctx.choices.count;
+					free(ctx.choices.choices[ctx.choices.count].followups.followups);
+				}
+			}
 #if PRINT_ACTIONS
 			fprintf(stderr, "Setting end of popped unit to %zu\n", ctx.position);
 #endif
diff --git a/libparser.h b/libparser.h
index 5f55b53..66bedea 100644
--- a/libparser.h
+++ b/libparser.h
@@ -20,7 +20,8 @@ enum libparser_sentence_type {
 	LIBPARSER_SENTENCE_TYPE_RULE,          /* .rule */
 	LIBPARSER_SENTENCE_TYPE_EXCEPTION,     /* (none) */
 	LIBPARSER_SENTENCE_TYPE_EOF,           /* (none) */
-	LIBPARSER_SENTENCE_TYPE_EPSILON        /* (none) */
+	LIBPARSER_SENTENCE_TYPE_EPSILON,       /* (none) */
+	LIBPARSER_SENTENCE_TYPE_COMMITTED      /* .unary */
 };
 
 struct libparser_sentence_binary {
diff --git a/print-syntax.c b/print-syntax.c
index 1584e7b..3228401 100644
--- a/print-syntax.c
+++ b/print-syntax.c
@@ -42,6 +42,13 @@ print_sentence(const union libparser_sentence *sentence, int indent)
 		indent += 1;
 		break;
 
+	case LIBPARSER_SENTENCE_TYPE_COMMITTED:
+		printf("+(");
+		indent = print_sentence(sentence->unary.sentence, indent + 2);
+		printf(")");
+		indent += 1;
+		break;
+
 	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
 		printf("[");
 		indent = print_sentence(sentence->unary.sentence, indent + 1);
author	Mattias Andrée <m@maandree.se>	2026-01-05 13:50:38 +0100
committer	Mattias Andrée <m@maandree.se>	2026-02-23 07:52:18 +0100
commit	7dd4970daafdc1992e1dba486a1477a17cb4c53e (patch)
tree	e56e544ec55e68872397738e0e5064b000d31eba
parent	Optimise detection of rejection (diff)
download	libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.gz libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.bz2 libparser-7dd4970daafdc1992e1dba486a1477a17cb4c53e.tar.xz