aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--README37
-rw-r--r--calc-example/calc.syntax8
-rw-r--r--libparser-generate.c58
-rw-r--r--libparser.730
-rw-r--r--libparser.c33
-rw-r--r--libparser.h5
-rw-r--r--print-syntax.c15
7 files changed, 136 insertions, 50 deletions
diff --git a/README b/README
index 37b2701..23d231b 100644
--- a/README
+++ b/README
@@ -76,11 +76,14 @@ EXTENDED DESCRIPTION
_low = character | integer;
_high = character | integer;
+ nondeterministic = "?";
+
+ committed = "+", _, _operand;
rejection = "!", _, _operand;
concatenation = _operand, {_, ",", _, _operand};
- alternation = concatenation, {_, "|", _, concatenation};
- optional = "[", _, _expression, _, "]";
- repeated = "{", _, _expression, _, "}";
+ alternation = concatenation, {_, [nondeterministic], "|", _, concatenation};
+ optional = [nondeterministic], "[", _, _expression, _, "]";
+ repeated = [nondeterministic], "{", _, _expression, _, "}";
group = "(", _, _expression, _, ")";
char-range = "<", _, _low, _, ",", _, _high, "_", ">";
exception = "-";
@@ -88,7 +91,7 @@ EXTENDED DESCRIPTION
_literal = char-range | exception | string;
_group = optional | repeated | group | embedded-rule;
- _operand = _group | _literal | rejection;
+ _operand = _group | _literal | rejection | committed;
_expression = alternation;
@@ -109,12 +112,24 @@ EXTENDED DESCRIPTION
reached, the parser will terminate there.
Repeated symbols may occur any number of times, including
- zero. The compiler is able to backtrack if it takes too much.
+ zero. The parser will try to take as much as as possible.
+
+ Optional symbols are taken whenever possible.
+
+ Concatenation has higher precedence than alternation, groups
+ ("(", ..., ")") have no semantic meaning and are useful only
+ for including alternations inside concatenations or put
+ alternations or concatenations inside a commitment or
+ rejection.
- Concatenation has higher precedence than alternation,
- groups ("(", ..., ")") have no semantic meaning and are useful
- only to put a alternation inside a concatenation without
- creating a new rule for that.
+ The parser has the ability to be non-deterministic, which
+ can make it really slow. The speed up parsing, you can add
+ commits. Once the committed sentence has been matched, the
+ branching-points inside it are unrecorded, so when the parser
+ backtracks, it will not try different choices inside the
+ committed sentence. Commit sentences are undone by the parser
+ when it backtracks to a branching-point outside the committed
+ sentence.
In character ranges, the _high and _low values must be at
least 0 and at most 255, and _high must be greater than _low.
@@ -125,7 +140,9 @@ EXTENDED DESCRIPTION
Left recursion is illegal (it will cause stack overflow at
runtime as the empty condition before the recursion is always
- met).
+ met). Likewise, repeated optional sentences are illegal; a
+ repeated sentence must always consume input, otherwise it
+ gets stuck.
Right-context-sensitive grammar
libparser originally used context-free grammar, but with
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax
index d76b68b..fb5a871 100644
--- a/calc-example/calc.syntax
+++ b/calc-example/calc.syntax
@@ -15,16 +15,16 @@ DIV = _, ("/" | "∕" | "÷"), _;
sign = ADD | SUB;
_digit = DIGIT | _WHITESPACE | "_" | "'";
-unsigned = DIGIT, +({_digit}, !_digit);
+unsigned = DIGIT, {_digit}, !_digit;
_number = unsigned | "(", _expr, (")" | -);
-number = _number, +{_, _number}; (* optionally with implicit multiplication *)
+number = _number, {_, _number}; (* optionally with implicit multiplication *)
value = [sign], number;
_expr = hyper1;
-hyper1 = _, hyper2, {+(ADD | SUB), +(hyper2 | -)}, _;
-hyper2 = _, value, {+(MUL | DIV), +(value | -)}, _;
+hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
+hyper2 = _, value, {(MUL | DIV), (value | -)}, _;
diff --git a/libparser-generate.c b/libparser-generate.c
index a8761f8..0a91835 100644
--- a/libparser-generate.c
+++ b/libparser-generate.c
@@ -38,6 +38,7 @@ struct node {
struct node *next; /* next element in list */
struct node *data; /* beginning of subsentence */
struct node **head; /* end of subsentence */
+ int nondeterministic;
};
@@ -65,15 +66,6 @@ emalloc(size_t n)
}
static void *
-ecalloc(size_t n, size_t m)
-{
- void *ret = calloc(n, m);
- if (!ret)
- eprintf("%s: calloc %zu %zu: %s\n", argv0, n, m, strerror(errno));
- return ret;
-}
-
-static void *
erealloc(void *ptr, size_t n)
{
void *ret = realloc(ptr, n);
@@ -104,6 +96,20 @@ estrdup(char *s)
}
+static struct node *
+new_node(void)
+{
+ struct node *ret = emalloc(sizeof(*ret));
+ ret->token = NULL;
+ ret->parent = NULL;
+ ret->next = NULL;
+ ret->data = NULL;
+ ret->head = NULL;
+ ret->nondeterministic = 0;
+ return ret;
+}
+
+
static int
strpcmp(const void *av, const void *bv)
{
@@ -120,6 +126,7 @@ isidentifier(char c)
}
+
static int
check_utf8(char *buf, size_t *ip, size_t len)
{
@@ -379,10 +386,11 @@ emit_and_free_sentence(struct node *node, size_t *indexp)
case '!': type = "REJECTION"; unary:
emit_and_free_sentence(node->data, indexp);
printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
- ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu"
+ ".type = LIBPARSER_SENTENCE_TYPE_%s%s, .sentence = &sentence_%zu_%zu"
"}};\n",
nrule_names, index,
- type, nrule_names, index + 1u);
+ node->nondeterministic ? "ND_" : "", type,
+ nrule_names, index + 1u);
break;
case '<':
@@ -412,11 +420,11 @@ emit_and_free_sentence(struct node *node, size_t *indexp)
left = *indexp;
emit_and_free_sentence(node->data, indexp);
printf("static union libparser_sentence sentence_%zu_%zu = {.binary = {"
- ".type = LIBPARSER_SENTENCE_TYPE_%s, "
+ ".type = LIBPARSER_SENTENCE_TYPE_%s%s, "
".left = &sentence_%zu_%zu, .right = &sentence_%zu_%zu"
"}};\n",
nrule_names, index,
- type,
+ node->nondeterministic ? "ND_" : "", type,
nrule_names, left, nrule_names, right);
break;
@@ -587,7 +595,7 @@ main(int argc, char *argv[])
struct node *stack = NULL, *parent_node, *node;
char *data;
struct token **tokens;
- size_t i, j;
+ size_t i, j, nondeterministic;
int cmp, err, val;
if (argc) {
@@ -613,6 +621,7 @@ main(int argc, char *argv[])
printf("#include <libparser.h>\n");
+ nondeterministic = SIZE_MAX;
i = 0;
again:
for (; tokens[i]; i++) {
@@ -635,7 +644,7 @@ again:
}
/* Also remove any whitespace (the tokeniser
- * simple and does not recognise mulltisymbol
+ * simple and does not recognise multisymbol
* tokens (that is apart form strings and
* identifiers) so it cannot ignore whitespace. */
if (isspace(tokens[i]->s[0])) {
@@ -643,6 +652,16 @@ again:
continue;
}
+ /* Check if next token is non-deterministic */
+ if (tokens[i]->s[0] == '?') {
+ if (tokens[i + 1u]->s[0] == '[' ||
+ tokens[i + 1u]->s[0] == '{' ||
+ tokens[i + 1u]->s[0] == '|') {
+ nondeterministic = i + 1u;
+ continue;
+ }
+ }
+
/* For the sake of code readability, identify
* the token type */
if (tokens[i]->s[0] == '"') {
@@ -661,7 +680,8 @@ again:
eprintf("%s: expected an identifier on line %zu at column %zu (character %zu)\n",
argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
}
- stack = calloc(1, sizeof(*stack));
+ stack = new_node();
+ stack->nondeterministic = i == nondeterministic;
stack->token = tokens[i];
stack->head = &stack->data;
/* and then we expect an equals sign */
@@ -715,7 +735,8 @@ again:
state = EXPECT_RANGE_LOW;
push_stack:
parent_node = stack;
- stack = ecalloc(1, sizeof(*stack));
+ stack = new_node();
+ stack->nondeterministic = i == nondeterministic;
stack->parent = parent_node;
stack->token = tokens[i];
stack->head = &stack->data;
@@ -756,7 +777,8 @@ again:
* token to be an operand */
state = EXPECT_OPERAND;
add_singleton:
- node = calloc(1u, sizeof(*node));
+ node = new_node();
+ node->nondeterministic = i == nondeterministic;
node->token = tokens[i];
*stack->head = node;
stack->head = &node->next;
diff --git a/libparser.7 b/libparser.7
index 2d921a7..c50b5c9 100644
--- a/libparser.7
+++ b/libparser.7
@@ -88,12 +88,14 @@ integer = _decimal | _hexadecimal; (* May not exceed 255. *)
_low = character | integer;
_high = character | integer;
+nondeterministic = \(dq?\(dq;
+
committed = \(dq+\(dq, _, _operand;
rejection = \(dq!\(dq, _, _operand;
concatenation = _operand, {_, \(dq,\(dq, _, _operand};
-alternation = concatenation, {_, \(dq|\(dq, _, concatenation};
-optional = \(dq[\(dq, _, _expression, _, \(dq]\(dq;
-repeated = \(dq{\(dq, _, _expression, _, \(dq}\(dq;
+alternation = concatenation, {_, [nondeterministic], \(dq|\(dq, _, concatenation};
+optional = [nondeterministic], \(dq[\(dq, _, _expression, _, \(dq]\(dq;
+repeated = [nondeterministic], \(dq{\(dq, _, _expression, _, \(dq}\(dq;
group = \(dq(\(dq, _, _expression, _, \(dq)\(dq;
char-range = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq;
exception = \(dq-\(dq;
@@ -132,18 +134,18 @@ Optional symbols are taken whenever possible.
Concatenation has higher precedence than alternation,
groups
.RB (\(dq ( "\(dq, ..., \(dq" ) \(dq)
-have no semantic meaning and are useful only to put a
-alternation inside a concatenation without creating a
-new rule for that.
+have no semantic meaning and are useful only for including
+alternations inside concatenations or put alternations
+or concatenations inside a commitment or rejection.
.PP
-The parser is non-deterministic, which can make it really
-slow. The speed up parsing, you can add commits. Once the
-committed sentence has been matched, the branching-points
-inside it are unrecorded, so when the parser backtracks,
-it will not try different choices inside the committed
-sentence. Commit sentences are undone by the parser when
-it backtracks to a branching-point outside the committed
-sentence.
+The parser has the ability to be non-deterministic, which
+can make it really slow. The speed up parsing, you can add
+commits. Once the committed sentence has been matched,
+the branching-points inside it are unrecorded, so when
+the parser backtracks, it will not try different choices
+inside the committed sentence. Commit sentences are undone
+by the parser when it backtracks to a branching-point
+outside the committed sentence.
.PP
In character ranges, the
.B _high
diff --git a/libparser.c b/libparser.c
index 7ef7267..5ecb7ee 100644
--- a/libparser.c
+++ b/libparser.c
@@ -267,6 +267,9 @@ sentence_type_string(enum libparser_sentence_type type)
case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF";
case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON";
case LIBPARSER_SENTENCE_TYPE_COMMITTED: return "COMMITTED";
+ case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL: return "ND_OPTIONAL";
+ case LIBPARSER_SENTENCE_TYPE_ND_REPEATED: return "ND_REPEATED";
+ case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION: return "ND_ALTERNATION";
default: return "<invalid>";
}
}
@@ -375,6 +378,15 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION:
+ fprintf(stderr, "(");
+ print_sentence(sentence->binary.left, indent + 1);
+ fprintf(stderr, " ?| \n%*.s", indent + 1, "");
+ indent = print_sentence(sentence->binary.right, indent + 1);
+ fprintf(stderr, ")");
+ indent += 1;
+ break;
+
case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
fprintf(stderr, "(");
print_sentence(sentence->binary.left, indent + 1);
@@ -398,6 +410,9 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL:
+ fprintf(stderr, "?");
+ /* fall through */
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
fprintf(stderr, "[");
indent = print_sentence(sentence->unary.sentence, indent + 1);
@@ -405,6 +420,9 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_ND_REPEATED:
+ fprintf(stderr, "?");
+ /* fall through */
case LIBPARSER_SENTENCE_TYPE_REPEATED:
fprintf(stderr, "{");
indent = print_sentence(sentence->unary.sentence, indent + 1);
@@ -827,6 +845,7 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont
#endif
switch (sentence->type) {
+ case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION:
case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
if (braching(rule, sentence, start, ctx, &choice))
return -1;
@@ -837,6 +856,7 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont
return -1;
return 1;
+ case LIBPARSER_SENTENCE_TYPE_ND_REPEATED:
case LIBPARSER_SENTENCE_TYPE_REPEATED:
if (braching(rule, sentence, start, ctx, &choice))
return -1;
@@ -860,6 +880,7 @@ add_unit(const char *rule, const union libparser_sentence *sentence, struct cont
}
return 1;
+ case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL:
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
if (braching(rule, sentence, start, ctx, &choice))
return -1;
@@ -1005,7 +1026,9 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last)
switch (unit->sentence->type) {
case LIBPARSER_SENTENCE_TYPE_RULE:
+ case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL:
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
+ case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION:
case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
case LIBPARSER_SENTENCE_TYPE_COMMITTED:
#if PRINT_ACTIONS
@@ -1033,6 +1056,7 @@ make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last)
*last = unit;
goto out;
+ case LIBPARSER_SENTENCE_TYPE_ND_REPEATED:
case LIBPARSER_SENTENCE_TYPE_REPEATED:
case LIBPARSER_SENTENCE_TYPE_CONCATENATION:
#if PRINT_ACTIONS
@@ -1179,7 +1203,6 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat
#endif
/* TODO guard against left-side recursion */
- /* TODO make branching opt-in ("?") */
/* TODO break REPEATED on empty match */
followup:
#if PRINT_ACTIONS
@@ -1249,9 +1272,13 @@ followup:
}
if (!match)
goto mismatch;
- if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED) {
+ if (sentence->type == LIBPARSER_SENTENCE_TYPE_COMMITTED ||
+ sentence->type == LIBPARSER_SENTENCE_TYPE_ALTERNATION ||
+ sentence->type == LIBPARSER_SENTENCE_TYPE_OPTIONAL ||
+ sentence->type == LIBPARSER_SENTENCE_TYPE_REPEATED) {
#if PRINT_ACTIONS
- fprintf(stderr, "Removing interior choices for matched COMMITTED\n");
+ fprintf(stderr, "Removing interior choices for matched %s\n",
+ sentence_type_string(sentence->type));
#endif
j = ctx.followups.followups[i].unit_index;
while (ctx.choices.count &&
diff --git a/libparser.h b/libparser.h
index 66bedea..54e3829 100644
--- a/libparser.h
+++ b/libparser.h
@@ -21,7 +21,10 @@ enum libparser_sentence_type {
LIBPARSER_SENTENCE_TYPE_EXCEPTION, /* (none) */
LIBPARSER_SENTENCE_TYPE_EOF, /* (none) */
LIBPARSER_SENTENCE_TYPE_EPSILON, /* (none) */
- LIBPARSER_SENTENCE_TYPE_COMMITTED /* .unary */
+ LIBPARSER_SENTENCE_TYPE_COMMITTED, /* .unary */
+ LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL, /* .unary */
+ LIBPARSER_SENTENCE_TYPE_ND_REPEATED, /* .unary */
+ LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION /* .binary */
};
struct libparser_sentence_binary {
diff --git a/print-syntax.c b/print-syntax.c
index 3228401..07b3858 100644
--- a/print-syntax.c
+++ b/print-syntax.c
@@ -26,6 +26,15 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_ND_ALTERNATION:
+ printf("(");
+ print_sentence(sentence->binary.left, indent + 1);
+ printf(" ?| \n%*.s", indent + 1, "");
+ indent = print_sentence(sentence->binary.right, indent + 1);
+ printf(")");
+ indent += 1;
+ break;
+
case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
printf("(");
print_sentence(sentence->binary.left, indent + 1);
@@ -49,6 +58,9 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_ND_OPTIONAL:
+ printf("?");
+ /* fall through */
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
printf("[");
indent = print_sentence(sentence->unary.sentence, indent + 1);
@@ -56,6 +68,9 @@ print_sentence(const union libparser_sentence *sentence, int indent)
indent += 1;
break;
+ case LIBPARSER_SENTENCE_TYPE_ND_REPEATED:
+ printf("?");
+ /* fall through */
case LIBPARSER_SENTENCE_TYPE_REPEATED:
printf("{");
indent = print_sentence(sentence->unary.sentence, indent + 1);