From 16b3146a1ed4497205a378472b35c40eb34c0d40 Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Mon, 19 Apr 2021 11:29:50 +0200 Subject: Add rejection + fix documentation of comment syntax (can contain string) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- README | 78 +++++++++++++++++++++++++----------------------- calc-example/calc.syntax | 32 ++++++++++---------- config.mk | 4 +-- libparser-generate.c | 17 +++++++++-- libparser.7 | 76 +++++++++++++++++++++++----------------------- libparser.c | 12 ++++++++ libparser.h | 1 + 7 files changed, 126 insertions(+), 94 deletions(-) diff --git a/README b/README index 3042070..684a6e9 100644 --- a/README +++ b/README @@ -1,4 +1,4 @@ -NAME +>NAME libparser - Context-free grammar parsing library DESCRIPTION @@ -25,21 +25,21 @@ EXTENDED DESCRIPTION (* CHARACTER CLASSES *) - _space = " " | "\n" | "\t"; - _alpha = <"a", "z"> | <"A", "Z">; - _octal = <"0", "7">; - _digit = <"0", "9">; - _xdigit = _digit | <"a", "f"> | <"A", "F">; - _nonascii = <128, 255>; + _space = " " | "\n" | "\t"; + _alpha = <"a", "z"> | <"A", "Z">; + _octal = <"0", "7">; + _digit = <"0", "9">; + _xdigit = _digit | <"a", "f"> | <"A", "F">; + _nonascii = <128, 255>; (* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *) - _comment_char = _space | <"!", 0x29> | <0x2B, 0xFF>; - _comment_tail = [_comment_char], ("*)" | _comment_tail); - _comment = "(*", _comment_tail; + _comment_char = _space | !"*", "\"", <"!", 0xFF>; + _comment_tail = [_comment_char], [_string], ("*)" | _comment_tail | -); + _comment = "(*", _comment_tail; - _ = {_space | _comment}; + _ = {_space | _comment}; (* IDENTIFIERS *) @@ -47,56 +47,58 @@ EXTENDED DESCRIPTION _identifier_head = _alpha | _digit | _nonascii | "_"; _identifier_tail = _identifier_head | "-"; - identifier = _identifier_head, {_identifier_tail}; + identifier = _identifier_head, {_identifier_tail}; (* STRINGS *) - _escape_simple = "\\" | "\"" | "'" 
| "a" | "b" | "f" | "n" | "r" | "v"; - _escape_hex = ("x" | "X"), _xdigit, _xdigit; - _escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *) - _escape = _escape_simple | _escape_hex | _escape_octal | -; - _character = "\\", _escape | <1, "!"> | <"#", 0xFF>; + _escape_simple = "\\" | "\"" | "'" | "a" | "b" | "f" | "n" | "r" | "v"; + _escape_hex = ("x" | "X"), _xdigit, _xdigit; + _escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *) + _escape = _escape_simple | _escape_hex | _escape_octal | -; + _character = "\\", _escape | !"\"", <1, 0xFF>; + _string = "\"", _character, {_character}, ("\"" | -); - string = "\"", _character, {_character}, "\""; - character = "\"", _character, "\""; + string = _string; + character = "\"", _character, ("\"" | -); (* INTEGERS *) - _decimal = _digit, {_digit}; - _hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit}; + _decimal = _digit, {_digit}; + _hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit}; - integer = _decimal | _hexadecimal; (* May not exceed 255. 
*) (* GROUPINGS *) - _low = character | integer; - _high = character | integer; + _low = character | integer; + _high = character | integer; - concatenation = _operand, {_, ",", _, _operand}; - alternation = concatenation, {_, "|", _, concatenation}; - optional = "[", _, _expression, _, "]"; - repeated = "{", _, _expression, _, "}"; - group = "(", _, _expression, _, ")"; - char-range = "<", _, _low, _, ",", _, _high, "_", ">"; - exception = "-"; - embedded-rule = identifier; + rejection = "!", _, _operand; + concatenation = _operand, {_, ",", _, _operand}; + alternation = concatenation, {_, "|", _, concatenation}; + optional = "[", _, _expression, _, "]"; + repeated = "{", _, _expression, _, "}"; + group = "(", _, _expression, _, ")"; + char-range = "<", _, _low, _, ",", _, _high, "_", ">"; + exception = "-"; + embedded-rule = identifier; - _literal = char-range | exception | string; - _group = optional | repeated | group | embedded-rule; - _operand = _group | _literal; + _literal = char-range | exception | string; + _group = optional | repeated | group | embedded-rule; + _operand = _group | _literal | rejection; - _expression = alternation; + _expression = alternation; (* RULES *) - rule = identifier, _, "=", _, _expression, _, ";"; + rule = identifier, _, "=", _, _expression, _, ";"; (* This is the root rule of the grammar. *) - grammar = _, {rules, _}; + grammar = _, {rules, _}; The file must be encoded in UTF-8, with LF as the line break (CR and FF are illegal just becuase). 
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax index 286019b..8051f57 100644 --- a/calc-example/calc.syntax +++ b/calc-example/calc.syntax @@ -1,27 +1,29 @@ -_WHITESPACE = " " | "\t" | " "; -_ = {_WHITESPACE}; +_WHITESPACE = " " | "\t" | " "; +_COMMENT = "(*", {!"*)", <0, 0xFF>}, "*)"; +_ = {_WHITESPACE | _COMMENT}; -DIGIT = <"0", "9">; -ADD = _, ("+"), _; -SUB = _, ("-" | "−"), _; -MUL = _, ("*" | "⋅" | "×"), _; -DIV = _, ("/" | "∕" | "÷"), _; +DIGIT = <"0", "9">; +ADD = _, ("+"), _; +SUB = _, ("-" | "−"), _; +MUL = _, ("*" | "⋅" | "×"), _; +DIV = _, ("/" | "∕" | "÷"), _; -sign = ADD | SUB; -unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"}; +sign = ADD | SUB; -_number = unsigned | "(", _expr, (")" | -); +unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"}; -number = _number, {_, _number}; (* optionally with implicit multiplication *) +_number = unsigned | "(", _expr, (")" | -); -value = [sign], number; +number = _number, {_, _number}; (* optionally with implicit multiplication *) -_expr = hyper1; +value = [sign], number; +_expr = hyper1; -hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _; -hyper2 = _, value, {(MUL | DIV), (value | -)}, _; + +hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _; +hyper2 = _, value, {(MUL | DIV), (value | -)}, _; diff --git a/config.mk b/config.mk index accd17b..dac7c46 100644 --- a/config.mk +++ b/config.mk @@ -4,5 +4,5 @@ MANPREFIX = $(PREFIX)/share/man CC = cc CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)" -CFLAGS = -std=c99 -Wall -O2 -LDFLAGS = -s +CFLAGS = -std=c99 -Wall -Og -g +LDFLAGS = diff --git a/libparser-generate.c b/libparser-generate.c index 72cd294..9f82a2e 100644 --- a/libparser-generate.c +++ b/libparser-generate.c @@ -361,12 +361,13 @@ emit_and_free_sentence(struct node *node, size_t *indexp) free(node); } - if (node->token->s[0] == '[' || node->token->s[0] == '{') { + if (node->token->s[0] == '[' || node->token->s[0] == '{' || node->token->s[0] == '!') { 
emit_and_free_sentence(node->data, indexp); printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {" ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu" "}};\n", - nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1); + nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : + node->token->s[0] == '{' ? "REPEATED" : "REJECTION", nrule_names, index + 1); } else if (node->token->s[0] == '<') { low = node->data; high = node->data->next; @@ -551,11 +552,16 @@ main(int argc, char *argv[]) again: for (; tokens[i]; i++) { if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') { + free(tokens[i]); + free(tokens[i + 1]); for (i += 2; tokens[i] && tokens[i + 1]; i++) { if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') { + free(tokens[i]); + free(tokens[i + 1]); i += 2; goto again; } + free(tokens[i]); } eprintf("%s: premature end of file\n", argv0); } @@ -612,6 +618,8 @@ again: stack->head = &stack->data; } else if (tokens[i]->s[0] == '-') { goto add; + } else if (tokens[i]->s[0] == '!') { + goto push_stack; } else { stray: eprintf("%s: stray '%c' on line %zu at column %zu (character %zu)\n", @@ -625,6 +633,11 @@ again: break; case EXPECT_OPERATOR: + while (stack->token->s[0] == '!') { + *stack->parent->head = stack; + stack->parent->head = &stack->next; + stack = stack->parent; + } if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') { state = EXPECT_OPERAND; add_singleton: diff --git a/libparser.7 b/libparser.7 index 31fff65..cb1d763 100644 --- a/libparser.7 +++ b/libparser.7 @@ -37,21 +37,21 @@ input can be described in its own grammar: .nf (* CHARACTER CLASSES *) -_space = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq; -_alpha = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>; -_octal = <\(dq0\(dq, \(dq7\(dq>; -_digit = <\(dq0\(dq, \(dq9\(dq>; -_xdigit = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>; -_nonascii = <128, 255>; +_space = \(dq \(dq | \(dq\en\(dq | 
\(dq\et\(dq; +_alpha = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>; +_octal = <\(dq0\(dq, \(dq7\(dq>; +_digit = <\(dq0\(dq, \(dq9\(dq>; +_xdigit = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>; +_nonascii = <128, 255>; (* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *) -_comment_char = _space | <\(dq!\(dq, 0x29> | <0x2B, 0xFF>; -_comment_tail = [_comment_char], (\(dq*)\(dq | _comment_tail); -_comment = \(dq(*\(dq, _comment_tail; +_comment_char = _space | !\(dq*\(dq, <\(dq!\(dq, 0xFF>; +_comment_tail = [_comment_char], [_string], (\(dq*)\(dq | _comment_tail | -); +_comment = \(dq(*\(dq, _comment_tail; -_ = {_space | _comment}; +_ = {_space | _comment}; (* IDENTIFIERS *) @@ -59,56 +59,58 @@ _ = {_space | _comment}; _identifier_head = _alpha | _digit | _nonascii | \(dq_\(dq; _identifier_tail = _identifier_head | \(dq-\(dq; -identifier = _identifier_head, {_identifier_tail}; +identifier = _identifier_head, {_identifier_tail}; (* STRINGS *) -_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq; -_escape_hex = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit; -_escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *) -_escape = _escape_simple | _escape_hex | _escape_octal | -; -_character = \(dq\e\e\(dq, _escape | <1, \(dq!\(dq> | <\(dq#\(dq, 0xFF>; +_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq; +_escape_hex = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit; +_escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *) +_escape = _escape_simple | _escape_hex | _escape_octal | -; +_character = \(dq\e\e\(dq, _escape | !\(dq\e\(dq\(dq, <1, 0xFF>; +_string = \(dq\e\(dq\(dq, _character, {_character}, (\(dq\e\(dq\(dq | -); -string = \(dq\e\(dq\(dq, _character, {_character}, \(dq\e\(dq\(dq; -character = \(dq\e\(dq\(dq, _character, \(dq\e\(dq\(dq; +string = _string; +character = \(dq\e\(dq\(dq, 
_character, (\(dq\e\(dq\(dq | -); (* INTEGERS *) -_decimal = _digit, {_digit}; -_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit}; +_decimal = _digit, {_digit}; +_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit}; -integer = _decimal | _hexadecimal; (* May not exceed 255. *) +integer = _decimal | _hexadecimal; (* May not exceed 255. *) (* GROUPINGS *) -_low = character | integer; -_high = character | integer; +_low = character | integer; +_high = character | integer; -concatenation = _operand, {_, \(dq,\(dq, _, _operand}; -alternation = concatenation, {_, \(dq|\(dq, _, concatenation}; -optional = \(dq[\(dq, _, _expression, _, \(dq]\(dq; -repeated = \(dq{\(dq, _, _expression, _, \(dq}\(dq; -group = \(dq(\(dq, _, _expression, _, \(dq)\(dq; -char-range = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq; -exception = \(dq-\(dq; -embedded-rule = identifier; +rejection = \(dq!\(dq, _, _operand; +concatenation = _operand, {_, \(dq,\(dq, _, _operand}; +alternation = concatenation, {_, \(dq|\(dq, _, concatenation}; +optional = \(dq[\(dq, _, _expression, _, \(dq]\(dq; +repeated = \(dq{\(dq, _, _expression, _, \(dq}\(dq; +group = \(dq(\(dq, _, _expression, _, \(dq)\(dq; +char-range = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq; +exception = \(dq-\(dq; +embedded-rule = identifier; -_literal = char-range | exception | string; -_group = optional | repeated | group | embedded-rule; -_operand = _group | _literal; +_literal = char-range | exception | string; +_group = optional | repeated | group | embedded-rule; +_operand = _group | _literal | rejection; -_expression = alternation; +_expression = alternation; (* RULES *) -rule = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq; +rule = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq; (* This is the root rule of the grammar. 
*) -grammar = _, {rules, _}; +grammar = _, {rules, _}; .fi .PP .RE diff --git a/libparser.c b/libparser.c index 36b662d..cb5981b 100644 --- a/libparser.c +++ b/libparser.c @@ -112,6 +112,18 @@ try_match(const char *rule, const union libparser_sentence *sentence, struct con } break; + case LIBPARSER_SENTENCE_TYPE_REJECTION: + unit->in = try_match(NULL, sentence->unary.sentence, ctx); + if (unit->in) { + free_unit(unit->in, ctx); + if (!ctx->exception) + goto mismatch; + ctx->exception = 0; + } + ctx->position = unit->start; + unit->rule = NULL; + break; + case LIBPARSER_SENTENCE_TYPE_OPTIONAL: unit->in = try_match(NULL, sentence->unary.sentence, ctx); goto prone; diff --git a/libparser.h b/libparser.h index 88e5b6e..d8b13d9 100644 --- a/libparser.h +++ b/libparser.h @@ -12,6 +12,7 @@ union libparser_sentence; enum libparser_sentence_type { LIBPARSER_SENTENCE_TYPE_CONCATENATION, /* .binary */ LIBPARSER_SENTENCE_TYPE_ALTERNATION, /* .binary */ + LIBPARSER_SENTENCE_TYPE_REJECTION, /* .unary */ LIBPARSER_SENTENCE_TYPE_OPTIONAL, /* .unary */ LIBPARSER_SENTENCE_TYPE_REPEATED, /* .unary */ LIBPARSER_SENTENCE_TYPE_STRING, /* .string */ -- cgit v1.2.3-70-g09d2