aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2021-04-19 11:29:50 +0200
committerMattias Andrée <maandree@kth.se>2021-04-19 11:29:50 +0200
commit16b3146a1ed4497205a378472b35c40eb34c0d40 (patch)
tree1607ca63497008ce51f4333dcea9b729483f23f8
parentCompile with -O2 and -s (diff)
downloadlibparser-16b3146a1ed4497205a378472b35c40eb34c0d40.tar.gz
libparser-16b3146a1ed4497205a378472b35c40eb34c0d40.tar.bz2
libparser-16b3146a1ed4497205a378472b35c40eb34c0d40.tar.xz
Add rejection + fix documentation of comment syntax (can contain string)
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r--README78
-rw-r--r--calc-example/calc.syntax32
-rw-r--r--config.mk4
-rw-r--r--libparser-generate.c17
-rw-r--r--libparser.776
-rw-r--r--libparser.c12
-rw-r--r--libparser.h1
7 files changed, 126 insertions, 94 deletions
diff --git a/README b/README
index 3042070..684a6e9 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-NAME
+>NAME
libparser - Context-free grammar parsing library
DESCRIPTION
@@ -25,21 +25,21 @@ EXTENDED DESCRIPTION
(* CHARACTER CLASSES *)
- _space = " " | "\n" | "\t";
- _alpha = <"a", "z"> | <"A", "Z">;
- _octal = <"0", "7">;
- _digit = <"0", "9">;
- _xdigit = _digit | <"a", "f"> | <"A", "F">;
- _nonascii = <128, 255>;
+ _space = " " | "\n" | "\t";
+ _alpha = <"a", "z"> | <"A", "Z">;
+ _octal = <"0", "7">;
+ _digit = <"0", "9">;
+ _xdigit = _digit | <"a", "f"> | <"A", "F">;
+ _nonascii = <128, 255>;
(* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *)
- _comment_char = _space | <"!", 0x29> | <0x2B, 0xFF>;
- _comment_tail = [_comment_char], ("*)" | _comment_tail);
- _comment = "(*", _comment_tail;
+ _comment_char = _space | !"*", "\"", <"!", 0xFF>;
+ _comment_tail = [_comment_char], [_string], ("*)" | _comment_tail | -);
+ _comment = "(*", _comment_tail;
- _ = {_space | _comment};
+ _ = {_space | _comment};
(* IDENTIFIERS *)
@@ -47,56 +47,58 @@ EXTENDED DESCRIPTION
_identifier_head = _alpha | _digit | _nonascii | "_";
_identifier_tail = _identifier_head | "-";
- identifier = _identifier_head, {_identifier_tail};
+ identifier = _identifier_head, {_identifier_tail};
(* STRINGS *)
- _escape_simple = "\\" | "\"" | "'" | "a" | "b" | "f" | "n" | "r" | "v";
- _escape_hex = ("x" | "X"), _xdigit, _xdigit;
- _escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *)
- _escape = _escape_simple | _escape_hex | _escape_octal | -;
- _character = "\\", _escape | <1, "!"> | <"#", 0xFF>;
+ _escape_simple = "\\" | "\"" | "'" | "a" | "b" | "f" | "n" | "r" | "v";
+ _escape_hex = ("x" | "X"), _xdigit, _xdigit;
+ _escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *)
+ _escape = _escape_simple | _escape_hex | _escape_octal | -;
+ _character = "\\", _escape | !"\"", <1, 0xFF>;
+ _string = "\"", _character, {_character}, ("\"" | -);
- string = "\"", _character, {_character}, "\"";
- character = "\"", _character, "\"";
+ string = _string;
+ character = "\"", _character, ("\"" | -);
(* INTEGERS *)
- _decimal = _digit, {_digit};
- _hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit};
+ _decimal = _digit, {_digit};
+ _hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit};
- integer = _decimal | _hexadecimal; (* May not exceed 255. *)
+ integer = _decimal | _hexadecimal; (* May not exceed 255. *)
(* GROUPINGS *)
- _low = character | integer;
- _high = character | integer;
+ _low = character | integer;
+ _high = character | integer;
- concatenation = _operand, {_, ",", _, _operand};
- alternation = concatenation, {_, "|", _, concatenation};
- optional = "[", _, _expression, _, "]";
- repeated = "{", _, _expression, _, "}";
- group = "(", _, _expression, _, ")";
- char-range = "<", _, _low, _, ",", _, _high, "_", ">";
- exception = "-";
- embedded-rule = identifier;
+ rejection = "!", _, _operand;
+ concatenation = _operand, {_, ",", _, _operand};
+ alternation = concatenation, {_, "|", _, concatenation};
+ optional = "[", _, _expression, _, "]";
+ repeated = "{", _, _expression, _, "}";
+ group = "(", _, _expression, _, ")";
+ char-range = "<", _, _low, _, ",", _, _high, "_", ">";
+ exception = "-";
+ embedded-rule = identifier;
- _literal = char-range | exception | string;
- _group = optional | repeated | group | embedded-rule;
- _operand = _group | _literal;
+ _literal = char-range | exception | string;
+ _group = optional | repeated | group | embedded-rule;
+ _operand = _group | _literal | rejection;
- _expression = alternation;
+ _expression = alternation;
(* RULES *)
- rule = identifier, _, "=", _, _expression, _, ";";
+ rule = identifier, _, "=", _, _expression, _, ";";
(* This is the root rule of the grammar. *)
- grammar = _, {rules, _};
+ grammar = _, {rules, _};
The file must be encoded in UTF-8, with LF as the line
break (CR and FF are illegal just because).
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax
index 286019b..8051f57 100644
--- a/calc-example/calc.syntax
+++ b/calc-example/calc.syntax
@@ -1,27 +1,29 @@
-_WHITESPACE = " " | "\t" | " ";
-_ = {_WHITESPACE};
+_WHITESPACE = " " | "\t" | " ";
+_COMMENT = "(*", {!"*)", <0, 0xFF>}, "*)";
+_ = {_WHITESPACE | _COMMENT};
-DIGIT = <"0", "9">;
-ADD = _, ("+"), _;
-SUB = _, ("-" | "−"), _;
-MUL = _, ("*" | "⋅" | "×"), _;
-DIV = _, ("/" | "∕" | "÷"), _;
+DIGIT = <"0", "9">;
+ADD = _, ("+"), _;
+SUB = _, ("-" | "−"), _;
+MUL = _, ("*" | "⋅" | "×"), _;
+DIV = _, ("/" | "∕" | "÷"), _;
-sign = ADD | SUB;
-unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"};
+sign = ADD | SUB;
-_number = unsigned | "(", _expr, (")" | -);
+unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"};
-number = _number, {_, _number}; (* optionally with implicit multiplication *)
+_number = unsigned | "(", _expr, (")" | -);
-value = [sign], number;
+number = _number, {_, _number}; (* optionally with implicit multiplication *)
-_expr = hyper1;
+value = [sign], number;
+_expr = hyper1;
-hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
-hyper2 = _, value, {(MUL | DIV), (value | -)}, _;
+
+hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
+hyper2 = _, value, {(MUL | DIV), (value | -)}, _;
diff --git a/config.mk b/config.mk
index accd17b..dac7c46 100644
--- a/config.mk
+++ b/config.mk
@@ -4,5 +4,5 @@ MANPREFIX = $(PREFIX)/share/man
CC = cc
CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)"
-CFLAGS = -std=c99 -Wall -O2
-LDFLAGS = -s
+CFLAGS = -std=c99 -Wall -Og -g
+LDFLAGS =
diff --git a/libparser-generate.c b/libparser-generate.c
index 72cd294..9f82a2e 100644
--- a/libparser-generate.c
+++ b/libparser-generate.c
@@ -361,12 +361,13 @@ emit_and_free_sentence(struct node *node, size_t *indexp)
free(node);
}
- if (node->token->s[0] == '[' || node->token->s[0] == '{') {
+ if (node->token->s[0] == '[' || node->token->s[0] == '{' || node->token->s[0] == '!') {
emit_and_free_sentence(node->data, indexp);
printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu"
"}};\n",
- nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1);
+ nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" :
+ node->token->s[0] == '{' ? "REPEATED" : "REJECTION", nrule_names, index + 1);
} else if (node->token->s[0] == '<') {
low = node->data;
high = node->data->next;
@@ -551,11 +552,16 @@ main(int argc, char *argv[])
again:
for (; tokens[i]; i++) {
if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') {
+ free(tokens[i]);
+ free(tokens[i + 1]);
for (i += 2; tokens[i] && tokens[i + 1]; i++) {
if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') {
+ free(tokens[i]);
+ free(tokens[i + 1]);
i += 2;
goto again;
}
+ free(tokens[i]);
}
eprintf("%s: premature end of file\n", argv0);
}
@@ -612,6 +618,8 @@ again:
stack->head = &stack->data;
} else if (tokens[i]->s[0] == '-') {
goto add;
+ } else if (tokens[i]->s[0] == '!') {
+ goto push_stack;
} else {
stray:
eprintf("%s: stray '%c' on line %zu at column %zu (character %zu)\n",
@@ -625,6 +633,11 @@ again:
break;
case EXPECT_OPERATOR:
+ while (stack->token->s[0] == '!') {
+ *stack->parent->head = stack;
+ stack->parent->head = &stack->next;
+ stack = stack->parent;
+ }
if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') {
state = EXPECT_OPERAND;
add_singleton:
diff --git a/libparser.7 b/libparser.7
index 31fff65..cb1d763 100644
--- a/libparser.7
+++ b/libparser.7
@@ -37,21 +37,21 @@ input can be described in its own grammar:
.nf
(* CHARACTER CLASSES *)
-_space = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq;
-_alpha = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>;
-_octal = <\(dq0\(dq, \(dq7\(dq>;
-_digit = <\(dq0\(dq, \(dq9\(dq>;
-_xdigit = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>;
-_nonascii = <128, 255>;
+_space = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq;
+_alpha = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>;
+_octal = <\(dq0\(dq, \(dq7\(dq>;
+_digit = <\(dq0\(dq, \(dq9\(dq>;
+_xdigit = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>;
+_nonascii = <128, 255>;
(* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *)
-_comment_char = _space | <\(dq!\(dq, 0x29> | <0x2B, 0xFF>;
-_comment_tail = [_comment_char], (\(dq*)\(dq | _comment_tail);
-_comment = \(dq(*\(dq, _comment_tail;
+_comment_char = _space | !\(dq*\(dq, <\(dq!\(dq, 0xFF>;
+_comment_tail = [_comment_char], [_string], (\(dq*)\(dq | _comment_tail | -);
+_comment = \(dq(*\(dq, _comment_tail;
-_ = {_space | _comment};
+_ = {_space | _comment};
(* IDENTIFIERS *)
@@ -59,56 +59,58 @@ _ = {_space | _comment};
_identifier_head = _alpha | _digit | _nonascii | \(dq_\(dq;
_identifier_tail = _identifier_head | \(dq-\(dq;
-identifier = _identifier_head, {_identifier_tail};
+identifier = _identifier_head, {_identifier_tail};
(* STRINGS *)
-_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq;
-_escape_hex = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit;
-_escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *)
-_escape = _escape_simple | _escape_hex | _escape_octal | -;
-_character = \(dq\e\e\(dq, _escape | <1, \(dq!\(dq> | <\(dq#\(dq, 0xFF>;
+_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq;
+_escape_hex = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit;
+_escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *)
+_escape = _escape_simple | _escape_hex | _escape_octal | -;
+_character = \(dq\e\e\(dq, _escape | !\(dq\e\(dq\(dq, <1, 0xFF>;
+_string = \(dq\e\(dq\(dq, _character, {_character}, (\(dq\e\(dq\(dq | -);
-string = \(dq\e\(dq\(dq, _character, {_character}, \(dq\e\(dq\(dq;
-character = \(dq\e\(dq\(dq, _character, \(dq\e\(dq\(dq;
+string = _string;
+character = \(dq\e\(dq\(dq, _character, (\(dq\e\(dq\(dq | -);
(* INTEGERS *)
-_decimal = _digit, {_digit};
-_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit};
+_decimal = _digit, {_digit};
+_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit};
-integer = _decimal | _hexadecimal; (* May not exceed 255. *)
+integer = _decimal | _hexadecimal; (* May not exceed 255. *)
(* GROUPINGS *)
-_low = character | integer;
-_high = character | integer;
+_low = character | integer;
+_high = character | integer;
-concatenation = _operand, {_, \(dq,\(dq, _, _operand};
-alternation = concatenation, {_, \(dq|\(dq, _, concatenation};
-optional = \(dq[\(dq, _, _expression, _, \(dq]\(dq;
-repeated = \(dq{\(dq, _, _expression, _, \(dq}\(dq;
-group = \(dq(\(dq, _, _expression, _, \(dq)\(dq;
-char-range = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq;
-exception = \(dq-\(dq;
-embedded-rule = identifier;
+rejection = \(dq!\(dq, _, _operand;
+concatenation = _operand, {_, \(dq,\(dq, _, _operand};
+alternation = concatenation, {_, \(dq|\(dq, _, concatenation};
+optional = \(dq[\(dq, _, _expression, _, \(dq]\(dq;
+repeated = \(dq{\(dq, _, _expression, _, \(dq}\(dq;
+group = \(dq(\(dq, _, _expression, _, \(dq)\(dq;
+char-range = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq;
+exception = \(dq-\(dq;
+embedded-rule = identifier;
-_literal = char-range | exception | string;
-_group = optional | repeated | group | embedded-rule;
-_operand = _group | _literal;
+_literal = char-range | exception | string;
+_group = optional | repeated | group | embedded-rule;
+_operand = _group | _literal | rejection;
-_expression = alternation;
+_expression = alternation;
(* RULES *)
-rule = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq;
+rule = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq;
(* This is the root rule of the grammar. *)
-grammar = _, {rules, _};
+grammar = _, {rules, _};
.fi
.PP
.RE
diff --git a/libparser.c b/libparser.c
index 36b662d..cb5981b 100644
--- a/libparser.c
+++ b/libparser.c
@@ -112,6 +112,18 @@ try_match(const char *rule, const union libparser_sentence *sentence, struct con
}
break;
+ case LIBPARSER_SENTENCE_TYPE_REJECTION:
+ unit->in = try_match(NULL, sentence->unary.sentence, ctx);
+ if (unit->in) {
+ free_unit(unit->in, ctx);
+ if (!ctx->exception)
+ goto mismatch;
+ ctx->exception = 0;
+ }
+ ctx->position = unit->start;
+ unit->rule = NULL;
+ break;
+
case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
unit->in = try_match(NULL, sentence->unary.sentence, ctx);
goto prone;
diff --git a/libparser.h b/libparser.h
index 88e5b6e..d8b13d9 100644
--- a/libparser.h
+++ b/libparser.h
@@ -12,6 +12,7 @@ union libparser_sentence;
enum libparser_sentence_type {
LIBPARSER_SENTENCE_TYPE_CONCATENATION, /* .binary */
LIBPARSER_SENTENCE_TYPE_ALTERNATION, /* .binary */
+ LIBPARSER_SENTENCE_TYPE_REJECTION, /* .unary */
LIBPARSER_SENTENCE_TYPE_OPTIONAL, /* .unary */
LIBPARSER_SENTENCE_TYPE_REPEATED, /* .unary */
LIBPARSER_SENTENCE_TYPE_STRING, /* .string */