From 16b3146a1ed4497205a378472b35c40eb34c0d40 Mon Sep 17 00:00:00 2001
From: Mattias Andrée <maandree@kth.se>
Date: Mon, 19 Apr 2021 11:29:50 +0200
Subject: Add rejection + fix documentation of comment syntax (can contain
 string)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Mattias Andrée <maandree@kth.se>
---
 README                   | 78 +++++++++++++++++++++++++-----------------------
 calc-example/calc.syntax | 32 ++++++++++----------
 config.mk                |  4 +--
 libparser-generate.c     | 17 +++++++++--
 libparser.7              | 76 +++++++++++++++++++++++-----------------------
 libparser.c              | 12 ++++++++
 libparser.h              |  1 +
 7 files changed, 126 insertions(+), 94 deletions(-)

diff --git a/README b/README
index 3042070..684a6e9 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-NAME
+NAME
 	libparser - Context-free grammar parsing library
 
 DESCRIPTION
@@ -25,21 +25,21 @@ EXTENDED DESCRIPTION
 
 		(* CHARACTER CLASSES *)
 
-		_space    = " " | "\n" | "\t";
-		_alpha    = <"a", "z"> | <"A", "Z">;
-		_octal    = <"0", "7">;
-		_digit    = <"0", "9">;
-		_xdigit   = _digit | <"a", "f"> | <"A", "F">;
-		_nonascii = <128, 255>;
+		_space           = " " | "\n" | "\t";
+		_alpha           = <"a", "z"> | <"A", "Z">;
+		_octal           = <"0", "7">;
+		_digit           = <"0", "9">;
+		_xdigit          = _digit | <"a", "f"> | <"A", "F">;
+		_nonascii        = <128, 255>;
 
 
 		(* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *)
 
-		_comment_char = _space | <"!", 0x29> | <0x2B, 0xFF>;
-		_comment_tail = [_comment_char], ("*)" | _comment_tail);
-		_comment      = "(*", _comment_tail;
+		_comment_char    = _space | !"*", !"\"", <"!", 0xFF>;
+		_comment_tail    = [_comment_char], [_string], ("*)" | _comment_tail | -);
+		_comment         = "(*", _comment_tail;
 
-		_ = {_space | _comment};
+		_                = {_space | _comment};
 
 
 		(* IDENTIFIERS *)
@@ -47,56 +47,58 @@ EXTENDED DESCRIPTION
 		_identifier_head = _alpha | _digit | _nonascii | "_";
 		_identifier_tail = _identifier_head | "-";
 
-		identifier = _identifier_head, {_identifier_tail};
+		identifier       = _identifier_head, {_identifier_tail};
 
 
 		(* STRINGS *)
 
-		_escape_simple = "\\" | "\"" | "'" | "a" | "b" | "f" | "n" | "r" | "v";
-		_escape_hex    = ("x" | "X"), _xdigit, _xdigit;
-		_escape_octal  = _octal, {_octal}; (* May not exceed 255 in base 10 *)
-		_escape        = _escape_simple | _escape_hex | _escape_octal | -;
-		_character     = "\\", _escape | <1, "!"> | <"#", 0xFF>;
+		_escape_simple   = "\\" | "\"" | "'" | "a" | "b" | "f" | "n" | "r" | "v";
+		_escape_hex      = ("x" | "X"), _xdigit, _xdigit;
+		_escape_octal    = _octal, {_octal}; (* May not exceed 255 in base 10 *)
+		_escape          = _escape_simple | _escape_hex | _escape_octal | -;
+		_character       = "\\", _escape | !"\"", <1, 0xFF>;
+		_string          = "\"", _character, {_character}, ("\"" | -);
 
-		string    = "\"", _character, {_character}, "\"";
-		character = "\"", _character, "\"";
+		string           = _string;
+		character        = "\"", _character, ("\"" | -);
 
 
 		(* INTEGERS *)
 
-		_decimal     = _digit, {_digit};
-		_hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit};
+		_decimal         = _digit, {_digit};
+		_hexadecimal     = "0", ("x" | "X"), _xdigit, {_xdigit};
 
-		integer = _decimal | _hexadecimal; (* May not exceed 255. *)
+		integer          = _decimal | _hexadecimal; (* May not exceed 255. *)
 
 
 		(* GROUPINGS *)
 
-		_low  = character | integer;
-		_high = character | integer;
+		_low             = character | integer;
+		_high            = character | integer;
 
-		concatenation = _operand, {_, ",", _, _operand};
-		alternation   = concatenation, {_, "|", _, concatenation};
-		optional      = "[", _, _expression, _, "]";
-		repeated      = "{", _, _expression, _, "}";
-		group         = "(", _, _expression, _, ")";
-		char-range    = "<", _, _low, _, ",", _, _high, "_", ">";
-		exception     = "-";
-		embedded-rule = identifier;
+		rejection        = "!", _, _operand;
+		concatenation    = _operand, {_, ",", _, _operand};
+		alternation      = concatenation, {_, "|", _, concatenation};
+		optional         = "[", _, _expression, _, "]";
+		repeated         = "{", _, _expression, _, "}";
+		group            = "(", _, _expression, _, ")";
+		char-range       = "<", _, _low, _, ",", _, _high, _, ">";
+		exception        = "-";
+		embedded-rule    = identifier;
 
-		_literal = char-range | exception | string;
-		_group   = optional | repeated | group | embedded-rule;
-		_operand = _group | _literal;
+		_literal         = char-range | exception | string;
+		_group           = optional | repeated | group | embedded-rule;
+		_operand         = _group | _literal | rejection;
 
-		_expression = alternation;
+		_expression      = alternation;
 
 
 		(* RULES *)
 
-		rule = identifier, _, "=", _, _expression, _, ";";
+		rule             = identifier, _, "=", _, _expression, _, ";";
 
 		(* This is the root rule of the grammar. *)
-		grammar = _, {rules, _};
+		grammar          = _, {rule, _};
 
 	The file must be encoded in UTF-8, with LF as the line
 	break (CR and FF are illegal just becuase).
diff --git a/calc-example/calc.syntax b/calc-example/calc.syntax
index 286019b..8051f57 100644
--- a/calc-example/calc.syntax
+++ b/calc-example/calc.syntax
@@ -1,27 +1,29 @@
-_WHITESPACE = " " | "\t" | " ";
-_ = {_WHITESPACE};
+_WHITESPACE   = " " | "\t" | " ";
+_COMMENT      = "(*", {!"*)", <0, 0xFF>}, "*)";
 
+_             = {_WHITESPACE | _COMMENT};
 
-DIGIT = <"0", "9">;
 
-ADD = _, ("+"),             _;
-SUB = _, ("-" | "−"),       _;
-MUL = _, ("*" | "⋅" | "×"), _;
-DIV = _, ("/" | "∕" | "÷"), _;
+DIGIT         = <"0", "9">;
 
+ADD           = _, ("+"),             _;
+SUB           = _, ("-" | "−"),       _;
+MUL           = _, ("*" | "⋅" | "×"), _;
+DIV           = _, ("/" | "∕" | "÷"), _;
 
-sign = ADD | SUB;
 
-unsigned = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"};
+sign          = ADD | SUB;
 
-_number = unsigned | "(", _expr, (")" | -);
+unsigned      = DIGIT, {DIGIT | _WHITESPACE | "_" | "'"};
 
-number = _number, {_, _number}; (* optionally with implicit multiplication *)
+_number       = unsigned | "(", _expr, (")" | -);
 
-value = [sign], number;
+number        = _number, {_, _number}; (* optionally with implicit multiplication *)
 
-_expr = hyper1;
+value         = [sign], number;
 
+_expr         = hyper1;
 
-hyper1 = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
-hyper2 = _, value, {(MUL | DIV), (value | -)}, _;
+
+hyper1        = _, hyper2, {(ADD | SUB), (hyper2 | -)}, _;
+hyper2        = _, value, {(MUL | DIV), (value | -)}, _;
diff --git a/config.mk b/config.mk
index accd17b..dac7c46 100644
--- a/config.mk
+++ b/config.mk
@@ -4,5 +4,5 @@ MANPREFIX = $(PREFIX)/share/man
 CC = cc
 
 CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_XOPEN_SOURCE=700 -I"$$(pwd)"
-CFLAGS   = -std=c99 -Wall -O2
-LDFLAGS  = -s
+CFLAGS   = -std=c99 -Wall -Og -g
+LDFLAGS  = 
diff --git a/libparser-generate.c b/libparser-generate.c
index 72cd294..9f82a2e 100644
--- a/libparser-generate.c
+++ b/libparser-generate.c
@@ -361,12 +361,13 @@ emit_and_free_sentence(struct node *node, size_t *indexp)
 		free(node);
 	}
 
-	if (node->token->s[0] == '[' || node->token->s[0] == '{') {
+	if (node->token->s[0] == '[' || node->token->s[0] == '{' || node->token->s[0] == '!') {
 		emit_and_free_sentence(node->data, indexp);
 		printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
 		           ".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu"
 		       "}};\n",
-		       nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1);
+		       nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" :
+		                           node->token->s[0] == '{' ? "REPEATED" : "REJECTION", nrule_names, index + 1);
 	} else if (node->token->s[0] == '<') {
 		low = node->data;
 		high = node->data->next;
@@ -551,11 +552,16 @@ main(int argc, char *argv[])
 again:
 	for (; tokens[i]; i++) {
 		if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') {
+			free(tokens[i]);
+			free(tokens[i + 1]);
 			for (i += 2; tokens[i] && tokens[i + 1]; i++) {
 				if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') {
+					free(tokens[i]);
+					free(tokens[i + 1]);
 					i += 2;
 					goto again;
 				}
+				free(tokens[i]);
 			}
 			eprintf("%s: premature end of file\n", argv0);
 		}
@@ -612,6 +618,8 @@ again:
 					stack->head = &stack->data;
 				} else if (tokens[i]->s[0] == '-') {
 					goto add;
+				} else if (tokens[i]->s[0] == '!') {
+					goto push_stack;
 				} else {
 				stray:
 					eprintf("%s: stray '%c' on line %zu at column %zu (character %zu)\n",
@@ -625,6 +633,11 @@ again:
 			break;
 
 		case EXPECT_OPERATOR:
+			while (stack->token->s[0] == '!') {
+				*stack->parent->head = stack;
+				stack->parent->head = &stack->next;
+				stack = stack->parent;
+			}
 			if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') {
 				state = EXPECT_OPERAND;
 			add_singleton:
diff --git a/libparser.7 b/libparser.7
index 31fff65..cb1d763 100644
--- a/libparser.7
+++ b/libparser.7
@@ -37,21 +37,21 @@ input can be described in its own grammar:
 .nf
 (* CHARACTER CLASSES *)
 
-_space    = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq;
-_alpha    = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>;
-_octal    = <\(dq0\(dq, \(dq7\(dq>;
-_digit    = <\(dq0\(dq, \(dq9\(dq>;
-_xdigit   = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>;
-_nonascii = <128, 255>;
+_space           = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq;
+_alpha           = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>;
+_octal           = <\(dq0\(dq, \(dq7\(dq>;
+_digit           = <\(dq0\(dq, \(dq9\(dq>;
+_xdigit          = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>;
+_nonascii        = <128, 255>;
 
 
 (* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *)
 
-_comment_char = _space | <\(dq!\(dq, 0x29> | <0x2B, 0xFF>;
-_comment_tail = [_comment_char], (\(dq*)\(dq | _comment_tail);
-_comment      = \(dq(*\(dq, _comment_tail;
+_comment_char    = _space | !\(dq*\(dq, !\(dq\e\(dq\(dq, <\(dq!\(dq, 0xFF>;
+_comment_tail    = [_comment_char], [_string], (\(dq*)\(dq | _comment_tail | -);
+_comment         = \(dq(*\(dq, _comment_tail;
 
-_ = {_space | _comment};
+_                = {_space | _comment};
 
 
 (* IDENTIFIERS *)
@@ -59,56 +59,58 @@ _ = {_space | _comment};
 _identifier_head = _alpha | _digit | _nonascii | \(dq_\(dq;
 _identifier_tail = _identifier_head | \(dq-\(dq;
 
-identifier = _identifier_head, {_identifier_tail};
+identifier       = _identifier_head, {_identifier_tail};
 
 
 (* STRINGS *)
 
-_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq;
-_escape_hex    = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit;
-_escape_octal  = _octal, {_octal}; (* May not exceed 255 in base 10 *)
-_escape        = _escape_simple | _escape_hex | _escape_octal | -;
-_character     = \(dq\e\e\(dq, _escape | <1, \(dq!\(dq> | <\(dq#\(dq, 0xFF>;
+_escape_simple   = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq;
+_escape_hex      = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit;
+_escape_octal    = _octal, {_octal}; (* May not exceed 255 in base 10 *)
+_escape          = _escape_simple | _escape_hex | _escape_octal | -;
+_character       = \(dq\e\e\(dq, _escape | !\(dq\e\(dq\(dq, <1, 0xFF>;
+_string          = \(dq\e\(dq\(dq, _character, {_character}, (\(dq\e\(dq\(dq | -);
 
-string    = \(dq\e\(dq\(dq, _character, {_character}, \(dq\e\(dq\(dq;
-character = \(dq\e\(dq\(dq, _character, \(dq\e\(dq\(dq;
+string           = _string;
+character        = \(dq\e\(dq\(dq, _character, (\(dq\e\(dq\(dq | -);
 
 
 (* INTEGERS *)
 
-_decimal     = _digit, {_digit};
-_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit};
+_decimal         = _digit, {_digit};
+_hexadecimal     = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit};
 
-integer = _decimal | _hexadecimal; (* May not exceed 255. *)
+integer          = _decimal | _hexadecimal; (* May not exceed 255. *)
 
 
 (* GROUPINGS *)
 
-_low  = character | integer;
-_high = character | integer;
+_low             = character | integer;
+_high            = character | integer;
 
-concatenation = _operand, {_, \(dq,\(dq, _, _operand};
-alternation   = concatenation, {_, \(dq|\(dq, _, concatenation};
-optional      = \(dq[\(dq, _, _expression, _, \(dq]\(dq;
-repeated      = \(dq{\(dq, _, _expression, _, \(dq}\(dq;
-group         = \(dq(\(dq, _, _expression, _, \(dq)\(dq;
-char-range    = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq;
-exception     = \(dq-\(dq;
-embedded-rule = identifier;
+rejection        = \(dq!\(dq, _, _operand;
+concatenation    = _operand, {_, \(dq,\(dq, _, _operand};
+alternation      = concatenation, {_, \(dq|\(dq, _, concatenation};
+optional         = \(dq[\(dq, _, _expression, _, \(dq]\(dq;
+repeated         = \(dq{\(dq, _, _expression, _, \(dq}\(dq;
+group            = \(dq(\(dq, _, _expression, _, \(dq)\(dq;
+char-range       = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, _, \(dq>\(dq;
+exception        = \(dq-\(dq;
+embedded-rule    = identifier;
 
-_literal = char-range | exception | string;
-_group   = optional | repeated | group | embedded-rule;
-_operand = _group | _literal;
+_literal         = char-range | exception | string;
+_group           = optional | repeated | group | embedded-rule;
+_operand         = _group | _literal | rejection;
 
-_expression = alternation;
+_expression      = alternation;
 
 
 (* RULES *)
 
-rule = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq;
+rule             = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq;
 
 (* This is the root rule of the grammar. *)
-grammar = _, {rules, _};
+grammar          = _, {rule, _};
 .fi
 .PP
 .RE
diff --git a/libparser.c b/libparser.c
index 36b662d..cb5981b 100644
--- a/libparser.c
+++ b/libparser.c
@@ -112,6 +112,18 @@ try_match(const char *rule, const union libparser_sentence *sentence, struct con
 		}
 		break;
 
+	case LIBPARSER_SENTENCE_TYPE_REJECTION:
+		unit->in = try_match(NULL, sentence->unary.sentence, ctx);
+		if (unit->in) {
+			free_unit(unit->in, ctx);
+			if (!ctx->exception)
+				goto mismatch;
+			ctx->exception = 0;
+		}
+		ctx->position = unit->start;
+		unit->rule = NULL;
+		break;
+
 	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
 		unit->in = try_match(NULL, sentence->unary.sentence, ctx);
 		goto prone;
diff --git a/libparser.h b/libparser.h
index 88e5b6e..d8b13d9 100644
--- a/libparser.h
+++ b/libparser.h
@@ -12,6 +12,7 @@ union libparser_sentence;
 enum libparser_sentence_type {
 	LIBPARSER_SENTENCE_TYPE_CONCATENATION, /* .binary */
 	LIBPARSER_SENTENCE_TYPE_ALTERNATION,   /* .binary */
+	LIBPARSER_SENTENCE_TYPE_REJECTION,     /* .unary */
 	LIBPARSER_SENTENCE_TYPE_OPTIONAL,      /* .unary */
 	LIBPARSER_SENTENCE_TYPE_REPEATED,      /* .unary */
 	LIBPARSER_SENTENCE_TYPE_STRING,        /* .string */
-- 
cgit v1.3.1