aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMattias Andrée <maandree@kth.se>2021-04-17 23:41:01 +0200
committerMattias Andrée <maandree@kth.se>2021-04-17 23:41:01 +0200
commitc04509285946bc045f60a9e0633d818a49718fc0 (patch)
treecb9b83a4f83d3a606da7a32bf29dd78973a253d2
parentreadme: no left recursion (diff)
downloadlibparser-c04509285946bc045f60a9e0633d818a49718fc0.tar.gz
libparser-c04509285946bc045f60a9e0633d818a49718fc0.tar.bz2
libparser-c04509285946bc045f60a9e0633d818a49718fc0.tar.xz
Add libparser.7 and libparser-generate.1
Signed-off-by: Mattias Andrée <maandree@kth.se>
-rw-r--r--README4
-rw-r--r--TODO2
-rw-r--r--libparser-generate.163
-rw-r--r--libparser.7153
4 files changed, 219 insertions, 3 deletions
diff --git a/README b/README
index 9e2323d..b6594b4 100644
--- a/README
+++ b/README
@@ -19,7 +19,7 @@ DESCRIPTION
libparser is proudly non-self-hosted.
EXTENDED DESCRIPTION
- Syntax
+ Syntax
The grammar for libparser-generate(1)'s input can be described
in its own grammar:
@@ -67,7 +67,7 @@ EXTENDED DESCRIPTION
_decimal = _digit, {_digit};
_hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit};
- integer = _decimal | _hexadecimal; (* May not exceed 255. *)
+ integer = _decimal | _hexadecimal; (* May not exceed 255. *)
(* GROUPINGS *)
diff --git a/TODO b/TODO
index 641ce0d..669132d 100644
--- a/TODO
+++ b/TODO
@@ -1,3 +1,3 @@
+Add libparser_parse_file.3
Add support for prelexed
-Add man pages
Add tests
diff --git a/libparser-generate.1 b/libparser-generate.1
new file mode 100644
index 0000000..47ef367
--- /dev/null
+++ b/libparser-generate.1
@@ -0,0 +1,63 @@
+.TH LIBPARSER-GENERATE 1 LIBPARSER
+.SH NAME
+libparser-generate \- Generate grammar definition for libparser
+
+.SH SYNOPSIS
+.B libparser-generate
+.I main-rule
+
+.SH DESCRIPTION
+The
+.B libparser-generate
+utility parses the standard input according to the
+grammar specified in
+.BR libparser (7)
+and prints to the standard output a C source file
+containing the definition for
+.B libparser_rule_table
+that is declared in
+.B <libparser.h>
+as
+.PP
+.RS
+.nf
+.I extern const struct libparser_rule *const libparser_rule_table[];
+.fi
+.RE
+.PP
+This table will contain all defined rules, plus three
+special rules:
+.TP
+.B @start
+.nf
+.BI "@start = " main-rule ", (@eof | @noeof);"
+.fi
+
+where
+.I main-rule
+is the value of the
+.I main-rule
+operand in the command line. This
+.RB [ @start ]
+is the rule the
+.BR libparser_parse_file (3)
+function will use when parsing input.
+.TP
+.B @eof
+This rule has a special definition: it is
+matched if the end of the file has been
+reached.
+.TP
+.B @noeof
+.nf
+.B "@noeof = -;"
+.fi
+
+This rule is simply defined as an exception,
+causing the parsing to terminate at the end
+with an exception if it didn't reach the end
+of the file.
+
+.SH SEE ALSO
+.BR libparser (7),
+.BR libparser_parse_file (3)
diff --git a/libparser.7 b/libparser.7
new file mode 100644
index 0000000..1402dcd
--- /dev/null
+++ b/libparser.7
@@ -0,0 +1,153 @@
+.TH LIBPARSER 7 LIBPARSER
+.SH NAME
+libparser \- Context-free grammar parsing library
+
+.SH DESCRIPTION
+.B libparser
+is a small C library that parses input based on a
+precompiled context-free grammar.
+.PP
+To use
+.BR libparser ,
+a developer should write a syntax for the input that
+his application shall parse, in a syntax based on
+Extended Backus–Naur form (EBNF) (somewhat simplified
+but also somewhat extended).
+.BR libparser-generate (1)
+is then used to create a C source file describing the
+syntax, which shall be compiled into an object file
+with a C compiler. This file provides a definition of
+a global variable declared in
+.BR <libparser.h> :
+.IR libparser_rule_table .
+This variable is used when calling
+.BR libparser_parse_file (3)
+to parse the application's input.
+.PP
+.B libparser
+is proudly non-self-hosted.
+
+.SH EXTENDED DESCRIPTION
+.SS Syntax
+The grammar for
+.BR libparser-generate (1)'s
+input can be described in its own grammar:
+.PP
+.RS
+.nf
+(* CHARACTER CLASSES *)
+
+_space = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq;
+_alpha = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>;
+_octal = <\(dq0\(dq, \(dq7\(dq>;
+_digit = <\(dq0\(dq, \(dq9\(dq>;
+_xdigit = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>;
+_nonascii = <128, 255>;
+
+
+(* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *)
+
+_comment_char = _space | <\(dq!\(dq, 0x29> | <0x2B, 0xFF>;
+_comment_tail = [_comment_char], (\(dq*)\(dq | _comment_tail);
+_comment = \(dq(*\(dq, _comment_tail;
+
+_ = {_space | _comment};
+
+
+(* IDENTIFIERS *)
+
+_identifier_head = _alpha | _digit | _nonascii | \(dq_\(dq;
+_identifier_tail = _identifier_head | \(dq-\(dq;
+
+identifier = _identifier_head, {_identifier_tail};
+
+
+(* STRINGS *)
+
+_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq;
+_escape_hex = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit;
+_escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *)
+_escape = _escape_simple | _escape_hex | _escape_octal | -;
+_character = \(dq\e\e\(dq, _escape | <1, \(dq!\(dq> | <\(dq#\(dq, 0xFF>;
+
+string = \(dq\e\(dq\(dq, _character, {_character}, \(dq\e\(dq\(dq;
+character = \(dq\e\(dq\(dq, _character, \(dq\e\(dq\(dq;
+
+
+(* INTEGERS *)
+
+_decimal = _digit, {_digit};
+_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit};
+
+integer = _decimal | _hexadecimal; (* May not exceed 255. *)
+
+
+(* GROUPINGS *)
+
+_low = character | integer;
+_high = character | integer;
+
+concatenation = _, _operand, _, {\(dq,\(dq, _, _operand, _};
+alternation = _, concatenation, _, {\(dq|\(dq, _, concatenation, _};
+optional = _, \(dq[\(dq, _, _expression, _, \(dq]\(dq, _;
+repeated = _, \(dq{\(dq, _, _expression, _, \(dq}\(dq, _;
+group = _, \(dq(\(dq, _, _expression, _, \(dq)\(dq, _;
+char-range = _, \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, _, \(dq>\(dq, _;
+exception = _, \(dq-\(dq, _;
+embedded-rule = _, identifier, _;
+
+_literal = char-range | exception | string;
+_group = optional | repeated | group | embedded-rule;
+_operand = _group | _literal;
+
+_expression = alternation;
+
+
+(* RULES *)
+
+rule = _, identifier, _, \(dq=\(dq, _, _expression, \(dq;\(dq;
+
+(* This is the root rule of the grammar. *)
+grammar = {rule, _}, _;
+.fi
+.PP
+.RE
+The file must be encoded in UTF-8, with LF as the line
+break (CR and FF are illegal just because).
+.PP
+In alternations, the first (leftmost) match is selected.
+The parser is able to backtrack in case it later turns
+out that it could not finish that branch. Whenever an
+exception is reached, the parser will terminate there.
+.PP
+Repeated symbols may occur any number of times,
+including zero. The parser is able to backtrack if it
+takes too much.
+.PP
+Concatenation has higher precedence than alternation;
+groups
+.RB (\(dq ( "\(dq, ..., \(dq" ) \(dq)
+have no semantic meaning and are useful only to put an
+alternation inside a concatenation without creating a
+new rule for that.
+.PP
+In character ranges, the
+.B _high
+and
+.B _low
+values must be at least 0 and at most 255, and
+.B _high
+must be greater than
+.BR _low .
+.PP
+Rules that begin with an underscore will not show up
+for the application in the parse result; the rest of
+the rules will appear in the tree-formatted result.
+.PP
+Left recursion is illegal (it will cause stack
+overflow at runtime as the empty condition before the
+recursion is always met).
+
+.SH SEE ALSO
+.BR libparser-generate (1),
+.BR libparser_parse_file (3)