From c04509285946bc045f60a9e0633d818a49718fc0 Mon Sep 17 00:00:00 2001 From: Mattias Andrée Date: Sat, 17 Apr 2021 23:41:01 +0200 Subject: Add libparser.7 and libparser-generate.1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mattias Andrée --- README | 4 +- TODO | 2 +- libparser-generate.1 | 63 +++++++++++++++++++++ libparser.7 | 153 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 219 insertions(+), 3 deletions(-) create mode 100644 libparser-generate.1 create mode 100644 libparser.7 diff --git a/README b/README index 9e2323d..b6594b4 100644 --- a/README +++ b/README @@ -19,7 +19,7 @@ DESCRIPTION libparser is proudly non-self-hosted. EXTENDED DESCRIPTION - Syntax + Syntax The grammar for libparser-generate(1)'s input can be described in its own grammar: @@ -67,7 +67,7 @@ EXTENDED DESCRIPTION _decimal = _digit, {_digit}; _hexadecimal = "0", ("x" | "X"), _xdigit, {_xdigit}; - integer = _decimal | _hexadecimal; (* May not exceed 255. *) + integer = _decimal | _hexadecimal; (* May not exceed 255. 
*) (* GROUPINGS *) diff --git a/TODO b/TODO index 641ce0d..669132d 100644 --- a/TODO +++ b/TODO @@ -1,3 +1,3 @@ +Add libparser_parse_file.3 Add support for prelexed -Add man pages Add tests diff --git a/libparser-generate.1 b/libparser-generate.1 new file mode 100644 index 0000000..47ef367 --- /dev/null +++ b/libparser-generate.1 @@ -0,0 +1,63 @@ +.TH LIBPARSER-GENERATE 1 LIBPARSER +.SH NAME +libparser-generate \- Generate grammar definition for libparser + +.SH SYNOPSIS +.B libparser-generate +.I main-rule + +.SH DESCRIPTION +The +.B libparser-generate +utility parses the standard input according to the +grammar specified in +.BR libparser (7) +and prints to the standard output a C source file +containing the definition for +.B libparser_rule_table +that is declared in +.B <libparser.h> +as +.PP +.RS +.nf +.I extern const struct libparser_rule *const libparser_rule_table[]; +.fi +.RE +.PP +This table will contain all defined rules, plus three +special rules: +.TP +.B @start +.nf +.BI "@start = " main-rule ", (@eof | @noeof);" +.fi + +where +.I main-rule +is the value of the +.I main-rule +operand in the command line. This +.RB [ @start ] +is the rule the +.BR libparser_parse_file (3) +function will use when parsing input. +.TP +.B @eof +This rule has a special definition, it is +matched if the end of the file has been +reached. +.TP +.B @noeof +.nf +.B "@noeof = -;" +.fi + +This rule is simply defined as an exception, +causing the parsing to terminate at the end +with an exception if it didn't reach the end +of the file. + +.SH SEE ALSO +.BR libparser (7), +.BR libparser_parse_file (3) diff --git a/libparser.7 b/libparser.7 new file mode 100644 index 0000000..1402dcd --- /dev/null +++ b/libparser.7 @@ -0,0 +1,153 @@ +.TH LIBPARSER 7 LIBPARSER +.SH NAME +libparser \- Context-free grammar parsing library + +.SH DESCRIPTION +.B libparser +is a small C library that parses input based on a +precompiled context-free grammar. 
+.PP +To use +.BR libparser , +a developer should write a syntax for the input that +his application shall parse, in a syntax based on +Extended Backus–Naur form (EBNF) (somewhat simplified +but also somewhat extended). +.BR libparser-generate (1) +is then used to create a C source file describing the +syntax, which shall be compiled into an object file +with a C compiler. This file provides a definition of +a global variable declared in +.BR <libparser.h> : +.IR libparser_rule_table . +This variable is used when calling +.BR libparser_parse_file (3) +to parse the application's input. +.PP +.B libparser +is proudly non-self-hosted. + +.SH EXTENDED DESCRIPTION +.SS Syntax +The grammar for +.BR libparser-generate (1)'s +input can be described in its own grammar: +.PP +.RS +.nf +(* CHARACTER CLASSES *) + +_space = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq; +_alpha = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>; +_octal = <\(dq0\(dq, \(dq7\(dq>; +_digit = <\(dq0\(dq, \(dq9\(dq>; +_xdigit = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>; +_nonascii = <128, 255>; + + +(* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *) + +_comment_char = _space | <\(dq!\(dq, 0x29> | <0x2B, 0xFF>; +_comment_tail = [_comment_char], (\(dq*)\(dq | _comment_tail); +_comment = \(dq(*\(dq, _comment_tail; + +_ = {_space | _comment}; + + +(* IDENTIFIERS *) + +_identifier_head = _alpha | _digit | _nonascii | \(dq_\(dq; +_identifier_tail = _identifier_head | \(dq-\(dq; + +identifier = _identifier_head, {_identifier_tail}; + + +(* STRINGS *) + +_escape_simple = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq; +_escape_hex = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit; +_escape_octal = _octal, {_octal}; (* May not exceed 255 in base 10 *) +_escape = _escape_simple | _escape_hex | _escape_octal | -; +_character = \(dq\e\e\(dq, _escape | <1, \(dq!\(dq> | <\(dq#\(dq, 0xFF>; + +string = \(dq\e\(dq\(dq, _character, {_character}, \(dq\e\(dq\(dq; 
+character = \(dq\e\(dq\(dq, _character, \(dq\e\(dq\(dq; + + +(* INTEGERS *) + +_decimal = _digit, {_digit}; +_hexadecimal = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit}; + +integer = _decimal | _hexadecimal; (* May not exceed 255. *) + + +(* GROUPINGS *) + +_low = character | integer; +_high = character | integer; + +concatenation = _, _operand, _, {\(dq,\(dq, _, _operand, _}; +alternation = _, concatenation, _, {\(dq|\(dq, _, concatenation, _}; +optional = _, \(dq[\(dq, _, _expression, _, \(dq]\(dq, _; +repeated = _, \(dq{\(dq, _, _expression, _, \(dq}\(dq, _; +group = _, \(dq(\(dq, _, _expression, _, \(dq)\(dq, _; +char-range = _, \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, \(dq_\(dq, \(dq>\(dq, _; +exception = _, \(dq-\(dq, _; +embedded-rule = _, identifier, _; + +_literal = char-range | exception | string; +_group = optional | repeated | group | embedded-rule; +_operand = _group | _literal; + +_expression = alternation; + + +(* RULES *) + +rule = _, identifier, _, \(dq=\(dq, _, _expression, \(dq;\(dq; + +(* This is the root rule of the grammar. *) +grammar = {rules, _}, _; +.fi +.PP +.RE +The file must be encoded in UTF-8, with LF as the line +break (CR and FF are illegal just because). +.PP +In alternations, the first (leftmost) match is selected. +The parser is able to backtrack in case it later turns +out that it could not finish that branch. Whenever an +exception is reached, the parser will terminate there. +.PP +Repeated symbols may occur any number of times, +including zero. The compiler is able to backtrack if it +takes too much. +.PP +Concatenation has higher precedence than alternation, +groups +.RB (\(dq ( "\(dq, ..., \(dq" ) \(dq) +have no semantic meaning and are useful only to put an +alternation inside a concatenation without creating a +new rule for that. +.PP +In character ranges, the +.B _high +and +.B _low +values must be at least 0 and at most 255, and +.B _high +must be greater than +.BR _low . 
+.PP +Rules that begin with an underscore will not show up +for the application in the parse result, the rest of +the rules will appear in the tree-formatted result. +.PP +Left recursion is illegal (it will cause stack +overflow at runtime as the empty condition before the +recursion is always met). + +.SH SEE ALSO +.BR libparser-generate (1), +.BR libparser_parse_file (3) -- cgit v1.2.3-70-g09d2