aboutsummaryrefslogtreecommitdiffstats
path: root/libparser.7
blob: 406f6ca6eb457271b8f7986c9e935147385a1cf3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
.TH LIBPARSER 7 LIBPARSER
.SH NAME
libparser \- Context-free grammar parsing library

.SH DESCRIPTION
.B libparser
is a small C library that parses input based on a
precompiled context-free grammar.
.PP
To use
.BR libparser ,
a developer should write a syntax for the input that
his application shall parse, in a syntax based on
Extended Backus–Naur form (EBNF) (somewhat simplified
but also somewhat extended).
.BR libparser-generate (1)
is then used to create a C source file describing the
syntax, which shall be compiled into an object file
with a C compiler. This file provides a definition of
a global variable declared in
.BR <libparser.h> :
.IR libparser_rule_table .
This variable is used when calling
.BR libparser_parse_file (3)
to parse the application's input.
.PP
.B libparser
is proudly non-self-hosted.

.SH EXTENDED DESCRIPTION
.SS Syntax
The grammar for
.BR libparser-generate (1)'s
input can be described in its own grammar:
.PP
.RS
.nf
(* CHARACTER CLASSES *)

_space           = \(dq \(dq | \(dq\en\(dq | \(dq\et\(dq;
_alpha           = <\(dqa\(dq, \(dqz\(dq> | <\(dqA\(dq, \(dqZ\(dq>;
_octal           = <\(dq0\(dq, \(dq7\(dq>;
_digit           = <\(dq0\(dq, \(dq9\(dq>;
_xdigit          = _digit | <\(dqa\(dq, \(dqf\(dq> | <\(dqA\(dq, \(dqF\(dq>;
_nonascii        = <128, 255>;


(* WHITESPACE/COMMENTS, THE GRAMMAR IS FREE-FORM *)

_comment_char    = _space | !\(dq*\(dq, !\(dq\e\(dq\(dq, <\(dq!\(dq, 0xFF>;
_comment_tail    = [_comment_char], [_string], (\(dq*)\(dq | _comment_tail | -);
_comment         = \(dq(*\(dq, _comment_tail;

_                = {_space | _comment};


(* IDENTIFIERS *)

_identifier_head = _alpha | _digit | _nonascii | \(dq_\(dq;
_identifier_tail = _identifier_head | \(dq-\(dq;

identifier       = _identifier_head, {_identifier_tail};


(* STRINGS *)

_escape_simple   = \(dq\e\e\(dq | \(dq\e\(dq\(dq | \(dq'\(dq | \(dqa\(dq | \(dqb\(dq | \(dqf\(dq | \(dqn\(dq | \(dqr\(dq | \(dqv\(dq;
_escape_hex      = (\(dqx\(dq | \(dqX\(dq), _xdigit, _xdigit;
_escape_octal    = _octal, {_octal}; (* May not exceed 255 in base 10 *)
_escape          = _escape_simple | _escape_hex | _escape_octal | -;
_character       = \(dq\e\e\(dq, _escape | !\(dq\e\(dq\(dq, <1, 0xFF>;
_string          = \(dq\e\(dq\(dq, _character, {_character}, (\(dq\e\(dq\(dq | -);

string           = _string;
character        = \(dq\e\(dq\(dq, _character, (\(dq\e\(dq\(dq | -);


(* INTEGERS *)

_decimal         = _digit, {_digit};
_hexadecimal     = \(dq0\(dq, (\(dqx\(dq | \(dqX\(dq), _xdigit, {_xdigit};

integer          = _decimal | _hexadecimal; (* May not exceed 255. *)


(* GROUPINGS *)

_low             = character | integer;
_high            = character | integer;

rejection        = \(dq!\(dq, _, _operand;
concatenation    = _operand, {_, \(dq,\(dq, _, _operand};
alternation      = concatenation, {_, \(dq|\(dq, _, concatenation};
optional         = \(dq[\(dq, _, _expression, _, \(dq]\(dq;
repeated         = \(dq{\(dq, _, _expression, _, \(dq}\(dq;
group            = \(dq(\(dq, _, _expression, _, \(dq)\(dq;
char-range       = \(dq<\(dq, _, _low, _, \(dq,\(dq, _, _high, _, \(dq>\(dq;
exception        = \(dq-\(dq;
embedded-rule    = identifier;

_literal         = char-range | exception | string;
_group           = optional | repeated | group | embedded-rule;
_operand         = _group | _literal | rejection;

_expression      = alternation;


(* RULES *)

rule             = identifier, _, \(dq=\(dq, _, _expression, _, \(dq;\(dq;

(* This is the root rule of the grammar. *)
grammar          = _, {rule, _};
.fi
.PP
.RE
The file must be encoded in UTF-8, with LF as the line
break (CR and FF are illegal just because).
.PP
In alternations, the first (leftmost) match is selected.
The parser is able to backtrack in case it later turns
out that it could not finish that branch. Whenever an
exception is reached, the parser will terminate there.
.PP
Repeated symbols may occur any number of times,
including zero. The parser is able to backtrack if it
takes too much.
.PP
Concatenation has higher precedence than alternation.
Groups
.RB (\(dq ( "\(dq, ..., \(dq" ) \(dq)
have no semantic meaning and are useful only to put a
an alternation inside a concatenation without creating a
new rule for that.
.PP
In character ranges, the
.B _high
and
.B _low
values must be at least 0 and at most 255, and
.B _high
must be greater than
.BR _low .
.PP
Rules that begin with an underscore will not show up
for the application in the parse result; the rest of
the rules will appear in the tree-formatted result.
.PP
Left recursion is illegal (it will cause stack
overflow at runtime as the empty condition before the
recursion is always met).

.SH SEE ALSO
.BR libparser-generate (1),
.BR libparser_parse_file (3)