/* See LICENSE file for copyright and license details. */
#include <ctype.h>
#include <errno.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
static const char *argv0 = "libparser-generate";
static void
usage(void)
{
fprintf(stderr, "usage: %s main-rule\n", argv0);
exit(1);
}
#define eprintf(...) (fprintf(stderr, __VA_ARGS__), exit(1))
struct token {
size_t lineno;
size_t column;
size_t character;
char s[];
};
struct node {
struct token *token;
struct node *parent;
struct node *next;
struct node *data;
struct node **head;
};
static char **rule_names = NULL;
static size_t nrule_names = 0;
static size_t rule_names_size = 0;
static char **want_rules = NULL;
static size_t nwant_rules = 0;
static size_t want_rules_size = 0;
static void *
emalloc(size_t n)
{
void *ret = malloc(n);
if (!ret)
eprintf("%s: malloc %zu: %s\n", argv0, n, strerror(errno));
return ret;
}
static void *
ecalloc(size_t n, size_t m)
{
void *ret = calloc(n, m);
if (!ret)
eprintf("%s: calloc %zu %zu: %s\n", argv0, n, m, strerror(errno));
return ret;
}
static void *
erealloc(void *ptr, size_t n)
{
void *ret = realloc(ptr, n);
if (!ret)
eprintf("%s: realloc %p %zu: %s\n", argv0, ptr, n, strerror(errno));
return ret;
}
static void *
ereallocarray(void *ptr, size_t n, size_t m)
{
void *ret;
if (n && m > SIZE_MAX / n)
eprintf("%s: realloc %p %zu*%zu: %s\n", argv0, ptr, n, m, strerror(EOVERFLOW));
ret = realloc(ptr, n * n);
if (!ret)
eprintf("%s: realloc %p %zu*%zu: %s\n", argv0, ptr, n, m, strerror(errno));
return ret;
}
static char *
estrdup(char *s)
{
size_t n = strlen(s) + 1;
char *ret = emalloc(n);
memcpy(ret, s, n);
return ret;
}
static int
strpcmp(const void *av, const void *bv)
{
const char *const *a = av;
const char *const *b = bv;
return strcmp(*a, *b);
}
static int
isidentifier(char c)
{
return isalpha(c) || isdigit(c) || !isascii(c) || c == '_';
}
static int
check_utf8(char *buf, size_t *ip, size_t len)
{
size_t req, i;
uint32_t cp;
if ((buf[*ip] & 0xE0) == 0xC0) {
cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xC0);
req = 2;
} else if ((buf[*ip] & 0xF0) == 0xE0) {
cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xE0);
req = 3;
} else if ((buf[*ip] & 0xF8) == 0xF0) {
cp = (uint32_t)(unsigned char)(buf[*ip] ^ 0xF0);
req = 4;
} else {
return 0;
}
if (req > len - *ip)
return 0;
for (i = 1; i < req; i++) {
cp <<= 6;
if ((buf[*ip + i] & 0xC0) != 0x80)
return 0;
cp |= (uint32_t)(unsigned char)(buf[*ip + i] ^ 0x80);
}
*ip += req;
if ((cp & UINT32_C(0xFFF8000)) == UINT32_C(0xD8000))
return 0;
if (cp < (uint32_t)1 << (7 + 0 * 6))
return 0;
if (cp < (uint32_t)1 << (5 + 1 * 6))
return req == 2;
if (cp < (uint32_t)1 << (4 + 2 * 6))
return req == 3;
if (cp <= UINT32_C(0x10FFFF))
return req == 4;
return 0;
}
static char *
readall_and_validate(int fd, const char *fname)
{
size_t lineno = 1, column = 0, character = 0;
size_t size = 0, len = 0, i;
char *buf = NULL;
ssize_t r;
for (;; len += (size_t)r) {
if (len == size)
buf = erealloc(buf, size += 1024);
r = read(fd, &buf[len], size - len);
if (r <= 0) {
if (!r)
break;
eprintf("%s: read %s: %s\n", argv0, fname, strerror(errno));
}
}
for (i = 0; i < len; i++) {
if (buf[i] == '\n') {
lineno += 1;
column = 0;
character = 0;
} else if (buf[i] == '\t') {
column += 8 - column % 8;
character += 1;
} else if (buf[i] == '\r') {
eprintf("%s: %s contains a CR character on line %zu at column %zu (character %zu)\n",
argv0, fname, lineno, column, character);
} else if ((0 < buf[i] && buf[i] < ' ') || buf[i] == 0x7F) {
eprintf("%s: %s contains a illegal character on line %zu at column %zu (character %zu)\n",
argv0, fname, lineno, column, character);
} else if (buf[i] == '\0') {
eprintf("%s: %s contains a NUL byte on line %zu at column %zu (character %zu)\n",
argv0, fname, lineno, column, character);
} else if (!(buf[i] & 0x80)) {
character += 1;
column += 1;
} else if ((buf[i] & 0xC0) == 0x80) {
eprintf("%s: %s contains a illegal byte on line %zu at column %zu (character %zu)\n",
argv0, fname, lineno, column, character);
} else {
if (!check_utf8(buf, &i, len)) {
eprintf("%s: %s contains a illegal byte sequence on line %zu at column %zu (character %zu)\n",
argv0, fname, lineno, column, character);
}
i--;
character += 1;
column += 1;
}
}
buf = erealloc(buf, len + 1);
buf[len] = '\0';
return buf;
}
static struct token **
tokenise(const char *data)
{
enum {
NEW_TOKEN,
IDENTIFIER,
STRING,
STRING_ESC,
SPACE
} state = NEW_TOKEN;
size_t lineno = 1, column = 0, character = 0;
size_t token_lineno = 0, token_column = 0, token_character = 0;
struct token **tokens = NULL;
char *token = NULL;
size_t i, ntokens = 0, tokens_size = 0;
size_t token_len = 0, token_size = 0;
for (i = 0; data[i]; i++) {
again:
switch (state) {
case NEW_TOKEN:
token_lineno = lineno;
token_column = column;
token_character = character;
if (token_len == token_size)
token = erealloc(token, token_size += 16);
token[token_len++] = data[i];
if (isidentifier(data[i])) {
state = IDENTIFIER;
} else if (isspace(data[i])) {
state = SPACE;
} else if (data[i] == '"') {
state = STRING;
if (data[i + 1] == '"') {
eprintf("%s: empty string token on line %zu at column %zu (character %zu)\n",
argv0, lineno, column, character);
}
} else {
add_token:
if (token_len == token_size)
token = erealloc(token, token_size += 16);
token[token_len++] = '\0';
if (ntokens == tokens_size)
tokens = ereallocarray(tokens, tokens_size += 16, sizeof(*tokens));
tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len);
tokens[ntokens]->lineno = token_lineno;
tokens[ntokens]->column = token_column;
tokens[ntokens]->character = token_character;
stpcpy(tokens[ntokens++]->s, token);
token_len = 0;
state = NEW_TOKEN;
}
break;
case IDENTIFIER:
if (isidentifier(data[i]) || data[i] == '-') {
add_char:
if (token_len == token_size)
token = erealloc(token, token_size += 16);
token[token_len++] = data[i];
} else {
add_token_and_do_again:
if (token_len == token_size)
token = erealloc(token, token_size += 16);
token[token_len++] = '\0';
if (ntokens == tokens_size)
tokens = ereallocarray(tokens, tokens_size += 16, sizeof(*tokens));
tokens[ntokens] = emalloc(offsetof(struct token, s) + token_len);
tokens[ntokens]->lineno = token_lineno;
tokens[ntokens]->column = token_column;
tokens[ntokens]->character = token_character;
stpcpy(tokens[ntokens++]->s, token);
token_len = 0;
state = NEW_TOKEN;
goto again;
}
break;
case STRING:
if (data[i] == '\n' || data[i] == '\t') {
eprintf("%s: illegal whitespace on line %zu at column %zu (character %zu)\n",
argv0, lineno, column, character);
} else if (data[i] == '"') {
goto add_token;
} else if (data[i] == '\\') {
state = STRING_ESC;
goto add_char;
} else {
goto add_char;
}
break;
case STRING_ESC:
if (data[i] == '\n' || data[i] == '\t') {
eprintf("%s: illegal whitespace on line %zu at column %zu (character %zu)\n",
argv0, lineno, column, character);
}
if (token_len == token_size)
token = erealloc(token, token_size += 16);
token[token_len++] = data[i];
state = STRING;
break;
case SPACE:
if (isspace(data[i])) {
goto add_char;
} else {
goto add_token_and_do_again;
}
break;
default:
abort();
};
if (data[i] == '\n') {
lineno += 1;
column = 0;
character = 0;
} else if (data[i] == '\t') {
column += 8 - column % 8;
character += 1;
} else {
character += (data[i] & 0xC0) != 0x80;
column += 1;
}
}
if (state != NEW_TOKEN && state != SPACE)
eprintf("%s: premature end of file\n", argv0);
tokens = ereallocarray(tokens, ntokens + 1, sizeof(*tokens));
tokens[ntokens] = NULL;
free(token);
return tokens;
}
static void
emit_and_free_sentence(struct node *node, size_t *indexp)
{
size_t index = (*indexp)++, left, right;
struct node *next;
for (; node->token->s[0] == '('; node = next) {
next = node->data;
free(node->token);
free(node);
}
if (node->token->s[0] == '[' || node->token->s[0] == '{') {
emit_and_free_sentence(node->data, indexp);
printf("static union libparser_sentence sentence_%zu_%zu = {.unary = {"
".type = LIBPARSER_SENTENCE_TYPE_%s, .sentence = &sentence_%zu_%zu"
"}};\n",
nrule_names, index, node->token->s[0] == '[' ? "OPTIONAL" : "REPEATED", nrule_names, index + 1);
} else if (node->token->s[0] == '|' || node->token->s[0] == ',') {
right = *indexp;
emit_and_free_sentence(node->data->next, indexp);
left = *indexp;
emit_and_free_sentence(node->data, indexp);
printf("static union libparser_sentence sentence_%zu_%zu = {.binary = {"
".type = LIBPARSER_SENTENCE_TYPE_%s, "
".left = &sentence_%zu_%zu, .right = &sentence_%zu_%zu"
"}};\n",
nrule_names, index, node->token->s[0] == '|' ? "ALTERNATION" : "CONCATENATION",
nrule_names, left, nrule_names, right);
} else if (node->token->s[0] == '"') {
printf("static union libparser_sentence sentence_%zu_%zu = {.string = {"
".type = LIBPARSER_SENTENCE_TYPE_STRING, "
".string = %s\", .length = sizeof(%s\") - 1"
"}};\n",
nrule_names, index, node->token->s, node->token->s);
} else if (node->token->s[0] == '-') {
printf("static union libparser_sentence sentence_%zu_%zu = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n",
nrule_names, index);
} else {
if (nwant_rules == want_rules_size)
want_rules = ereallocarray(want_rules, want_rules_size += 16, sizeof(*want_rules));
want_rules[nwant_rules++] = estrdup(node->token->s);
printf("static union libparser_sentence sentence_%zu_%zu = {.rule = {"
".type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\""
"}};\n",
nrule_names, index, node->token->s);
}
free(node->token);
free(node);
}
static struct node *
order_sentences(struct node *node)
{
struct node *tail = NULL, **head = &tail;
struct node *stack = NULL;
struct node *next, *prev;
for (; node; node = next) {
next = node->next;
if (node->token->s[0] == '(' || node->token->s[0] == '[' || node->token->s[0] == '{') {
node->data = order_sentences(node->data);
*head = node;
head = &node->next;
} else if (node->token->s[0] == '|' || node->token->s[0] == ',') {
again_operators:
if (!stack) {
node->next = stack;
stack = node;
} else if (node->token->s[0] == ',' && stack->token->s[0] == '|') {
node->next = stack;
stack = node;
} else if (node->token->s[0] == stack->token->s[0]) {
*head = stack;
head = &stack->next;
stack = stack->next;
node->next = stack;
stack = node;
} else {
*head = stack;
head = &stack->next;
stack = stack->next;
goto again_operators;
}
} else {
*head = node;
head = &node->next;
}
}
for (; stack; stack = next) {
next = stack->next;
*head = stack;
head = &stack->next;
}
*head = NULL;
for (stack = tail, prev = NULL; stack; prev = stack, stack = next) {
next = stack->next;
stack->next = prev;
if (stack->token->s[0] == '|' || stack->token->s[0] == ',') {
prev = stack->next->next->next;
stack->data = stack->next->next;
stack->data->next = stack->next;
stack->next = prev;
}
}
return prev;
}
static void
emit_and_free_rule(struct node *rule)
{
size_t index = 0;
rule->data = order_sentences(rule->data);
emit_and_free_sentence(rule->data, &index);
printf("static struct libparser_rule rule_%zu = {\"%s\", &sentence_%zu_0};\n", nrule_names, rule->token->s, nrule_names);
if (nrule_names == rule_names_size)
rule_names = ereallocarray(rule_names, rule_names_size += 16, sizeof(*rule_names));
rule_names[nrule_names++] = estrdup(rule->token->s);
free(rule->token);
free(rule);
}
int
main(int argc, char *argv[])
{
enum {
IDENTIFIER,
STRING,
SYMBOL,
} type;
enum {
NEW_RULE,
EXPECT_EQUALS,
EXPECT_OPERAND,
EXPECT_OPERATOR
} state = NEW_RULE;
struct node *stack = NULL, *parent_node, *node;
char *data;
struct token **tokens;
size_t i, j;
int cmp, err;
if (argc) {
argv0 = *argv++;
argc--;
}
if (argc && argv[0][0] == '-') {
if (argv[0][1] != '-' || argv[0][2])
usage();
argv++;
argc--;
}
if (argc != 1 || !isidentifier(argv[0][0]))
usage();
for (i = 0; argv[0][i]; i++)
if (!isidentifier(argv[0][i]) && argv[0][i] != '-')
usage();
data = readall_and_validate(STDIN_FILENO, "<stdin>");
tokens = tokenise(data);
free(data);
printf("#include <libparser.h>\n");
i = 0;
again:
for (; tokens[i]; i++) {
if (tokens[i + 1] && tokens[i]->s[0] == '(' && tokens[i + 1]->s[0] == '*') {
for (i += 2; tokens[i] && tokens[i + 1]; i++) {
if (tokens[i]->s[0] == '*' && tokens[i + 1]->s[0] == ')') {
i += 2;
goto again;
}
}
eprintf("%s: premature end of file\n", argv0);
}
if (tokens[i]->s[0] == '"') {
type = STRING;
} else if (isidentifier(tokens[i]->s[0])) {
type = IDENTIFIER;
} else if (isspace(tokens[i]->s[0])) {
free(tokens[i]);
continue;
} else {
type = SYMBOL;
}
switch (state) {
case NEW_RULE:
if (type != IDENTIFIER) {
eprintf("%s: expected an identifier on line %zu at column %zu (character %zu)\n",
argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
}
stack = calloc(1, sizeof(*stack));
stack->token = tokens[i];
stack->head = &stack->data;
state = EXPECT_EQUALS;
for (j = 0; j < nrule_names; j++) {
if (!strcmp(rule_names[j], tokens[i]->s)) {
eprintf("%s: duplicate definition of \"%s\" on line %zu at column %zu (character %zu)\n",
argv0, tokens[i]->s, tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
}
}
break;
case EXPECT_EQUALS:
if (type != SYMBOL || tokens[i]->s[0] != '=') {
eprintf("%s: expected an '=' on line %zu at column %zu (character %zu)\n",
argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
}
free(tokens[i]);
state = EXPECT_OPERAND;
break;
case EXPECT_OPERAND:
if (type == SYMBOL) {
if (tokens[i]->s[0] == '(' || tokens[i]->s[0] == '[' || tokens[i]->s[0] == '{') {
parent_node = stack;
stack = ecalloc(1, sizeof(*stack));
stack->parent = parent_node;
stack->token = tokens[i];
stack->head = &stack->data;
} else if (tokens[i]->s[0] == '-') {
goto add;
} else {
stray:
eprintf("%s: stray '%c' on line %zu at column %zu (character %zu)\n",
argv0, tokens[i]->s[0], tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
}
} else {
add:
state = EXPECT_OPERATOR;
goto add_singleton;
}
break;
case EXPECT_OPERATOR:
if (tokens[i]->s[0] == '|' || tokens[i]->s[0] == ',') {
state = EXPECT_OPERAND;
add_singleton:
node = calloc(1, sizeof(*node));
node->token = tokens[i];
*stack->head = node;
stack->head = &node->next;
} else if (tokens[i]->s[0] == ')') {
if (stack->token->s[0] != '(')
goto stray;
goto pop;
} else if (tokens[i]->s[0] == ']') {
if (stack->token->s[0] != '[')
goto stray;
goto pop;
} else if (tokens[i]->s[0] == '}') {
if (stack->token->s[0] != '{')
goto stray;
pop:
free(tokens[i]);
*stack->parent->head = stack;
stack->parent->head = &stack->next;
stack = stack->parent;
} else if (tokens[i]->s[0] == ';') {
if (stack->token->s[0] == ')' || stack->token->s[0] == ']' || stack->token->s[0] == '}')
eprintf("%s: premature end of rule on line %zu at column %zu (character %zu): "
"'%s' on line %zu at column %zu (character %zu) not closed\n",
argv0, tokens[i]->lineno, tokens[i]->column, tokens[i]->character, stack->token->s,
stack->token->lineno, stack->token->column, stack->token->character);
emit_and_free_rule(stack);
free(tokens[i]);
state = NEW_RULE;
} else {
eprintf("%s: expected a '|', ',', or '%c' on line %zu at column %zu (character %zu)\n",
argv0,
stack->token->s[0] == '(' ? ')' :
stack->token->s[0] == '[' ? ']' :
stack->token->s[0] == '{' ? '}' : ';',
tokens[i]->lineno, tokens[i]->column, tokens[i]->character);
}
break;
default:
abort();
}
}
free(tokens);
if (state != NEW_RULE)
eprintf("%s: premature end of file\n", argv0);
err = 0;
qsort(rule_names, nrule_names, sizeof(*rule_names), strpcmp);
qsort(want_rules, nwant_rules, sizeof(*want_rules), strpcmp);
for (i = j = 0; i < nrule_names && j < nwant_rules;) {
cmp = strcmp(rule_names[i], want_rules[j]);
if (!cmp) {
i++;
for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++);
} else if (!strcmp(rule_names[i], argv[0])) {
i++;
} else if (cmp < 0) {
eprintf("%s: rule \"%s\" defined but not used\n", argv0, rule_names[i]);
i++;
err = 1;
} else {
eprintf("%s: rule \"%s\" used but not defined\n", argv0, want_rules[j]);
for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++);
err = 1;
}
}
for (; i < nrule_names; i++) {
if (strcmp(rule_names[i], argv[0])) {
eprintf("%s: rule \"%s\" defined but not used\n", argv0, rule_names[i]);
err = 1;
}
}
while (j < nwant_rules) {
eprintf("%s: rule \"%s\" used but not defined\n", argv0, want_rules[j]);
for (j++; j < nwant_rules && !strcmp(want_rules[j - 1], want_rules[j]); j++);
err = 1;
}
if (err)
exit(1);
for (i = 0; i < nrule_names; i++)
if (!strcmp(rule_names[i], argv[0]))
goto found_main;
eprintf("%s: specified main rule (\"%s\") was not defined\n", argv0, argv[0]);
found_main:
printf("static union libparser_sentence noeof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EXCEPTION};\n");
printf("static struct libparser_rule noeof_rule = {\"@noeof\", &noeof_sentence};\n");
printf("static union libparser_sentence noeof_rule_sentence = {.rule = "
"{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@noeof\"}"
"};\n");
printf("static union libparser_sentence eof_sentence = {.type = LIBPARSER_SENTENCE_TYPE_EOF};\n");
printf("static struct libparser_rule eof_rule = {\"@eof\", &eof_sentence};\n");
printf("static union libparser_sentence eof_rule_sentence = {.rule = "
"{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"@eof\"}"
"};\n");
printf("static union libparser_sentence end_sentence = {.binary = {"
".type = LIBPARSER_SENTENCE_TYPE_ALTERNATION, "
".left = &eof_rule_sentence, .right = &noeof_rule_sentence"
"}};\n");
printf("static union libparser_sentence main_rule_sentence = {.rule = "
"{.type = LIBPARSER_SENTENCE_TYPE_RULE, .rule = \"%s\"}"
"};\n", argv[0]);
printf("static union libparser_sentence main_sentence = {.binary = {"
".type = LIBPARSER_SENTENCE_TYPE_CONCATENATION, "
".left = &main_rule_sentence, .right = &end_sentence"
"}};\n");
printf("static struct libparser_rule main_rule = {\"@start\", &main_sentence};\n");
printf("const struct libparser_rule *const libparser_rule_table[] = {\n");
for (i = 0; i < nrule_names; i++) {
printf("\t&rule_%zu,\n", i);
free(rule_names[i]);
}
printf("\t&eof_rule,\n");
printf("\t&noeof_rule,\n");
printf("\t&main_rule,\n");
printf("\tNULL\n};\n");
free(rule_names);
for (i = 0; i < nwant_rules; i++)
free(want_rules[i]);
free(want_rules);
if (ferror(stdout) || fflush(stdout) || fclose(stdout))
eprintf("%s: printf: %s\n", argv0, strerror(errno));
return 0;
}