1 files changed, 1157 insertions, 136 deletions
diff --git a/libparser.c b/libparser.c
index 545aaee..56ce703 100644
--- a/libparser.c
+++ b/libparser.c
@@ -1,28 +1,480 @@
 /* See LICENSE file for copyright and license details. */
 #include "libparser.h"
+#include <errno.h>
+#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
 
-#define IS_HIDDEN(RULE) (!(RULE) || (RULE)[0] == '_')
+#define PRINT_ACTIONS 0
+
+#if PRINT_ACTIONS
+# include <ctype.h>
+# include <stdio.h>
+#endif
+
+
+#define IS_ANONYMOUS(RULE) (!(RULE) || (RULE)[0] == '_')
 /* NULL is used when evaluating a subsentence,
  * and rule with the prefix "_" are embedded
  * as subsentences */
 
+#define IS_COMPLETION(FOLLOWUP) (!!(FOLLOWUP).unit)
+
+#define EXHAUSTED SIZE_MAX
+
+
+/**
+ * Follow-up stack node
+ * 
+ * `.unit` decides whether it is a completion
+ * marker or a subsequent sentence; it is a
+ * completion marker if and only iff `.unit`
+ * is non-`NULL`.
+ */
+struct followup {
+	/**
+	 * The unit being completed; if `NULL`
+	 * it is a subsequent sentence rather
+	 * than a completion marker and,
+	 * `.sentence` and `.rule` should be used
+	 * instead
+	 */
+	struct libparser_unit *unit;
+
+	/**
+	 * Set if and only if `.unit` is non-`NULL`:
+	 * the
+	 * index of the node
+	 */
+	size_t unit_index;
+
+	/* -- or -- */
+
+	/**
+	 * Set if and only if `.unit` is `NULL`:
+	 * the `rule` argument `add_unit` should
+	 * be called with
+	 */
+	const char *rule;
+
+	/**
+	 * Set if and only if `.unit` is `NULL`:
+	 * the `sentence` argument `add_unit`
+	 * should be called with
+	 */
+	const union libparser_sentence *sentence;
+};
+
+
+/**
+ * Branching choice
+ */
+struct choice {
+	/**
+	 * The current choice value, `EXHAUSTED`
+	 * when the choices are exhausted and the
+	 * sentence cannot be matched
+	 */
+	size_t choice;
+
+	/**
+	 * The position in the input text
+	 */
+	size_t position;
+
+	/**
+	 * The `rule` argument `add_unit` was
+	 * called with
+	 */
+	const char *rule;
+
+	/**
+	 * The `sentence` argument `add_unit`
+	 * was called with
+	 */
+	const union libparser_sentence *sentence;
 
+	/**
+	 * The value `.units.count` of the `ctx`
+	 * argument `add_unit` was called with
+	 */
+	size_t unit_index;
+
+	/**
+	 * Save state of the follow-up stack
+	 */
+	struct {
+		/**
+		 * The follow-ups
+		 */
+		struct followup *followups;
+
+		/**
+		 * The number of follow-ups; this is
+		 * also the number of allocated elements
+		 */
+		size_t count;
+	} followups;
+};
+
+
+/**
+ * Parsing state and context
+ */
 struct context {
-	const struct libparser_rule *const *rules; /* rule table */
-	struct libparser_unit *cache; /* memory allocation cache */
-	const char *data; /* text being parsed */
-	size_t length;    /* length of text */
-	size_t position;  /* current position in the text */
-	char done;        /* end reached or .exception or .error set */
-	char exception;   /* exception-statement reached */
-	char error;       /* error encountered */
+	/**
+	 * The rule table
+	 */
+	const struct libparser_rule *const *rules;
+
+	/**
+	 * Memory pool for parsing units
+	 */
+	struct libparser_unit *unit_pool;
+
+	/**
+	 * Current choices
+	 */
+	struct {
+		/**
+		 * The branching points and the choices made there
+		 */
+		struct choice *choices;
+
+		/**
+		 * Number of allocated choices
+		 */
+		size_t size;
+
+		/**
+		 * Number of made choices (it is be temporarily
+		 * one more then .next when amending a choice)
+		 */
+		size_t count;
+
+		/**
+		 * The index of the next choice
+		 */
+		size_t next;
+	} choices;
+
+	/**
+	 * Follow-up stack; it is popped any time
+	 * a sentence (including literals and
+	 * subsentences) has ended
+	 */
+	struct {
+		/**
+		 * The follow-ups
+		 */
+		struct followup *followups;
+
+		/**
+		 * The number of allocated slots
+		 */
+		size_t size;
+
+		/**
+		 * The number of used slots
+		 */
+		size_t count;
+	} followups;
+
+	/**
+	 * List of units leading up to the end
+	 * of the input
+	 */
+	struct {
+		/**
+		 * The units
+		 */
+		struct libparser_unit **units;
+
+		/**
+		 * The number of allocated slots
+		 */
+		size_t size;
+
+		/**
+		 * The number of used slots
+		 */
+		size_t count;
+	} units;
+
+	/**
+	 * The text being parsed
+	 */
+	const char *data;
+
+	/**
+	 * The length of the text
+	 */
+	size_t length;
+
+	/**
+	 * The current position in the text
+	 */
+	size_t position;
+
+	/**
+	 * Set when an exception-statement reached is encountered;
+	 * cleared when rejected the exception is rejected
+	 */
+	char exception;
 };
 
 
 /**
- * Recursively place move a unit into
+ * Epsilon rule, used when en OPTIONAL is not take
+ */
+static union libparser_sentence epsilon = {.type = LIBPARSER_SENTENCE_TYPE_EPSILON};
+
+
+#if PRINT_ACTIONS
+
+
+static const char *
+sentence_type_string(enum libparser_sentence_type type)
+{
+	switch (type) {
+	case LIBPARSER_SENTENCE_TYPE_CONCATENATION: return "CONCATENATION";
+	case LIBPARSER_SENTENCE_TYPE_ALTERNATION: return "ALTERNATION";
+	case LIBPARSER_SENTENCE_TYPE_REJECTION: return "REJECTION";
+	case LIBPARSER_SENTENCE_TYPE_OPTIONAL: return "OPTIONAL";
+	case LIBPARSER_SENTENCE_TYPE_REPEATED: return "REPEATED";
+	case LIBPARSER_SENTENCE_TYPE_STRING: return "STRING";
+	case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE: return "CHAR_RANGE";
+	case LIBPARSER_SENTENCE_TYPE_RULE: return "RULE";
+	case LIBPARSER_SENTENCE_TYPE_EXCEPTION: return "EXCEPTION";
+	case LIBPARSER_SENTENCE_TYPE_EOF: return "EOF";
+	case LIBPARSER_SENTENCE_TYPE_EPSILON: return "EPSILON";
+	default: return "<invalid>";
+	}
+}
+
+
+static void
+print_tree(struct libparser_unit *unit, int indent)
+{
+	for (; unit; unit = unit->next) {
+		fprintf(stderr, "%*.s", indent, "");
+		fprintf(stderr, "%s: %s%s%s [%zu, %zu) {%p}\n",
+		        unit->rule, sentence_type_string(unit->sentence->type),
+		        unit->sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "",
+		        unit->sentence->type == LIBPARSER_SENTENCE_TYPE_RULE
+		            ? unit->sentence->rule.rule : "",
+		        unit->start, unit->end, (void *)unit);
+		print_tree(unit->in, indent + 2);
+	}
+}
+
+
+static void
+print_state(struct context *ctx)
+{
+	struct libparser_unit *unit;
+	const union libparser_sentence *sentence;
+	size_t i;
+
+	fprintf(stderr, "Position in text: %zu of %zu\n", ctx->position, ctx->length);
+	fprintf(stderr, "Exception state: %i\n", (int)ctx->exception);
+
+	fprintf(stderr, "Have %zu units:\n", ctx->units.count);
+	for (i = 0; i < ctx->units.count; i++) {
+		if (ctx->units.units[i]->next)
+			fprintf(stderr, "  (\n");
+		print_tree(ctx->units.units[i], 4);
+		if (ctx->units.units[i]->next)
+			fprintf(stderr, "  )\n");
+	}
+
+	fprintf(stderr, "Have %zu choices:\n", ctx->choices.count);
+	for (i = 0; i < ctx->choices.count; i++) {
+		fprintf(stderr, "    For unit %zu with %zu followups at %zu: %s: %s, ",
+		        ctx->choices.choices[i].unit_index,
+		        ctx->choices.choices[i].followups.count,
+		        ctx->choices.choices[i].position,
+		        sentence_type_string(ctx->choices.choices[i].sentence->type),
+		        ctx->choices.choices[i].rule);
+		if (ctx->choices.choices[i].choice == EXHAUSTED) {
+			fprintf(stderr, "choices EXHAUSTED\n");
+		} else {
+			fprintf(stderr, "using choice %zu\n", ctx->choices.choices[i].choice);
+		}
+	}
+
+	fprintf(stderr, "Have %zu follow-ups pending (outermost at top):\n",
+	        ctx->followups.count);
+	for (i = 0; i < ctx->followups.count; i++) {
+		if (IS_COMPLETION(ctx->followups.followups[i])) {
+			unit = ctx->followups.followups[i].unit;
+			sentence = unit->sentence;
+			fprintf(stderr, "    Completion: unit %zu: %s: %s%s%s [%zu, %zu)\n",
+			        ctx->followups.followups[i].unit_index,
+			        unit->rule, sentence_type_string(sentence->type),
+			        sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "",
+			        sentence->type == LIBPARSER_SENTENCE_TYPE_RULE
+			            ? sentence->rule.rule : "",
+			        unit->start, unit->end);
+		} else {
+			sentence = ctx->followups.followups[i].sentence;
+			fprintf(stderr, "    (%s, %s%s%s)\n",
+			        ctx->followups.followups[i].rule,
+			        sentence_type_string(sentence->type),
+			        sentence->type == LIBPARSER_SENTENCE_TYPE_RULE ? " " : "",
+			        sentence->type == LIBPARSER_SENTENCE_TYPE_RULE
+			            ? sentence->rule.rule : "");
+		}
+	}
+}
+
+
+static int
+print_sentence(const union libparser_sentence *sentence, int indent)
+{
+	unsigned char low, high;
+	int len;
+
+	switch (sentence->type) {
+	case LIBPARSER_SENTENCE_TYPE_CONCATENATION:
+		fprintf(stderr, "(");
+		indent = print_sentence(sentence->binary.left, indent + 1);
+		fprintf(stderr, ", ");
+		indent = print_sentence(sentence->binary.right, indent + 2);
+		fprintf(stderr, ")");
+		indent += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
+		fprintf(stderr, "(");
+		print_sentence(sentence->binary.left, indent + 1);
+		fprintf(stderr, " | \n%*.s", indent + 1, "");
+		indent = print_sentence(sentence->binary.right, indent + 1);
+		fprintf(stderr, ")");
+		indent += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_REJECTION:
+		fprintf(stderr, "!(");
+		indent = print_sentence(sentence->unary.sentence, indent + 2);
+		fprintf(stderr, ")");
+		indent += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
+		fprintf(stderr, "[");
+		indent = print_sentence(sentence->unary.sentence, indent + 1);
+		fprintf(stderr, "]");
+		indent += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_REPEATED:
+		fprintf(stderr, "{");
+		indent = print_sentence(sentence->unary.sentence, indent + 1);
+		fprintf(stderr, "}");
+		indent += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_STRING:
+		fprintf(stderr, "\"%.*s\"%n", (int)sentence->string.length, sentence->string.string, &len);
+		indent += len;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE:
+		low = sentence->char_range.low;
+		high = sentence->char_range.high;
+		if (isprint(low) && isprint(high))
+			fprintf(stderr, "<\"%c\", \"%c\">%n", low, high, &len);
+		else if (isprint(sentence->char_range.low))
+			fprintf(stderr, "<\"%c\", 0x%02x>%n", low, high, &len);
+		else if (isprint(sentence->char_range.high))
+			fprintf(stderr, "<0x%02x, \"%c\">%n", low, high, &len);
+		else
+			fprintf(stderr, "<0x%02x, 0x%02x>%n", low, high, &len);
+		indent += len;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_RULE:
+		fprintf(stderr, "%s%n", sentence->rule.rule, &len);
+		indent += len;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_EXCEPTION:
+		fprintf(stderr, "-");
+		indent += 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_EOF:
+		fprintf(stderr, "%s%n", "!<0x00, 0xFF>", &len);
+		indent += len;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_EPSILON:
+		fprintf(stderr, "@EPSILON");
+		indent += 8;
+		break;
+
+	default:
+		abort();
+	}
+
+	return indent;
+}
+
+
+static void
+print_grammar(const struct libparser_rule *const *rules)
+{
+	size_t i;
+	int indent;
+	fprintf(stderr, "Rules:\n");
+	for (i = 0; rules[i]; i++) {
+		fprintf(stderr, "    %s = %n", rules[i]->name, &indent);
+		print_sentence(rules[i]->sentence, indent);
+		fprintf(stderr, ";\n");
+	}
+}
+
+
+#endif
+
+
+/**
+ * Embed a rule's or subsentence's matches
+ * into where the rule or subsentence was
+ * matched
+ * 
+ * @param  where  The pointer to the match
+ * @param  last   Optional output paramter for the
+ *                last embedded, top-level unit
+ */
+static void
+embed_rule(struct libparser_unit **where, struct libparser_unit **last)
+{
+	struct libparser_unit *old = *where;
+
+#if PRINT_ACTIONS
+	fprintf(stderr, "Embedding %s: %s [%zu, %zu)\n", old->rule,
+	        sentence_type_string(old->sentence->type), old->start, old->end);
+#endif
+
+	if (old->next)
+		abort();
+	if (old->in == old)
+		abort();
+	*where = old->in;
+	free(old);
+	if (last) {
+		*last = *where;
+		if (*last)
+			while ((*last)->next)
+				*last = (*last)->next;
+	}
+}
+
+
+/**
+ * Recursively move a unit into
  * the memory allocation cache
  * 
  * @param  unit  Unit to place in the cache
@@ -36,8 +488,8 @@ free_unit(struct libparser_unit *unit, struct context *ctx)
 		free_unit(unit->in, ctx);
 		prev = unit;
 		unit = unit->next;
-		prev->next = ctx->cache;
-		ctx->cache = prev;
+		prev->next = ctx->unit_pool;
+		ctx->unit_pool = prev;
 	}
 }
 
@@ -64,9 +516,6 @@ dealloc_unit(struct libparser_unit *unit)
  * new unit, but first try to reuse
  * from the memory cache
  * 
- * On allocation failure, the the
- * error marked in the parsing context
- * 
  * @param   ctx  Parsing context
  * @return       Newly allocated (or polled from the
  *               cache) unit, `NULL` on failure
@@ -75,131 +524,341 @@ static struct libparser_unit *
 alloc_unit(struct context *ctx)
 {
 	struct libparser_unit *unit;
-	if (!ctx->cache) {
+	if (!ctx->unit_pool) {
 		unit = malloc(sizeof(*unit));
-		if (!unit) {
-			ctx->done = 1;
-			ctx->error = 1;
+		if (!unit)
 			return NULL;
-		}
 	} else {
-		unit = ctx->cache;
-		ctx->cache = unit->next;
+		unit = ctx->unit_pool;
+		ctx->unit_pool = unit->next;
 	}
 	return unit;
 }
 
 
 /**
- * Embed a rule's or subsentence's matches
- * into where the rule or subsentence was
- * matched
+ * Grow the unit list (if ncessary) and return a
+ * pointer the next empty slot
  * 
- * @param  where  The pointer to the match
- * @param  ctx    Parsing context
+ * @param   ctx  Parsing context
+ * @return       Pointer to the unit slot
  */
-static void
-embed_rule(struct libparser_unit **where, struct context *ctx)
+static struct libparser_unit **
+new_unit(struct context *ctx)
 {
-	/* remove matched unit */
-	(*where)->next = ctx->cache;
-	ctx->cache = *where;
-	/* insert interior where matched */
-	*where = (*where)->in;
+	void *new;
+	if (ctx->units.count == ctx->units.size) {
+		if (ctx->units.size > SIZE_MAX / sizeof(*ctx->units.units) - 64u) {
+		enomem:
+			errno = ENOMEM;
+			return NULL;
+		}
+		ctx->units.size += 64u;
+		new = realloc(ctx->units.units, ctx->units.size * sizeof(*ctx->units.units));
+		if (!new)
+			goto enomem;
+		ctx->units.units = new;
+	}
+#if PRINT_ACTIONS
+	fprintf(stderr, "New unit slot at %zu\n", ctx->units.count);
+#endif
+	return &ctx->units.units[ctx->units.count++];
 }
 
 
-static struct libparser_unit *
-try_match(const char *rule, const union libparser_sentence *sentence, struct context *ctx)
+/**
+ * Create a new follow-up node and push it
+ * into the follow-up stack
+ * 
+ * @param   ctx  Parsing context
+ * @return       The follow-up node, `NULL` on failure
+ */
+static struct followup *
+new_followup(struct context *ctx)
 {
-	struct libparser_unit *unit, *next;
-	struct libparser_unit **head;
-	unsigned char c;
-	size_t i;
+	void *new;
+	if (ctx->followups.count == ctx->followups.size) {
+		if (ctx->followups.size > SIZE_MAX / sizeof(*ctx->followups.followups) - 64u) {
+		enomem:
+			errno = ENOMEM;
+			return NULL;
+		}
+		ctx->followups.size += 64u;
+		new = realloc(ctx->followups.followups,
+		              ctx->followups.size * sizeof(*ctx->followups.followups));
+		if (!new)
+			goto enomem;
+		ctx->followups.followups = new;
+	}
+
+#if PRINT_ACTIONS
+	fprintf(stderr, "New follow-up slot at %zu\n", ctx->followups.count);
+#endif
+
+	return &ctx->followups.followups[ctx->followups.count++];
+}
+
+
+/**
+ * Push sentence into follow-up stack
+ * 
+ * @param   rule      The name of the rule, or `NULL` if it is an unnamed subsentence
+ * @param   sentence  The sentence to push
+ * @param   ctx       Parsing context
+ * @return            0 on success, -1 on failure
+ */
+static int
+push(const char *rule, const union libparser_sentence *sentence, struct context *ctx)
+{
+	struct followup *followup = new_followup(ctx);
+	if (!followup)
+		return -1;
+
+	followup->sentence = sentence;
+	followup->unit = NULL;
+	followup->rule = rule;
+
+#if PRINT_ACTIONS
+	fprintf(stderr, "Pushing follow-up sentence: %s: %s\n",
+	        sentence_type_string(sentence->type), rule);
+#endif
+
+	return 0;
+}
+
+
+/**
+ * Create a unit for a sentence and push it into
+ * the follow-up stack to mark the sentence as
+ * completed when the follow-up node is processed
+ * 
+ * @param   rule      The name of the rule, or `NULL` if it is an unnamed subsentence
+ * @param   sentence  The sentence to push
+ * @param   start     The position in the input text where the sentence begins
+ * @param   ctx       Parsing context
+ * @return            0 on success, -1 on failure
+ */
+static int
+incomplete(const char *rule, const union libparser_sentence *sentence, size_t start, struct context *ctx)
+{
+	struct followup *followup;
+	struct libparser_unit *unit;
+	struct libparser_unit **unit_slot;
 
 	unit = alloc_unit(ctx);
 	if (!unit)
-		return NULL;
-	unit->end = 0;
+		return -1;
 	unit->in = unit->next = NULL;
 	unit->rule = rule;
-	unit->start = ctx->position;
+	unit->start = start;
+	unit->end = start;
+	unit->sentence = sentence;
 
-	switch (sentence->type) {
-	case LIBPARSER_SENTENCE_TYPE_CONCATENATION:
-		unit->in = try_match(NULL, sentence->binary.left, ctx);
-		if (!unit->in)
-			goto mismatch;
-		if (ctx->done)
-			break;
-		unit->in->next = try_match(NULL, sentence->binary.right, ctx);
-		if (!unit->in->next) {
-			free_unit(unit->in, ctx);
-			goto mismatch;
-		}
-		if (IS_HIDDEN(unit->in->next->rule))
-			embed_rule(&unit->in->next, ctx);
-		if (IS_HIDDEN(unit->in->rule)) {
-			next = unit->in->next;
-			embed_rule(&unit->in, ctx);
-			/* rejoin with right-hand */
-			if (unit->in) {
-				for (head = &unit->in->next; *head; head = &(*head)->next);
-				*head = next;
-			} else {
-				unit->in = next;
+	followup = new_followup(ctx);
+	if (!followup) {
+		free_unit(unit, ctx);
+		return -1;
+	}
+	followup->unit = unit;
+	followup->unit_index = ctx->units.count;
+
+#if PRINT_ACTIONS
+	fprintf(stderr, "Pushing follow-up completion for unit %zu: %s: %s\n",
+	        ctx->units.count, sentence_type_string(sentence->type), rule);
+#endif
+
+	unit_slot = new_unit(ctx);
+	if (!unit_slot) {
+		free_unit(unit, ctx);
+		return -1;
+	}
+	*unit_slot = unit;
+#if PRINT_ACTIONS
+	fprintf(stderr, "Writing unit to slot: %s: %s, spanning [%zu, %zu)\n",
+	        unit->rule, sentence_type_string(unit->sentence->type), unit->start, unit->end);
+#endif
+
+	return 0;
+}
+
+
+/**
+ * Add branching point to list of choices,
+ * or if already added, just the get the
+ * current choice
+ * 
+ * @param   rule        The name of the rule that is a brachning
+ *                      point, `NULL` if a subsentence
+ * @param   sentence    The sentence that is brachning point
+ * @param   start       The position in the input text where the
+ *                      brachning begins
+ * @param   choice_out  Output parameter for the current choice value
+ * @return              0 on success, -1 on failure
+ */
+static int
+braching(const char *rule, const union libparser_sentence *sentence, size_t start, struct context *ctx, size_t *choice_out)
+{
+	struct choice *choice;
+	size_t size;
+	void *new;
+	if (ctx->choices.next == ctx->choices.count) {
+		if (ctx->choices.count == ctx->choices.size) {
+			size = ctx->choices.size;
+			if (size > SIZE_MAX - 8u) {
+			enomem:
+				errno = ENOMEM;
+				return -1;
 			}
+			size += 8u;
+			if (size > SIZE_MAX / sizeof(*ctx->choices.choices))
+				goto enomem;
+			new = realloc(ctx->choices.choices, size * sizeof(*ctx->choices.choices));
+			if (!new)
+				goto enomem;
+			ctx->choices.choices = new;
+			ctx->choices.size = size;
 		}
-		break;
+		choice = &ctx->choices.choices[ctx->choices.next];
+		choice->choice = 0;
+		choice->position = start;
+		choice->rule = rule;
+		choice->sentence = sentence;
+		choice->unit_index = ctx->units.count;
+		choice->followups.followups = NULL;
+		choice->followups.count = ctx->followups.count;
+		if (choice->followups.count) {
+			size = choice->followups.count * sizeof(*choice->followups.followups);
+			choice->followups.followups = malloc(size);
+			if (!choice->followups.followups)
+				goto enomem;
+			memcpy(choice->followups.followups, ctx->followups.followups, size);
+		}
+		ctx->choices.count++;
+
+#if PRINT_ACTIONS
+		fprintf(stderr, "Adding branching point at %zu for unit %zu: %s: %s, at %zu in text\n",
+		        ctx->choices.next, ctx->units.count,
+		        sentence_type_string(sentence->type), rule, start);
+	} else {
+		fprintf(stderr, "Revisiting branching point %zu at %zu in text for unit %zu: "
+		                "%s: %s, at %zu in text; choice value: %zu\n",
+		        ctx->choices.next, start,
+		        ctx->choices.choices[ctx->choices.next].unit_index,
+		        sentence_type_string(ctx->choices.choices[ctx->choices.next].sentence->type),
+		        ctx->choices.choices[ctx->choices.next].rule,
+		        ctx->choices.choices[ctx->choices.next].position,
+		        ctx->choices.choices[ctx->choices.next].choice);
+#endif
+	}
+	*choice_out = ctx->choices.choices[ctx->choices.next++].choice;
+	return 0;
+}
+
 
+/**
+ * Add a sentence, as a unit, to the list of units,
+ * will adding to the follow-up stack, the subsentences
+ * the the completion of the sentence unless it is
+ * atomic, and for branching points, record the choice
+ * with backtracking data
+ * 
+ * @param   rule      The name of the rule, `NULL` if it is a subsentence
+ * @param   sentence  The sentence to add
+ * @param   ctx       Parsing context
+ * @return            1 on matched or incomplete, 0 on mismatch, -1 on failure
+ */
+static int
+add_unit(const char *rule, const union libparser_sentence *sentence, struct context *ctx)
+{
+	struct libparser_unit *unit;
+	struct libparser_unit **unit_slot;
+	size_t start = ctx->position;
+	unsigned char c;
+	size_t i, choice;
+
+#if PRINT_ACTIONS
+	fprintf(stderr, "Visiting %s: %s, at %zu in text\n",
+	        sentence_type_string(sentence->type), rule, start);
+#endif
+
+	switch (sentence->type) {
 	case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
-		unit->in = try_match(NULL, sentence->binary.left, ctx);
-		if (!unit->in) {
-			unit->in = try_match(NULL, sentence->binary.right, ctx);
-			if (!unit->in)
-				goto mismatch;
-		}
-	prone:
-		if (unit->in && IS_HIDDEN(unit->in->rule))
-			embed_rule(&unit->in, ctx);
-		break;
+		if (braching(rule, sentence, start, ctx, &choice))
+			return -1;
+		if (choice > 1u)
+			goto exhausted;
+		if (incomplete(rule, sentence, start, ctx) ||
+		    push(NULL, choice ? sentence->binary.right : sentence->binary.left, ctx))
+			return -1;
+		return 1;
 
-	case LIBPARSER_SENTENCE_TYPE_REJECTION:
-		unit->in = try_match(NULL, sentence->unary.sentence, ctx);
-		if (unit->in) {
-			free_unit(unit->in, ctx);
-			if (!ctx->exception)
-				goto mismatch;
-			ctx->exception = 0;
+	case LIBPARSER_SENTENCE_TYPE_REPEATED:
+		if (braching(rule, sentence, start, ctx, &choice))
+			return -1;
+		if (choice == 0u) {
+			if (incomplete(rule, sentence, start, ctx) ||
+			    push(NULL, sentence, ctx) ||
+			    push(NULL, sentence->unary.sentence, ctx))
+				return -1;
+		} else if (choice == 1u) {
+			if (incomplete(rule, sentence, start, ctx) ||
+			    push("@epsilon", &epsilon, ctx) ||
+			    push(NULL, sentence->unary.sentence, ctx))
+				return -1;
+		} else if (choice == 2u) {
+			if (incomplete(rule, sentence, start, ctx) ||
+			    push("@epsilon", &epsilon, ctx) ||
+			    push("@epsilon", &epsilon, ctx))
+				return -1;
+		} else {
+			goto exhausted;
 		}
-		ctx->position = unit->start;
-		unit->rule = NULL;
-		break;
+		return 1;
 
 	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
-		unit->in = try_match(NULL, sentence->unary.sentence, ctx);
-		goto prone;
+		if (braching(rule, sentence, start, ctx, &choice))
+			return -1;
+		if (choice == 0u) {
+			if (incomplete(rule, sentence, start, ctx) ||
+			    push(NULL, sentence->unary.sentence, ctx))
+				return -1;
+		} else if (choice == 1u) {
+			if (incomplete(rule, sentence, start, ctx) ||
+			    push("@epsilon", &epsilon, ctx))
+				return -1;
+		} else {
+			goto exhausted;
+		}
+		return 1;
 
-	case LIBPARSER_SENTENCE_TYPE_REPEATED:
-		head = &unit->in;
-		while ((*head = try_match(NULL, sentence->unary.sentence, ctx))) {
-			if (IS_HIDDEN((*head)->rule)) {
-				embed_rule(head, ctx);
-				while (*head)
-					head = &(*head)->next;
-			} else {
-				head = &(*head)->next;
-			}
-			if (ctx->done)
+	case LIBPARSER_SENTENCE_TYPE_CONCATENATION:
+		if (incomplete(rule, sentence, start, ctx) ||
+		    push(NULL, sentence->binary.right, ctx) ||
+		    push(NULL, sentence->binary.left, ctx))
+			return -1;
+		return 1;
+
+	case LIBPARSER_SENTENCE_TYPE_REJECTION:
+		if (incomplete(NULL, sentence, start, ctx) ||
+		    push(NULL, sentence->unary.sentence, ctx))
+			return -1;
+		return 1;
+
+	case LIBPARSER_SENTENCE_TYPE_RULE:
+		for (i = 0; ctx->rules[i]; i++)
+			if (!strcmp(ctx->rules[i]->name, sentence->rule.rule))
 				break;
-		}
-		break;
+		if (!ctx->rules[i])
+			abort();
+		if (incomplete(rule, sentence, start, ctx) ||
+		    push(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx))
+			return -1;
+		return 1;
 
 	case LIBPARSER_SENTENCE_TYPE_STRING:
-		if (sentence->string.length > ctx->length - ctx->position)
-			goto mismatch;
-		if (memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length))
-			goto mismatch;
+		if (sentence->string.length > ctx->length - ctx->position ||
+		    memcmp(&ctx->data[ctx->position], sentence->string.string, sentence->string.length))
+			return 0;
 		ctx->position += sentence->string.length;
 		break;
 
@@ -212,59 +871,249 @@ try_match(const char *rule, const union libparser_sentence *sentence, struct con
 		ctx->position += 1u;
 		break;
 
-	case LIBPARSER_SENTENCE_TYPE_RULE:
-		for (i = 0; ctx->rules[i]; i++)
-			if (!strcmp(ctx->rules[i]->name, sentence->rule.rule))
-				break;
-		if (!ctx->rules[i])
-			abort();
-		unit->in = try_match(ctx->rules[i]->name, ctx->rules[i]->sentence, ctx);
-		if (!unit->in)
-			goto mismatch;
-		goto prone;
-
 	case LIBPARSER_SENTENCE_TYPE_EXCEPTION:
-		ctx->done = 1;
 		ctx->exception = 1;
 		break;
 
 	case LIBPARSER_SENTENCE_TYPE_EOF:
 		if (ctx->position != ctx->length)
 			goto mismatch;
-		ctx->done = 1;
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_EPSILON:
 		break;
 
 	default:
 		abort();
 	}
 
+	/* for atomic sentences */
+	unit = alloc_unit(ctx);
+	if (!unit)
+		return -1;
+	unit->in = unit->next = NULL;
+	unit->rule = rule;
+	unit->start = start;
 	unit->end = ctx->position;
-	return unit;
+	unit->sentence = sentence;
+	unit_slot = new_unit(ctx);
+	if (!unit_slot) {
+		free_unit(unit, ctx);
+		return -1;
+	}
+	*unit_slot = unit;
+#if PRINT_ACTIONS
+	fprintf(stderr, "Writing atomic unit to slot: %s: %s, spanning [%zu, %zu)\n",
+	        sentence_type_string(unit->sentence->type), unit->rule, unit->start, unit->end);
+#endif
+	return 1;
 
+exhausted:
+	ctx->choices.choices[ctx->choices.next - 1U].choice = EXHAUSTED;
 mismatch:
-	/* On mismatch, restore position in text */
-	ctx->position = unit->start;
-	/* and place the unit in the memory cache */
-	unit->next = ctx->cache;
-	ctx->cache = unit;
-	return NULL;
+#if PRINT_ACTIONS
+	fprintf(stderr, "Returning mismatch\n");
+#endif
+	return 0;
+}
+
+
+/**
+ * Convert list of units into a tree
+ * 
+ * @param   ctx     Parsing context
+ * @param   unit_i  Reference to current position in the unit list
+ * @param   last    Output parameter for the last unit on the same
+ *                  level as the returned node (used to deal with
+ *                  embedding of anonymous sentences)
+ * @return          The first top level node from the current
+ *                  position in the list of units
+ */
+static struct libparser_unit *
+make_tree(struct context *ctx, size_t *unit_i, struct libparser_unit **last)
+{
+	/* TODO try to flatten to call stack */
+	struct libparser_unit *unit, *middle, **p;
+	size_t start;
+
+#if PRINT_ACTIONS
+	static int indent = 0;
+	fprintf(stderr, "%*.s", indent, "");
+	fprintf(stderr, "{ at %zu of %zu\n", *unit_i, ctx->units.count);
+	indent += 4;
+	fprintf(stderr, "%*.s", indent, "");
+#endif
+
+	if (*unit_i == ctx->units.count)
+		abort();
+
+	unit = ctx->units.units[(*unit_i)++];
+	unit->in = NULL;
+	unit->next = NULL;
+
+	switch (unit->sentence->type) {
+	case LIBPARSER_SENTENCE_TYPE_RULE:
+	case LIBPARSER_SENTENCE_TYPE_OPTIONAL:
+	case LIBPARSER_SENTENCE_TYPE_ALTERNATION:
+#if PRINT_ACTIONS
+		fprintf(stderr, "Unary unit: %s: %s [%zu, %zu)\n",
+		        sentence_type_string(unit->sentence->type),
+		        unit->rule, unit->start, unit->end);
+#endif
+		unit->in = make_tree(ctx, unit_i, last);
+		break;
+
+	case LIBPARSER_SENTENCE_TYPE_REJECTION:
+#if PRINT_ACTIONS
+		fprintf(stderr, "Unary lookahead unit: %s: %s [%zu, %zu)\n",
+		        sentence_type_string(unit->sentence->type),
+		        unit->rule, unit->start, unit->end);
+#endif
+		unit->in = make_tree(ctx, unit_i, last);
+		start = unit->start;
+		dealloc_unit(unit);
+		unit = malloc(sizeof(*unit));
+		unit->rule = "@epsilon";
+		unit->sentence = &epsilon;
+		unit->in = unit->next = NULL;
+		unit->start = unit->end = start;
+		*last = unit;
+		goto out;
+
+	case LIBPARSER_SENTENCE_TYPE_REPEATED:
+	case LIBPARSER_SENTENCE_TYPE_CONCATENATION:
+#if PRINT_ACTIONS
+		fprintf(stderr, "Binary unit: %s: %s [%zu, %zu)\n",
+		        sentence_type_string(unit->sentence->type),
+		        unit->rule, unit->start, unit->end);
+#endif
+		unit->in = make_tree(ctx, unit_i, &middle);
+		if (IS_ANONYMOUS(unit->in->rule)) {
+			embed_rule(&unit->in, &middle);
+			if (!middle) {
+				p = &unit->in;
+				goto p_found;
+			}
+		}
+		p = &middle->next;
+		if (*p)
+			while ((*p)->next)
+				p = &(*p)->next;
+	p_found:
+		*p = make_tree(ctx, unit_i, last);
+		if (IS_ANONYMOUS((*p)->rule))
+			embed_rule(&*p, last);
+		goto out;
+
+	case LIBPARSER_SENTENCE_TYPE_STRING:
+	case LIBPARSER_SENTENCE_TYPE_CHAR_RANGE:
+	case LIBPARSER_SENTENCE_TYPE_EXCEPTION:
+	case LIBPARSER_SENTENCE_TYPE_EOF:
+	case LIBPARSER_SENTENCE_TYPE_EPSILON:
+		*last = unit;
+#if PRINT_ACTIONS
+		fprintf(stderr, "Atomic unit: %s: %s [%zu, %zu)\n",
+		        sentence_type_string(unit->sentence->type),
+		        unit->rule, unit->start, unit->end);
+#endif
+		goto out;
+	default:
+		abort();
+	}
+
+	if (IS_ANONYMOUS(unit->in->rule))
+		embed_rule(&unit->in, last);
+
+out:
+#if PRINT_ACTIONS
+	indent -= 4;
+	fprintf(stderr, "%*.s", indent, "");
+	fprintf(stderr, "} at %zu of %zu\n", *unit_i, ctx->units.count);
+#endif
+	if (!unit)
+		abort();
+	return unit;
+}
+
+
+/**
+ * Remove all epsilons from the tree
+ * 
+ * @param   unit  The tree
+ * @return        The tree
+ */
+static struct libparser_unit *
+remove_epsilons(struct libparser_unit *unit)
+{
+	struct libparser_unit **node, *deleted;
+
+	if (!unit)
+		return NULL;
+
+	node = &unit;
+	while (*node) {
+		if ((*node)->sentence->type == LIBPARSER_SENTENCE_TYPE_EPSILON) {
+			deleted = *node;
+			*node = (*node)->next;
+			free(deleted);
+		} else {
+			(*node)->in = remove_epsilons((*node)->in);
+			node = &(*node)->next;
+		}
+	}
+
+	return unit;
+}
+
+
+/**
+ * Check whether the inner most branching-point
+ * contains a rejection
+ * 
+ * @param   ctx  Parsing context
+ * @reutrn       1 if branching-point contains rejection,
+ *               0 otherwise
+ */
+#ifdef __GNUC__
+__attribute__((__pure__))
+#endif
+static int
+choice_contains_rejection(struct context *ctx)
+{
+	/* TODO we should keep track of our rejections so we can do this in constant time */
+	size_t i = ctx->choices.choices[ctx->choices.count - 1u].unit_index;
+	for (i += 1u; i < ctx->units.count; i++)
+		if (ctx->units.units[i]->sentence->type == LIBPARSER_SENTENCE_TYPE_REJECTION)
+			return 1;
+	return 0;
 }
 
 
 int
 libparser_parse_file(const struct libparser_rule *const rules[], const char *data, size_t length, struct libparser_unit **rootp)
 {
-	struct libparser_unit *ret, *t;
+	const char *rule;
+	const union libparser_sentence *sentence;
+	struct libparser_unit *ret = NULL, *t, *unit;
 	struct context ctx;
-	size_t i;
+	size_t i, j, n;
+	int match;
 
 	ctx.rules = rules;
-	ctx.cache = NULL;
+	ctx.choices.choices = NULL;
+	ctx.choices.count = 0;
+	ctx.choices.size = 0;
+	ctx.choices.next = 0;
+	ctx.followups.followups = NULL;
+	ctx.followups.size = 0;
+	ctx.followups.count = 0;
+	ctx.units.units = NULL;
+	ctx.units.size = 0;
+	ctx.units.count = 0;
+	ctx.unit_pool = NULL;
 	ctx.data = data;
 	ctx.length = length;
 	ctx.position = 0;
-	ctx.done = 0;
-	ctx.error = 0;
 	ctx.exception = 0;
 
 	for (i = 0; rules[i]; i++)
@@ -272,16 +1121,188 @@ libparser_parse_file(const struct libparser_rule *const rules[], const char *dat
 			break;
 	if (!rules[i])
 		abort();
+	rule = rules[i]->name;
+	sentence = rules[i]->sentence;
+
+#if PRINT_ACTIONS
+	print_grammar(rules);
+#endif
 
-	ret = try_match(rules[i]->name, rules[i]->sentence, &ctx);
+	/* TODO guard against left-side recursion */
+	/* TODO make branching opt-in ("?"), and add commit statements ("+") */
+	/* TODO break REPEATED on empty match */
+followup:
+#if PRINT_ACTIONS
+	print_state(&ctx);
+#endif
+	match = add_unit(rule, sentence, &ctx);
+	if (match < 0) {
+#if PRINT_ACTIONS
+		fprintf(stderr, "Error encountered\n");
+#endif
+		goto out;
+	}
+	while (ctx.followups.count) {
+		i = --ctx.followups.count;
+		if (IS_COMPLETION(ctx.followups.followups[i])) {
+			unit = ctx.followups.followups[i].unit;
+			sentence = unit->sentence;
+#if PRINT_ACTIONS
+			fprintf(stderr, "Popping completion of sentence: %s: %s, "
+			                "for unit %zu, at %zu, consuming %zu subunits\n",
+			        sentence_type_string(sentence->type), unit->rule,
+			        ctx.followups.followups[i].unit_index, ctx.position,
+			        ctx.units.count - ctx.followups.followups[i].unit_index - 1U);
+#endif
+			if (sentence->type == LIBPARSER_SENTENCE_TYPE_REJECTION) {
+#if PRINT_ACTIONS
+				fprintf(stderr, "Checking REJECTION\n");
+#endif
+				if (!match) {
+#if PRINT_ACTIONS
+					fprintf(stderr, "Match set, replacing REJECTION with EPSILON\n");
+					fprintf(stderr, "Before:\n");
+					print_state(&ctx);
+#endif
+					match = 1;
+					for (j = ctx.followups.followups[i].unit_index + 1u; j < ctx.units.count; j++)
+						free_unit(ctx.units.units[j], &ctx);
+					j = ctx.followups.followups[i].unit_index;
+					ctx.units.count = j + 1u;
+					if (unit != ctx.units.units[j])
+						abort();
+					if (unit->in)
+						abort();
+					if (unit->next)
+						abort();
+					unit->rule = "@epsilon";
+					unit->sentence = &epsilon;
+					while (ctx.choices.count && ctx.choices.choices[ctx.choices.count - 1u].unit_index >= j) {
+						ctx.choices.count--;
+						free(ctx.choices.choices[ctx.choices.count].followups.followups);
+					}
+#if PRINT_ACTIONS
+					fprintf(stderr, "After:\n");
+					print_state(&ctx);
+				} else if (ctx.exception) {
+					fprintf(stderr, "Match and exception cleared\n");
+					match = 0;
+#endif
+				} else {
+#if PRINT_ACTIONS
+					fprintf(stderr, "Match cleared\n");
+#endif
+					match = 0;
+				}
+				ctx.exception = 0;
+				ctx.position = unit->start;
+			}
+			if (!match) 
+				goto mismatch;
+#if PRINT_ACTIONS
+			fprintf(stderr, "Setting end of popped unit to %zu\n", ctx.position);
+#endif
+			unit->end = ctx.position;
+		} else if (match) {
+			rule = ctx.followups.followups[i].rule;
+			sentence = ctx.followups.followups[i].sentence;
+#if PRINT_ACTIONS
+			fprintf(stderr, "Popping follow-up sentence: %s: %s\n",
+			        sentence_type_string(sentence->type), rule);
+#endif
+			goto followup;
+		} else mismatch: if (ctx.choices.count && !choice_contains_rejection(&ctx)) {
+			i = ctx.choices.count - 1u;
+			ctx.choices.next = i;
+			n = ctx.units.count;
+			ctx.units.count = ctx.choices.choices[i].unit_index;
+			for (j = ctx.units.count; j < n; j++)
+				free_unit(ctx.units.units[j], &ctx);
+			if (ctx.choices.choices[i].followups.count > ctx.followups.size)
+				abort();
+			ctx.followups.count = ctx.choices.choices[i].followups.count;
+			memcpy(ctx.followups.followups,
+			       ctx.choices.choices[i].followups.followups,
+			       ctx.followups.count * sizeof(*ctx.followups.followups));
+			if (ctx.choices.choices[i].choice == EXHAUSTED) {
+				free(ctx.choices.choices[i].followups.followups);
+				ctx.choices.count--;
+				goto mismatch;
+			}
+			ctx.choices.choices[i].choice++;
+			ctx.position = ctx.choices.choices[i].position;
+			rule = ctx.choices.choices[i].rule;
+			sentence = ctx.choices.choices[i].sentence;
+#if PRINT_ACTIONS
+			fprintf(stderr, "Mismatch: retrying from branching point %zu (%s, %s), "
+			                "and unit %zu, from %zu, now with choice %zu\n",
+			        i, sentence_type_string(sentence->type), rule, ctx.position,
+			        ctx.units.count, ctx.choices.choices[i].choice);
+#endif
+			goto followup;
+#if PRINT_ACTIONS
+		} else {
+			fprintf(stderr, "Mismatch with %zu pending follow-ups and no choices, "
+			        "continuing polling follow-ups\n",
+			        ctx.followups.count);
+			print_state(&ctx);
+#endif
+		}
+	}
+
+#if PRINT_ACTIONS
+	fprintf(stderr, "Result determined: %s\n", match ? "match" : "mismatch");
+	fprintf(stderr, "Due to exception?: %s\n", ctx.exception ? "yes" : "no");
+	print_state(&ctx);
+#endif
+
+	if (match) {
+		if (!ctx.units.count)
+			abort();
+#if PRINT_ACTIONS
+		fprintf(stderr, "Converting into tree\n");
+#endif
+		i = 0;
+		ret = make_tree(&ctx, &i, &(struct libparser_unit *){NULL});
+		if (ret && IS_ANONYMOUS(ret->rule)) {
+#if PRINT_ACTIONS
+			fprintf(stderr, "Embedding at top level:\n    ");
+#endif
+			embed_rule(&ret, NULL);
+		}
+		if (i != ctx.units.count)
+			abort();
+		ctx.units.count = 0;
+
+		ret = remove_epsilons(ret);
+
+#if PRINT_ACTIONS
+		fprintf(stderr, "Match tree:\n");
+		print_tree(ret, 4);
+#endif
+	} else if (ctx.choices.count) {
+#if PRINT_ACTIONS
+		fprintf(stderr, "Result with mismatch but remaining choices, trying next alternative\n");
+#endif
+		ctx.exception = 0;
+		goto mismatch;
+	}
 
-	while (ctx.cache) {
-		t = ctx.cache;
-		ctx.cache = t->next;
+out:
+	for (i = 0; i < ctx.choices.count; i++)
+		free(ctx.choices.choices[i].followups.followups);
+	free(ctx.choices.choices);
+	free(ctx.followups.followups);
+	for (i = 0; i < ctx.units.count; i++)
+		dealloc_unit(ctx.units.units[i]);
+	free(ctx.units.units);
+	while (ctx.unit_pool) {
+		t = ctx.unit_pool;
+		ctx.unit_pool = t->next;
 		free(t);
 	}
 
-	if (ctx.error) {
+	if (match < 0) {
 		dealloc_unit(ret);
 		*rootp = NULL;
 		return -1;