From 5e8cc61fe8626814a855a6347b3df3c24e7b7bb0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 13 Jul 2013 07:21:37 +1000 Subject: [PATCH] New file: parsergen This reads and analyses a grammar and generates a parser. It include a simple calculator Signed-off-by: NeilBrown --- csrc/parsergen.mdc | 2554 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2554 insertions(+) create mode 100644 csrc/parsergen.mdc diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc new file mode 100644 index 0000000..ddf03ab --- /dev/null +++ b/csrc/parsergen.mdc @@ -0,0 +1,2554 @@ +# LR(1) Parser Generator # + +This parser generator takes a grammar description combined with code +fragments, analyses it, and can report details about the analysis and +write out C code files which can be compiled to make a parser. + +There are several distinct sections. + +1. `grammar_read` will read a grammar from a literate-code file and + store the productions, symbols, and code fragments. +2. `grammar_analyse` will take that grammar and build LR parsing + states and look-ahead tables. +3. `grammar_report` will present the details of the analysis + in a readable format and will report any conflicts. +4. `parser_generate` will write out C code files with various + tables and with the code fragments arranged in useful places. +5. `parser_run` is a library function intended to be linked together + with the generated parser tables to complete the implementation of + a parser. +6. `calc.cgm` is a test grammar for a simple calculator. 
+ +###### File: parsergen.c + #include + #include + #include + ## includes + ## forward declarations + ## declarations + ## functions + ## grammar_read + ## grammar_analyse + ## grammar_report + ## parser_generate + ## main +###### File: parser.h + ## exported types + ## exported functions +###### File: libparser.c + #include + #include + #include + ## parser includes + ## parser +###### File: calc.cgm + ## demo grammar +###### File: parsergen.mk + CFLAGS += -Wall -g + all :: parsergen calc + parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc + ./md2c parsergen.mdc + +## Reading the grammar + +The grammar must be stored in a markdown literate code file as parsed +by "[mdcode][]". It must have three top level (i.e. unreferenced) +sections: `header`, `code`, and `grammar`. The first two will be +literally copied into the generated `.c` and `.h`. files. The last +contains the grammar. This is tokenised with "[scanner][]". + +[mdcode]: mdcode.html +[scanner]: scanner.html + +###### includes + #include "mdcode.h" + #include "scanner.h" + +###### parser includes + #include "mdcode.h" + #include "scanner.h" + +The grammar contains production sets, precedence/associativity +declarations, and data type declarations. These are all parsed with +_ad hoc_ parsing as we don't have a parser generator yet. + +The precedence and associativity can be set for each production, but +can be inherited from symbols. The data type is potentially defined +for each non-terminal and describes what C structure is used to store +information about each symbol. 
+ +###### declarations + enum assoc {Left, Right, Non}; + char *assoc_names[] = {"Left","Right","Non"}; + + struct symbol { + struct text struct_name; + enum assoc assoc; + unsigned short precedence; + ## symbol fields + }; + struct production { + unsigned short precedence; + enum assoc assoc; + ## production fields + }; + struct grammar { + ## grammar fields + }; + +The strings reported by `mdcode` and `scanner` are `struct text` which have +length rather than being null terminated. To help with printing an +comparing we define `text_is` and `prtxt`, which should possibly go in +`mdcode`. `scanner` does provide `text_dump` which is useful for strings +which might contain control characters. + +###### functions + static int text_is(struct text t, char *s) + { + return (strlen(s) == t.len && + strncmp(s, t.txt, t.len) == 0); + } + static void prtxt(struct text t) + { + printf("%.*s", t.len, t.txt); + } + +### Symbols + +Productions are comprised primarily of symbols - terminal and +non-terminal. We do not make a syntactic distinction between the two, +though non-terminals must be identifiers. Non-terminal symbols are +those which appear in the head of a production, terminal symbols are +those which don't. There are also "virtual" symbols used for precedence +marking discussed later, and sometimes we won't know what type a symbol +is yet. + +###### forward declarations + enum symtype { Unknown, Virtual, Terminal, Nonterminal }; + char *symtypes = "UVTN"; +###### symbol fields + enum symtype type; + +Symbols can be either `TK_ident` or `TK_mark`. They are saved in a +table of known symbols and the resulting parser will report them as +`TK_reserved + N`. A small set of identifiers are reserved for the +different token types that `scanner` can report. 
+ +###### declarations + static char *reserved_words[] = { + [TK_error] = "ERROR", + [TK_number] = "NUMBER", + [TK_ident] = "IDENTIFIER", + [TK_mark] = "MARK", + [TK_string] = "STRING", + [TK_multi_string] = "MULTI_STRING", + [TK_indent] = "INDENT", + [TK_undent] = "UNDENT", + [TK_newline] = "NEWLINE", + [TK_eof] = "$eof", + }; +###### symbol fields + short num; + +Note that `TK_eof` and the two `TK_*_comment` tokens cannot be +recognised. The former is automatically expected at the end of the text +being parsed. The latter are always ignored by the parser. + +All of these symbols are stored in a simple symbol table. We use the +`struct text` from `mdcode` to store the name and link them together +into a sorted list using an insertion sort. + +We don't have separate `find` and `insert` functions as any symbol we +find needs to be remembered. We simply expect `find` to always return a +symbol, but its type might be `Unknown`. + +###### includes + #include + +###### symbol fields + struct text name; + struct symbol *next; + +###### grammar fields + struct symbol *syms; + int num_syms; + +###### functions + static int text_cmp(struct text a, struct text b) + { + int len = a.len; + if (a.len > b.len) + len = b.len; + int cmp = strncmp(a.txt, b.txt, len); + if (cmp) + return cmp; + else + return a.len - b.len; + } + + static struct symbol *sym_find(struct grammar *g, struct text s) + { + struct symbol **l = &g->syms; + struct symbol *n; + int cmp = 1; + + while (*l && + (cmp = text_cmp((*l)->name, s)) < 0) + l = & (*l)->next; + if (cmp == 0) + return *l; + n = calloc(1, sizeof(*n)); + n->name = s; + n->type = Unknown; + n->next = *l; + n->num = -1; + *l = n; + return n; + } + + static void symbols_init(struct grammar *g) + { + int entries = sizeof(reserved_words)/sizeof(reserved_words[0]); + int i; + for (i = 0; i < entries; i++) { + struct text t; + struct symbol *s; + t.txt = reserved_words[i]; + if (!t.txt) + continue; + t.len = strlen(t.txt); + s = sym_find(g, t); + 
s->type = Terminal; + s->num = i; + } + } + +### Data types and precedence. + +Data type specification and precedence specification are both +introduced by a dollar sign at the start of the line. If the next +word is `LEFT`, `RIGHT` or `NON`, then the line specifies a +precedence, otherwise it specifies a data type. + +The data type name is simply stored and applied to the head of all +subsequent productions. It must be the name of a structure, so `$expr` +maps to `struct expr`. + +The precedence line must contain a list of symbols - typically +terminal symbols, but not necessarily. It can only contain symbols +that have not been seen yet, so precedence declaration must precede +the productions that they affect. + +A precedence line may also contain a "virtual" symbol which is an +identifier preceded by `$$`. Virtual terminals carry precedence +information but are not included in the grammar. A production can +declare that it inherits the precedence of a given virtual symbol. + +This use for `$$` precludes it from being used as a symbol in the +described language. Two other symbols: `${` and `}$` are also +unavailable. + +Each new precedence line introduces a new precedence level and +declares how it associates. This level is stored in each symbol +listed and may be inherited by any production which uses the symbol. A +production inherits from the last symbol which has a precedence. 
+ +###### grammar fields + struct text current_type; + int prec_levels; + +###### declarations + enum symbols { TK_virtual = TK_reserved, TK_open, TK_close }; + static const char *known[] = { "$$", "${", "}$" }; + +###### functions + static char *dollar_line(struct token_state *ts, struct grammar *g) + { + struct token t = token_next(ts); + char *err; + enum assoc assoc; + int found; + + if (t.num != TK_ident) { + err = "type or assoc expected after '$'"; + goto abort; + } + if (text_is(t.txt, "LEFT")) + assoc = Left; + else if (text_is(t.txt, "RIGHT")) + assoc = Right; + else if (text_is(t.txt, "NON")) + assoc = Non; + else { + g->current_type = t.txt; + t = token_next(ts); + if (t.num != TK_newline) { + err = "Extra tokens after type name"; + goto abort; + } + return NULL; + } + + // This is a precedence line, need some symbols. + found = 0; + g->prec_levels += 1; + t = token_next(ts); + while (t.num != TK_newline) { + enum symtype type = Terminal; + struct symbol *s; + if (t.num == TK_virtual) { + type = Virtual; + t = token_next(ts); + if (t.num != TK_ident) { + err = "$$ must be followed by a word"; + goto abort; + } + } else if (t.num != TK_ident && + t.num != TK_mark) { + err = "Illegal token in precedence line"; + goto abort; + } + s = sym_find(g, t.txt); + if (s->type != Unknown) { + err = "Symbols in precedence line must not already be known."; + goto abort; + } + s->type = type; + s->precedence = g->prec_levels; + s->assoc = assoc; + found += 1; + } + if (found == 0) + err = "No symbols given on precedence line"; + goto abort; + return NULL; + abort: + while (t.num != TK_newline && t.num != TK_eof) + t = token_next(ts); + return err; + } + +### Productions + +A production either starts with an identifier which is the head +non-terminal, or a vertical bar (`|`) in which case this production +uses the same head as the previous one. The identifier must be +followed by a `->` mark. 
All productions for a given non-terminal must +be grouped together with the `nonterminal ->` given only once. + +After this a (possibly empty) sequence of identifiers and marks form +the body of the production. A virtual symbol may be given after the +body by preceding it with `$$`. If a virtual symbol is given, the +precedence of the production is that for the virtual symbol. If none +is given, the precedence is inherited from the last symbol in the +production which has a precedence specified. + +After the optional precedence may come the `${` mark. This indicates +the start of a code fragment. If present, this must be on the same +line as the start of the production. + +All of the text from the `${` through to the matching `}$` is +collected and forms the code-fragment for the production. It must all +be in one `code_node` of the literate code. The `}$` must be +at the end of a line. + +Text in the code fragment will undergo substitutions where `$N` for +some numeric `N` will be replaced with a variable holding the parse +information for the particular symbol in the production. `$0` is the +head of the production, `$1` is the first symbol of the body, etc. +The type of `$N` for a terminal symbol is `struct token`. For +non-terminal, it is whatever has been declared for that symbol. + +While building productions we will need to add to an array which needs to +grow dynamically. + +###### functions + static void array_add(void *varray, int *cnt, void *new) + { + void ***array = varray; + int current = 0; + const int step = 8; + current = ((*cnt-1) | (step-1))+1; + if (*cnt == current) { + /* must grow */ + current += step; + *array = realloc(*array, current * sizeof(void*)); + } + (*array)[*cnt] = new; + (*cnt) += 1; + } + +Collecting the code fragment simply involves reading tokens until we +hit the end token `}$`, and noting the character position of the start and +the end. 
+ +###### functions + static struct text collect_code(struct token_state *state, + struct token start) + { + struct text code; + struct token t; + code.txt = start.txt.txt + start.txt.len; + do + t = token_next(state); + while (t.node == start.node && + t.num != TK_close && t.num != TK_error && + t.num != TK_eof); + if (t.num == TK_close && t.node == start.node) + code.len = t.txt.txt - code.txt; + else + code.txt = NULL; + return code; + } + +Now we have all the bit we need to parse a full production. + +###### includes + #include + +###### grammar fields + struct production **productions; + int production_count; + +###### production fields + struct symbol *head; + struct symbol **body; + int body_size; + struct text code; + +###### symbol fields + int first_production; + +###### functions + static char *parse_production(struct grammar *g, + struct symbol *head, + struct token_state *state) + { + /* Head has already been parsed. */ + struct token tk; + char *err; + struct production p, *pp; + + memset(&p, 0, sizeof(p)); + p.head = head; + tk = token_next(state); + while (tk.num == TK_ident || tk.num == TK_mark) { + struct symbol *bs = sym_find(g, tk.txt); + if (bs->type == Unknown) + bs->type = Terminal; + if (bs->type == Virtual) { + err = "Virtual symbol not permitted in production"; + goto abort; + } + if (bs->precedence) { + p.precedence = bs->precedence; + p.assoc = bs->assoc; + } + array_add(&p.body, &p.body_size, bs); + tk = token_next(state); + } + if (tk.num == TK_virtual) { + struct symbol *vs; + tk = token_next(state); + if (tk.num != TK_ident) { + err = "word required after $$"; + goto abort; + } + vs = sym_find(g, tk.txt); + if (vs->type != Virtual) { + err = "symbol after $$ must be virtual"; + goto abort; + } + p.precedence = vs->precedence; + p.assoc = vs->assoc; + tk = token_next(state); + } + if (tk.num == TK_open) { + p.code = collect_code(state, tk); + if (p.code.txt == NULL) { + err = "code fragment not closed properly"; + goto abort; + } + tk 
= token_next(state);
+		}
+		if (tk.num != TK_newline && tk.num != TK_eof) {
+			err = "stray tokens at end of line";
+			goto abort;
+		}
+		pp = malloc(sizeof(*pp));
+		*pp = p;
+		array_add(&g->productions, &g->production_count, pp);
+		return NULL;
+	abort:
+		while (tk.num != TK_newline && tk.num != TK_eof)
+			tk = token_next(state);
+		return err;
+	}
+
+With the ability to parse productions and dollar-lines, we have nearly all
+that we need to parse a grammar from a `code_node`.
+
+The head of the first production will effectively be the `start` symbol of
+the grammar.  However it won't _actually_ be so.  Processing the grammar is
+greatly simplified if the real start symbol only has a single production,
+and expects `$eof` as the final terminal.  So when we find the first explicit
+production we insert an extra production as production zero which looks like
+
+###### Example: production 0
+	$start -> START $eof
+
+where `START` is the first non-terminal given.
+
+###### create production zero
+	struct production *p = calloc(1,sizeof(*p));
+	struct text start = {"$start",6};
+	struct text eof = {"$eof",4};
+	p->head = sym_find(g, start);
+	p->head->type = Nonterminal;
+	array_add(&p->body, &p->body_size, head);
+	array_add(&p->body, &p->body_size, sym_find(g, eof));
+	g->start = p->head->num;
+	p->head->first_production = g->production_count;
+	array_add(&g->productions, &g->production_count, p);
+
+Now we are ready to read in the grammar.
+
+###### grammar fields
+	int start;	// the 'start' symbol.
+ +###### grammar_read + static struct grammar *grammar_read(struct code_node *code) + { + struct token_config conf = { + .word_start = "", + .word_cont = "", + .words_marks = known, + .known_count = sizeof(known)/sizeof(known[0]), + .number_chars = "", + .ignored = (1 << TK_line_comment) + | (1 << TK_block_comment) + | (0 << TK_number) + | (0 << TK_string) + | (1 << TK_multi_string) + | (1 << TK_indent) + | (1 << TK_undent), + }; + + struct token_state *state = token_open(code, &conf); + struct token tk = token_next(state); + struct symbol *head = NULL; + struct grammar *g; + char *err = NULL; + + g = calloc(1, sizeof(*g)); + symbols_init(g); + + for (tk = token_next(state); tk.num != TK_eof; + tk = token_next(state)) { + if (tk.num == TK_newline) + continue; + if (tk.num == TK_ident) { + // new non-terminal + head = sym_find(g, tk.txt); + if (head->type == Nonterminal) + err = "This non-terminal has already be used."; + else if (head->type == Virtual) + err = "Virtual symbol not permitted in head of production"; + else { + head->type = Nonterminal; + head->struct_name = g->current_type; + if (g->start == 0) { + ## create production zero + } + head->first_production = g->production_count; + tk = token_next(state); + if (tk.num == TK_mark && + text_is(tk.txt, "->")) + err = parse_production(g, head, state); + else + err = "'->' missing in production"; + } + } else if (tk.num == TK_mark + && text_is(tk.txt, "|")) { + // another production for same non-term + if (head) + err = parse_production(g, head, state); + else + err = "First production must have a head"; + } else if (tk.num == TK_mark + && text_is(tk.txt, "$")) { + err = dollar_line(state, g); + } else { + err = "Unrecognised token at start of line."; + } + if (err) + goto abort; + } + token_close(state); + return g; + abort: + fprintf(stderr, "Error at line %d: %s\n", + tk.line, err); + token_close(state); + free(g); + return NULL; + } + +## Analysing the grammar + +The central task in analysing the grammar 
is to determine a set of +states to drive the parsing process. This will require finding +various sets of symbols and of "items". Some of these sets will need +to attach information to each element in the set, so they are more +like maps. + +Each "item" may have a 'look-ahead' or `LA` set associated with +it. Many of these will be the same as each other. To avoid memory +wastage, and to simplify some comparisons of sets, these sets will be +stored in a list of unique sets, each assigned a number. + +Once we have the data structures in hand to manage these sets and +lists, we can start setting the 'nullable' flag, build the 'FIRST' +sets, and then create the item sets which define the various states. + +### Symbol sets. + +Though we don't only store symbols in these sets, they are the main +things we store, so they are called symbol sets or "symsets". + +A symset has a size, an array of shorts, and an optional array of data +values, which are also shorts. If the array of data is not present, +we store an impossible pointer, as `NULL` is used to indicate that no +memory has been allocated yet; + +###### declarations + struct symset { + short cnt; + unsigned short *syms, *data; + }; + #define NO_DATA ((unsigned short *)1) + const struct symset INIT_SYMSET = { 0, NULL, NO_DATA }; + const struct symset INIT_DATASET = { 0, NULL, NULL }; + +The arrays of shorts are allocated in blocks of 8 and are kept sorted +by using an insertion sort. We don't explicitly record the amount of +allocated space as it can be derived directly from the current `cnt` using +`((cnt - 1) | 7) + 1`. 
+ +###### functions + static void symset_add(struct symset *s, int key, int val) + { + int i; + int current = ((s->cnt-1) | 7) + 1; + if (current == s->cnt) { + current += 8; + s->syms = realloc(s->syms, sizeof(*s->syms) * current); + if (s->data != NO_DATA) + s->data = realloc(s->data, sizeof(*s->data) * current); + } + i = s->cnt; + while (i > 0 && s->syms[i-1] > key) { + s->syms[i] = s->syms[i-1]; + if (s->data != NO_DATA) + s->data[i] = s->data[i-1]; + i--; + } + s->syms[i] = key; + if (s->data != NO_DATA) + s->data[i] = val; + s->cnt += 1; + } + +Finding a symbol (or item) in a `symset` uses a simple binary search. +We return the index where the value was found (so data can be accessed), +or `-1` to indicate failure. + + static int symset_find(struct symset *ss, int key) + { + int lo = 0; + int hi = ss->cnt; + + if (hi == 0) + return -1; + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + if (ss->syms[mid] <= key) + lo = mid; + else + hi = mid; + } + if (ss->syms[lo] == key) + return lo; + return -1; + } + +We will often want to form the union of two symsets. When we do, we +will often want to know if anything changed (as they might mean there +is more work to do). So `symset_union` reports whether anything was +added to the first set. We use a slow quadratic approach as these +sets don't really get very big. If profiles shows this to be a problem is +can be optimised later. + + static int symset_union(struct symset *a, struct symset *b) + { + int i; + int added = 0; + for (i = 0; i < b->cnt; i++) + if (symset_find(a, b->syms[i]) < 0) { + int data = 0; + if (b->data != NO_DATA) + data = b->data[i]; + symset_add(a, b->syms[i], data); + added++; + } + return added; + } + +And of course we must be able to free a symset. 
+ + static void symset_free(struct symset ss) + { + free(ss.syms); + if (ss.data != NO_DATA) + free(ss.data); + } + +### Symset Storage + +Some symsets are simply stored somewhere appropriate in the `grammar` +data structure, others needs a bit of indirection. This applies +particularly to "LA" sets. These will be paired with "items" in an +"itemset". As itemsets will be stored in a symset, the "LA" set needs to be +stored in the `data` field, so we need a mapping from a small (short) +number to an LA `symset`. + +As mentioned earlier this involves creating a list of unique symsets. + +For now, we just use a linear list sorted by insertion. A skiplist +would be more efficient and may be added later. + +###### declarations + + struct setlist { + struct symset ss; + int num; + struct setlist *next; + }; + +###### grammar fields + struct setlist *sets; + int nextset; + +###### functions + + static int ss_cmp(struct symset a, struct symset b) + { + int i; + int diff = a.cnt - b.cnt; + if (diff) + return diff; + for (i = 0; i < a.cnt; i++) { + diff = (int)a.syms[i] - (int)b.syms[i]; + if (diff) + return diff; + } + return 0; + } + + static int save_set(struct grammar *g, struct symset ss) + { + struct setlist **sl = &g->sets; + int cmp = 1; + struct setlist *s; + + while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0) + sl = & (*sl)->next; + if (cmp == 0) { + symset_free(ss); + return (*sl)->num; + } + + s = malloc(sizeof(*s)); + s->ss = ss; + s->num = g->nextset; + g->nextset += 1; + s->next = *sl; + *sl = s; + return s->num; + } + +Finding a set by number is currently performed by a simple linear search. +If this turns out to hurt performance, we can store the sets in a dynamic +array like the productions. 
+ + static struct symset set_find(struct grammar *g, int num) + { + struct setlist *sl = g->sets; + while (sl && sl->num != num) + sl = sl->next; + return sl->ss; + } + + +### Setting `nullable` + +We set `nullable` on the head symbol for any production for which all +the body symbols (if any) are nullable. As this is a recursive +definition, any change in the `nullable` setting means that we need to +re-evaluate where it needs to be set. + +We simply loop around performing the same calculations until no more +changes happen. + +###### symbol fields + int nullable; + +###### functions + static void set_nullable(struct grammar *g) + { + int check_again = 1; + while (check_again) { + int p; + check_again = 0; + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int s; + + if (pr->head->nullable) + continue; + for (s = 0; s < pr->body_size; s++) + if (! pr->body[s]->nullable) + break; + if (s == pr->body_size) { + pr->head->nullable = 1; + check_again = 1; + } + } + } + } + +### Building the `first` sets + +When calculating what can follow a particular non-terminal, we will need to +know what the "first" terminal in an subsequent non-terminal might be. So +we calculate the `first` set for every non-terminal and store them in an +array. We don't bother recording the "first" set for terminals as they are +trivial. + +As the "first" for one symbol might depend on the "first" of another, +we repeat the calculations until no changes happen, just like with +`set_nullable`. This makes use of the fact that `symset_union` +reports if any change happens. + +The core of this which finds the "first" of part of a production body +will be reused for computing the "follow" sets, so we split it out +into a separate function. 
+ +###### grammar fields + struct symset *first; + +###### functions + + static int add_first(struct production *pr, int start, + struct symset *target, struct grammar *g, + int *to_end) + { + int s; + int changed = 0; + for (s = start; s < pr->body_size; s++) { + struct symbol *bs = pr->body[s]; + if (bs->type == Terminal) { + if (symset_find(target, bs->num) < 0) { + symset_add(target, bs->num, 0); + changed = 1; + } + break; + } else if (symset_union(target, &g->first[bs->num])) + changed = 1; + if (!bs->nullable) + break; + } + if (to_end) + *to_end = (s == pr->body_size); + return changed; + } + + static void build_first(struct grammar *g) + { + int check_again = 1; + int s; + g->first = calloc(g->num_syms, sizeof(g->first[0])); + for (s = 0; s < g->num_syms; s++) + g->first[s] = INIT_SYMSET; + + while (check_again) { + int p; + check_again = 0; + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + struct symset *head = &g->first[pr->head->num]; + + if (add_first(pr, 0, head, g, NULL)) + check_again = 1; + } + } + } + +### Building the `follow` sets. + +There are two different situations when we will want to generate "follow" +sets. If we are doing an SLR analysis, we want to generate a single +"follow" set for each non-terminal in the grammar. That is what is +happening here. If we are doing an LALR or LR analysis we will want +to generate a separate "LA" set for each item. We do that later +in state generation. + +There are two parts to generating a "follow" set. Firstly we look at +every place that any non-terminal appears in the body of any +production, and we find the set of possible "first" symbols after +there. This is done using `add_first` above and only needs to be done +once as the "first" sets are now stable and will not change. 
+ +###### follow code + + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int b; + + for (b = 0; b < pr->body_size - 1; b++) { + struct symbol *bs = pr->body[b]; + if (bs->type == Terminal) + continue; + add_first(pr, b+1, &g->follow[bs->num], g, NULL); + } + } + +The second part is to add the "follow" set of the head of a production +to the "follow" sets of the final symbol in the production, and any +other symbol which is followed only by `nullable` symbols. As this +depends on "follow" itself we need to repeatedly perform the process +until no further changes happen. + +###### follow code + + for (again = 0, p = 0; + p < g->production_count; + p < g->production_count-1 + ? p++ : again ? (again = 0, p = 0) + : p++) { + struct production *pr = g->productions[p]; + int b; + + for (b = pr->body_size - 1; b >= 0; b--) { + struct symbol *bs = pr->body[b]; + if (bs->type == Terminal) + break; + if (symset_union(&g->follow[bs->num], + &g->follow[pr->head->num])) + again = 1; + if (!bs->nullable) + break; + } + } + +We now just need to create and initialise the `follow` list to get a +complete function. + +###### grammar fields + struct symset *follow; + +###### functions + static void build_follow(struct grammar *g) + { + int s, again, p; + g->follow = calloc(g->num_syms, sizeof(g->follow[0])); + for (s = 0; s < g->num_syms; s++) + g->follow[s] = INIT_SYMSET; + ## follow code + } + +### Building itemsets and states + +There are three different levels of detail that can be chosen for +building the itemsets and states for the LR grammar. They are: + +1. LR(0) or SLR(1), where no look-ahead is considered. +2. LALR(1) where we build look-ahead sets with each item and merge + the LA sets when we find two paths to the same "kernel" set of items. +3. LR(1) where different look-ahead for any item in the code means + a different state must be created. 
+
+###### forward declarations
+	enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
+
+We need to be able to look through existing states to see if a newly
+generated state already exists.  For now we use a simple sorted linked
+list.
+
+An item is a pair of numbers: the production number and the position of
+"DOT", which is an index into the body of the production.
+As the numbers are not enormous we can combine them into a single "short"
+and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
+production number (so 2000 productions with a maximum of 31 symbols in the
+body).
+
+Comparing the itemsets will be a little different to comparing symsets
+as we want to do the lookup after generating the "kernel" of an
+itemset, so we need to ignore the offset=zero items which are added during
+completion.
+
+To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
+the list in the symset, and then only compare items before the first "0".
+
+###### declarations
+	static inline unsigned short item_num(int production, int index)
+	{
+		return production | (((index-1)&0x1f) << 11);
+	}
+	static inline int item_prod(unsigned short item)
+	{
+		return item & 0x7ff;
+	}
+	static inline int item_index(unsigned short item)
+	{
+		return ((item >> 11)+1) & 0x1f;
+	}
+
+For LR(1) analysis we need to compare not just the itemset in a state
+but also the LA sets.  As we assign each unique LA set a number, we
+can just compare the symset and the data values together.
+ +###### functions + static int itemset_cmp(struct symset a, struct symset b, + enum grammar_type type) + { + int i; + int av, bv; + + for (i = 0; + i < a.cnt && i < b.cnt && + item_index(a.syms[i]) > 0 && + item_index(b.syms[i]) > 0; + i++) { + int diff = a.syms[i] - b.syms[i]; + if (diff) + return diff; + if (type == LR1) { + diff = a.data[i] - b.data[i]; + if (diff) + return diff; + } + } + if (i == a.cnt || item_index(a.syms[i]) == 0) + av = -1; + else + av = a.syms[i]; + if (i == b.cnt || item_index(b.syms[i]) == 0) + bv = -1; + else + bv = b.syms[i]; + if (av - bv) + return av - bv; + if (type < LR1 || av == -1) + return 0; + return + a.data[i] - b.data[i]; + } + +And now we can build the list of itemsets. The lookup routine returns +both a success flag and a pointer to where in the list an insert +should happen, so we don't need to search a second time. + +###### declarations + struct itemset { + struct itemset *next; + short state; + struct symset items; + struct symset go_to; + char completed; + }; + +###### grammar fields + struct itemset *items; + int states; + +###### functions + static int itemset_find(struct grammar *g, struct itemset ***where, + struct symset kernel, enum grammar_type type) + { + struct itemset **ip; + + for (ip = &g->items; *ip ; ip = & (*ip)->next) { + struct itemset *i = *ip; + int diff; + diff = itemset_cmp(i->items, kernel, type); + if (diff < 0) + continue; + if (diff > 0) + break; + /* found */ + *where = ip; + return 1; + } + *where = ip; + return 0; + } + +Adding an itemset may require merging the LA sets if LALR analysis is +happening. If any new LA set add symbol that weren't in the old LA set, we +clear the `completed` flag so that the dependants of this itemset will be +recalculated and their LA sets updated. + +`add_itemset` must consume the symsets it is passed, either by adding +them to a data structure, of freeing them. 
+ + static int add_itemset(struct grammar *g, struct symset ss, + enum grammar_type type) + { + struct itemset **where, *is; + int i; + int found = itemset_find(g, &where, ss, type); + if (!found) { + is = calloc(1, sizeof(*is)); + is->state = g->states; + g->states += 1; + is->items = ss; + is->next = *where; + is->go_to = INIT_DATASET; + *where = is; + return is->state; + } + is = *where; + if (type != LALR) { + symset_free(ss); + return is->state; + } + for (i = 0; i < ss.cnt; i++) { + struct symset temp = INIT_SYMSET, s; + if (ss.data[i] == is->items.data[i]) + continue; + s = set_find(g, is->items.data[i]); + symset_union(&temp, &s); + s = set_find(g, ss.data[i]); + if (symset_union(&temp, &s)) { + is->items.data[i] = save_set(g, temp); + is->completed = 0; + } else + symset_free(temp); + } + symset_free(ss); + return is->state; + } + +#### The build + +To build all the itemsets, we first insert the initial itemset made from the +start symbol, complete each itemset, and then generate new itemsets from old +until no new ones can be made. + +Completing an itemset means finding all the items where "DOT" is followed by +a nonterminal and adding "DOT=0" items for every production from that +non-terminal - providing each item hasn't already been added. + +If LA sets are needed, the LA set for each new item is found using +`add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may +be supplemented by the LA set for the item which produce the new item. + +We also collect a set of all symbols which follow "DOT" (in `done`) as this +is used in the next stage. + +NOTE: precedence handling should happen here - I haven't written this yet +though. 
+ +###### complete itemset + for (i = 0; i < is->items.cnt; i++) { + int p = item_prod(is->items.syms[i]); + int bs = item_index(is->items.syms[i]); + struct production *pr = g->productions[p]; + int p2; + struct symbol *s; + struct symset LA = INIT_SYMSET; + int sn = 0; + + if (bs == pr->body_size) + continue; + s = pr->body[bs]; + if (symset_find(&done, s->num) < 0) + symset_add(&done, s->num, 0); + if (s->type != Nonterminal) + continue; + again = 1; + if (type >= LALR) { + // Need the LA set. + int to_end; + add_first(pr, bs+1, &LA, g, &to_end); + if (to_end) { + struct symset ss = set_find(g, is->items.data[i]); + symset_union(&LA, &ss); + } + sn = save_set(g, LA); + LA = set_find(g, sn); + } + /* Add productions for this symbol */ + for (p2 = s->first_production; + p2 < g->production_count && + g->productions[p2]->head == s; + p2++) { + int itm = item_num(p2, 0); + int pos = symset_find(&is->items, itm); + if (pos < 0) { + symset_add(&is->items, itm, sn); + /* Will have re-ordered, so start + * from beginning again */ + i = -1; + } else if (type >= LALR) { + struct symset ss = set_find(g, is->items.data[pos]); + struct symset tmp = INIT_SYMSET; + + symset_union(&tmp, &ss); + if (symset_union(&tmp, &LA)) { + is->items.data[pos] = save_set(g, tmp); + i = -1; + }else + symset_free(tmp); + } + } + } + +For each symbol we found (and placed in `done`) we collect all the items for +which this symbol is next, and create a new itemset with "DOT" advanced over +the symbol. This is then added to the collection of itemsets (or merged +with a pre-existing itemset). + +###### derive itemsets + // Now we have a completed itemset, so we need to + // create all the 'next' itemset and create them + // if they don't exist. 
+ for (i = 0; i < done.cnt; i++) { + int j; + int state; + struct symset newitemset = INIT_SYMSET; + if (type >= LALR) + newitemset = INIT_DATASET; + + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + int la = 0; + int pos; + + if (bp == pr->body_size) + continue; + if (pr->body[bp]->num != done.syms[i]) + continue; + if (type >= LALR) + la = is->items.data[j]; + pos = symset_find(&newitemset, pr->head->num); + if (pos < 0) + symset_add(&newitemset, item_num(p, bp+1), la); + else if (type >= LALR) { + // Need to merge la set. + int la2 = newitemset.data[pos]; + if (la != la2) { + struct symset ss = set_find(g, la2); + struct symset LA = INIT_SYMSET; + symset_union(&LA, &ss); + ss = set_find(g, la); + if (symset_union(&LA, &ss)) + newitemset.data[pos] = save_set(g, LA); + else + symset_free(LA); + } + } + } + state = add_itemset(g, newitemset, type); + if (symset_find(&is->go_to, done.syms[i]) < 0) + symset_add(&is->go_to, done.syms[i], state); + } + +All that is left is to crate the initial itemset from production zero, and +with `TK_eof` as the LA set. + +###### functions + static void build_itemsets(struct grammar *g, enum grammar_type type) + { + struct symset first = INIT_SYMSET; + struct itemset *is; + int again; + int la = 0; + if (type >= LALR) { + // LA set just has eof + struct symset eof = INIT_SYMSET; + symset_add(&eof, TK_eof, 0); + la = save_set(g, eof); + first = INIT_DATASET; + } + // production 0, offset 0 (with no data) + symset_add(&first, item_num(0, 0), la); + add_itemset(g, first, type); + for (again = 0, is = g->items; + is; + is = is->next ?: again ? (again = 0, g->items) : NULL) { + int i; + struct symset done = INIT_SYMSET; + if (is->completed) + continue; + is->completed = 1; + ## complete itemset + ## derive itemsets + symset_free(done); + } + } + +### Completing the analysis. 
+ +The exact process of analysis depends on which level was requested. For +`LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and +`LR1` we need first, but not follow. For `SLR` we need both. + +We don't build the "action" tables that you might expect as the parser +doesn't use them. They will be calculated to some extent if needed for +a report. + +Once we have built everything we allocate arrays for the two lists: +symbols and itemsets. This allows more efficient access during reporting. +The symbols are grouped as terminals and non-terminals and we record the +changeover point in `first_nonterm`. + +###### grammar fields + struct symbol **symtab; + struct itemset **statetab; + int first_nonterm; + +###### grammar_analyse + + static void grammar_analyse(struct grammar *g, enum grammar_type type) + { + struct symbol *s; + struct itemset *is; + int snum = TK_reserved; + for (s = g->syms; s; s = s->next) + if (s->num < 0 && s->type == Terminal) { + s->num = snum; + snum++; + } + g->first_nonterm = snum; + for (s = g->syms; s; s = s->next) + if (s->num < 0) { + s->num = snum; + snum++; + } + g->num_syms = snum; + g->symtab = calloc(g->num_syms, sizeof(g->symtab[0])); + for (s = g->syms; s; s = s->next) + g->symtab[s->num] = s; + + if (type >= SLR) { + set_nullable(g); + build_first(g); + } + if (type == SLR) + build_follow(g); + + build_itemsets(g, type); + + g->statetab = calloc(g->states, sizeof(g->statetab[0])); + for (is = g->items; is ; is = is->next) + g->statetab[is->state] = is; + } + +## Reporting on the Grammar + +The purpose of the report is to give the grammar developer insight into +how the grammar parser will work. It is basically a structured dump of +all the tables that have been generated, plus an description of any conflicts. 
+ +###### grammar_report + static int grammar_report(struct grammar *g, enum grammar_type type) + { + report_symbols(g); + if (g->follow) + report_follow(g); + report_itemsets(g); + return report_conflicts(g, type); + } + +Firstly we have the complete list of symbols, together with the "FIRST" +set if that was generated. + +###### functions + + static void report_symbols(struct grammar *g) + { + int n; + if (g->first) + printf("SYMBOLS + FIRST:\n"); + else + printf("SYMBOLS:\n"); + + for (n = 0; n < g->num_syms; n++) { + struct symbol *s = g->symtab[n]; + if (!s) + continue; + + printf(" %c%3d%c: ", + s->nullable ? '*':' ', + s->num, symtypes[s->type]); + prtxt(s->name); + if (s->precedence) + printf(" (%d%s)", s->precedence, + assoc_names[s->assoc]); + + if (g->first && s->type == Nonterminal) { + int j; + char c = ':'; + for (j = 0; j < g->first[n].cnt; j++) { + printf("%c ", c); + c = ','; + prtxt(g->symtab[g->first[n].syms[j]]->name); + } + } + printf("\n"); + } + } + +Then we have to follow sets if they were computed. + + static void report_follow(struct grammar *g) + { + int n; + printf("FOLLOW:\n"); + for (n = 0; n < g->num_syms; n++) + if (g->follow[n].cnt) { + int j; + char c = ':'; + printf(" "); + prtxt(g->symtab[n]->name); + for (j = 0; j < g->follow[n].cnt; j++) { + printf("%c ", c); + c = ','; + prtxt(g->symtab[g->follow[n].syms[j]]->name); + } + printf("\n"); + } + } + +And finally the item sets. These include the GO TO tables and, for +LALR and LR1, the LA set for each item. Lots of stuff, so we break +it up a bit. First the items, with production number and associativity. + + static void report_item(struct grammar *g, int itm) + { + int p = item_prod(itm); + int dot = item_index(itm); + struct production *pr = g->productions[p]; + int i; + + printf(" "); + prtxt(pr->head->name); + printf(" ->"); + for (i = 0; i < pr->body_size; i++) { + printf(" %s", (dot == i ? ". 
": "")); + prtxt(pr->body[i]->name); + } + if (dot == pr->body_size) + printf(" ."); + printf(" [%d]", p); + if (pr->precedence) + printf(" (%d%s)", pr->precedence, + assoc_names[pr->assoc]); + printf("\n"); + } + +The LA sets which are (possibly) reported with each item: + + static void report_la(struct grammar *g, int lanum) + { + struct symset la = set_find(g, lanum); + int i; + char c = ':'; + + printf(" LOOK AHEAD(%d)", lanum); + for (i = 0; i < la.cnt; i++) { + printf("%c ", c); + c = ','; + prtxt(g->symtab[la.syms[i]]->name); + } + printf("\n"); + } + +Then the go to sets: + + + static void report_goto(struct grammar *g, struct symset gt) + { + int i; + printf(" GOTO:\n"); + + for (i = 0; i < gt.cnt; i++) { + printf(" "); + prtxt(g->symtab[gt.syms[i]]->name); + printf(" -> %d\n", gt.data[i]); + } + } + +Now we can report all the item sets complete with items, LA sets, and GO TO. + + static void report_itemsets(struct grammar *g) + { + int s; + printf("ITEM SETS(%d)\n", g->states); + for (s = 0; s < g->states; s++) { + int j; + struct itemset *is = g->statetab[s]; + printf(" Itemset %d:\n", s); + for (j = 0; j < is->items.cnt; j++) { + report_item(g, is->items.syms[j]); + if (is->items.data != NO_DATA) + report_la(g, is->items.data[j]); + } + report_goto(g, is->go_to); + } + } + +### Reporting conflicts + +Conflict detection varies a lot among different analysis levels. However +LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and +LR1 are also similar as they have FOLLOW or LA sets. + +###### functions + + ## conflict functions + + static int report_conflicts(struct grammar *g, enum grammar_type type) + { + int cnt = 0; + printf("Conflicts:\n"); + if (type < SLR) + cnt = conflicts_lr0(g, type); + else + cnt = conflicts_slr(g, type); + if (cnt == 0) + printf(" - no conflicts\n"); + return cnt; + } + +LR0 conflicts are any state which have both a reducible item and +a shiftable item. 
+
+LR05 conflicts only occur if two possible reductions exist,
+as shifts always over-ride reductions.
+
+###### conflict functions
+	static int conflicts_lr0(struct grammar *g, enum grammar_type type)
+	{
+		int i;
+		int cnt = 0;
+		for (i = 0; i < g->states; i++) {
+			struct itemset *is = g->statetab[i];
+			int last_reduce = -1;
+			int prev_reduce = -1;
+			int last_shift = -1;
+			int j;
+			if (!is)
+				continue;
+			for (j = 0; j < is->items.cnt; j++) {
+				int itm = is->items.syms[j];
+				int p = item_prod(itm);
+				int bp = item_index(itm);
+				struct production *pr = g->productions[p];
+
+				if (bp == pr->body_size) {
+					prev_reduce = last_reduce;
+					last_reduce = j;
+					continue;
+				}
+				if (pr->body[bp]->type == Terminal)
+					last_shift = j;
+			}
+			if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
+				printf("  State %d has both SHIFT and REDUCE:\n", i);
+				report_item(g, is->items.syms[last_shift]);
+				report_item(g, is->items.syms[last_reduce]);
+				cnt++;
+			}
+			if (prev_reduce >= 0) {
+				printf("  State %d has 2 (or more) reducible items\n", i);
+				report_item(g, is->items.syms[prev_reduce]);
+				report_item(g, is->items.syms[last_reduce]);
+				cnt++;
+			}
+		}
+		return cnt;
+	}
+
+SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
+look ahead, or if a symbol in a look-ahead can be shifted. They differ only
+in the source of the look ahead set.
+
+We build a dataset mapping terminal to item for possible SHIFTs and then
+another for possible REDUCE operations. We report when we get conflicts
+between the two.
+ + static int conflicts_slr(struct grammar *g, enum grammar_type type) + { + int i; + int cnt = 0; + + for (i = 0; i < g->states; i++) { + struct itemset *is = g->statetab[i]; + struct symset shifts = INIT_DATASET; + struct symset reduce = INIT_DATASET; + int j; + if (!is) + continue; + /* First collect the shifts */ + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + + if (bp < pr->body_size && + pr->body[bp]->type == Terminal) { + /* shiftable */ + int sym = pr->body[bp]->num; + if (symset_find(&shifts, sym) < 0) + symset_add(&shifts, sym, itm); + } + } + /* Now look for reduction and conflicts */ + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + + if (bp < pr->body_size) + continue; + /* reducible */ + struct symset la; + if (type == SLR) + la = g->follow[pr->head->num]; + else + la = set_find(g, is->items.data[j]); + int k; + for (k = 0; k < la.cnt; k++) { + int pos = symset_find(&shifts, la.syms[k]); + if (pos >= 0) { + printf(" State %d has SHIFT/REDUCE conflict on ", i); + prtxt(g->symtab[la.syms[k]]->name); + printf(":\n"); + report_item(g, shifts.data[pos]); + report_item(g, itm); + cnt++; + } + pos = symset_find(&reduce, la.syms[k]); + if (pos < 0) { + symset_add(&reduce, la.syms[k], itm); + continue; + } + printf(" State %d has REDUCE/REDUCE conflict on ", i); + prtxt(g->symtab[la.syms[k]]->name); + printf(":\n"); + report_item(g, itm); + report_item(g, reduce.data[pos]); + cnt++; + } + } + symset_free(shifts); + symset_free(reduce); + } + return cnt; + } + + +## Generating the parser + +The export part of the parser is the `parse_XX` function, where the name +`XX` is based on the name of the parser files. 
+ +This takes a `code_node`, a partially initialized `token_config`, and an +optional `FILE` to send tracing to. The `token_config` gets the list of +known words added and then is used with the `code_node` to initialize the +scanner. + +`parse_XX` then call the library function `parser_run` to actually complete +the parse, This needs the `states` table and function to call the various +pieces of code provided in the grammar file, so they are generated first. + +###### parser_generate + + static void gen_parser(FILE *f, struct grammar *g, char *file, char *name) + { + gen_known(f, g); + gen_goto(f, g); + gen_states(f, g); + gen_reduce(f, g, file); + gen_free(f, g); + + fprintf(f, "#line 0 \"gen_parser\"\n"); + fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n", + name); + fprintf(f, "{\n"); + fprintf(f, "\tstruct token_state *tokens;\n"); + fprintf(f, "\tconfig->words_marks = known;\n"); + fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n"); + fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n"); + fprintf(f, "\ttokens = token_open(code, config);\n"); + fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n"); + fprintf(f, "\ttoken_close(tokens);\n"); + fprintf(f, "\treturn rv;\n"); + fprintf(f, "}\n\n"); + } + +### Table words table + +The know words is simply an array of terminal symbols. + +###### functions + + static void gen_known(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_known\"\n"); + fprintf(f, "static const char *known[] = {\n"); + for (i = TK_reserved; + i < g->num_syms && g->symtab[i]->type == Terminal; + i++) + fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, + g->symtab[i]->name.txt); + fprintf(f, "};\n\n"); + } + +### States and the goto tables. + +For each state we record the goto table and the reducible production if +there is one. 
+Some of the details of the reducible production are stored in the +`do_reduce` function to come later. Here we store the production number, +the body size (useful for stack management) and the resulting symbol (useful +for knowing how to free data later). + +The go to table is stored in a simple array of `sym` and corresponding +`state`. + +###### exported types + + struct lookup { + short sym; + short state; + }; + struct state { + short go_to_cnt; + const struct lookup * go_to; + short reduce_prod; + short reduce_size; + short reduce_sym; + }; + + +###### functions + + static void gen_goto(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_goto\"\n"); + for (i = 0; i < g->states; i++) { + int j; + fprintf(f, "static const struct lookup goto_%d[] = {\n", + i); + struct symset gt = g->statetab[i]->go_to; + for (j = 0; j < gt.cnt; j++) + fprintf(f, "\t{ %d, %d },\n", + gt.syms[j], gt.data[j]); + fprintf(f, "};\n"); + } + } + +###### functions + + static void gen_states(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_states\"\n"); + fprintf(f, "static const struct state states[] = {\n"); + for (i = 0; i < g->states; i++) { + struct itemset *is = g->statetab[i]; + int j, prod = -1; + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + + if (bp < pr->body_size) + continue; + /* This is what we reduce */ + prod = p; + break; + } + + fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n", + i, is->go_to.cnt, i, prod, + prod < 0 ? -1 : g->productions[prod]->body_size, + prod < 0 ? -1 : g->productions[prod]->head->num); + } + fprintf(f, "};\n\n"); + } + +### The `do_reduce` function and the code + +When the parser engine decides to reduce a production, it calls `do_reduce`. +This has two functions. + +Firstly, if a non-NULL `trace` file is passed, it prints out details of the +production being reduced. 
Secondly it runs the code that was included with +the production if any. + +This code needs to be able to store data somewhere. Rather than requiring +`do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have +`do_reduce` return the size to be saved. + +The code fragment requires translation when written out. Any `$N` needs to +be converted to a reference either to that buffer (if `$0`) or to the +structure returned by a previous reduction. These pointer need to be cast +to the appropriate type for each access. All this is handling in +`gen_code`. + + +###### functions + + static void gen_code(struct production *p, FILE *f, struct grammar *g) + { + char *c; + fprintf(f, "\t\t\t"); + for (c = p->code.txt; c < p->code.txt + p->code.len; c++) { + int n; + if (*c != '$') { + fputc(*c, f); + if (*c == '\n') + fputs("\t\t\t", f); + continue; + } + c++; + if (*c < '0' || *c > '9') { + fputc(*c, f); + continue; + } + n = *c - '0'; + while (c[1] >= '0' && c[1] <= '9') { + c += 1; + n = n * 10 + *c - '0'; + } + if (n == 0) + fprintf(f, "(*(struct %.*s*)ret)", + p->head->struct_name.len, + p->head->struct_name.txt); + else if (n > p->body_size) + fprintf(f, "$%d", n); + else if (p->body[n-1]->type == Terminal) + fprintf(f, "(*(struct token *)body[%d])", + n-1); + else if (p->body[n-1]->struct_name.txt == NULL) + fprintf(f, "$%d", n); + else + fprintf(f, "(*(struct %.*s*)body[%d])", + p->body[n-1]->struct_name.len, + p->body[n-1]->struct_name.txt, n-1); + } + fputs("\n", f); + } + +###### functions + + static void gen_reduce(FILE *f, struct grammar *g, char *file) + { + int i, j; + fprintf(f, "#line 0 \"gen_reduce\"\n"); + fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n"); + fprintf(f, " void *ret, FILE *trace)\n"); + fprintf(f, "{\n"); + fprintf(f, "\tint ret_size = 0;\n"); + + fprintf(f, "\tswitch(prod) {\n"); + for (i = 0; i < g->production_count; i++) { + struct production *p = g->productions[i]; + fprintf(f, "\tcase %d:\n", i); + + 
+			if (p->code.txt)
+				gen_code(p, f, g);
+
+			fprintf(f, "\t\tif (trace) {\n");
+			fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
+				p->head->name.len, p->head->name.txt);
+			for (j = 0; j < p->body_size; j++) {
+				if (p->body[j]->type == Terminal) {
+					fputs("\t\t\tfputs(\" \", trace);\n", f);
+					fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
+				} else {
+					fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
+						p->body[j]->name.len,
+						p->body[j]->name.txt);
+				}
+			}
+			fprintf(f, "\t\t}\n");
+
+			if (p->head->struct_name.txt)
+				fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
+					p->head->struct_name.len,
+					p->head->struct_name.txt);
+
+			fprintf(f, "\t\tbreak;\n");
+		}
+		fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
+	}
+
+### `do_free`
+
+As each non-terminal can potentially cause a different type of data
+structure to be allocated and filled in, we need to be able to free it when
+done.
+
+It is particularly important to have fine control over freeing during error
+recovery where individual stack frames might need to be freed.
+
+For this, the grammar author is required to define a `free_XX` function for
+each structure that is used by a non-terminal. `do_free` will call whichever
+is appropriate given a symbol number, and will call `free` (as is
+appropriate for tokens) on any terminal symbol.
+ +###### functions + + static void gen_free(FILE *f, struct grammar *g) + { + int i; + + fprintf(f, "#line 0 \"gen_free\"\n"); + fprintf(f, "static void do_free(short sym, void *asn)\n"); + fprintf(f, "{\n"); + fprintf(f, "\tif (!asn) return;\n"); + fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm); + fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n"); + fprintf(f, "\tswitch(sym) {\n"); + + for (i = 0; i < g->num_syms; i++) { + struct symbol *s = g->symtab[i]; + if (!s || + s->type != Nonterminal || + s->struct_name.txt == NULL) + continue; + + fprintf(f, "\tcase %d:\n", s->num); + fprintf(f, "\t\tfree_%.*s(asn);\n", + s->struct_name.len, + s->struct_name.txt); + fprintf(f, "\t\tbreak;\n"); + } + fprintf(f, "\t}\n}\n\n"); + } + +## The main routine. + +There are three key parts to the "main" function of parsergen: processing +the arguments, loading the grammar file, and dealing with the grammar. + +### Argument processing. + +Command line options allow the selection of analysis mode, name of output +file, and whether or not a report should be generated. By default we create +a report only if no code output was requested. + +The `parse_XX` function name uses the basename of the output file which +should not have a suffix (`.c`). `.c` is added to the given name for the +code, and `.h` is added for the header (if header text is specifed in the +grammar file). 
+ +###### includes + #include + +###### declarations + static const struct option long_options[] = { + { "LR0", 0, NULL, '0' }, + { "LR05", 0, NULL, '5' }, + { "SLR", 0, NULL, 'S' }, + { "LALR", 0, NULL, 'L' }, + { "LR1", 0, NULL, '1' }, + { "report", 0, NULL, 'R' }, + { "output", 1, NULL, 'o' }, + { NULL, 0, NULL, 0 } + }; + const char *options = "05SL1Ro:"; + +###### process arguments + int opt; + char *outfile = NULL; + char *infile; + char *name; + int report = 1; + enum grammar_type type = LR05; + while ((opt = getopt_long(argc, argv, options, + long_options, NULL)) != -1) { + switch(opt) { + case '0': + type = LR0; break; + case '5': + type = LR05; break; + case 'S': + type = SLR; break; + case 'L': + type = LALR; break; + case '1': + type = LR1; break; + case 'R': + report = 2; break; + case 'o': + outfile = optarg; break; + default: + fprintf(stderr, "Usage: parsergen ...\n"); + exit(1); + } + } + if (optind < argc) + infile = argv[optind++]; + else { + fprintf(stderr, "No input file given\n"); + exit(1); + } + if (outfile && report == 1) + report = 0; + name = outfile; + if (name && strchr(name, '/')) + name = strrchr(name, '/')+1; + + if (optind < argc) { + fprintf(stderr, "Excess command line arguments\n"); + exit(1); + } + +### Loading the grammar file + +To be able to run `mdcode` and `scanner` on the grammar we need to memory +map it. + +One we have extracted the code (with `mdcode`) we expect to file three +sections: header, code, and grammar. Anything else is an error. + +"header" and "code" are optional, though it is hard to build a working +parser with neither. "grammar" must be provided. 
+ +###### includes + #include + #include + #include + +###### functions + static int errs; + static void pr_err(char *msg) + { + errs++; + fprintf(stderr, "%s\n", msg); + } + +###### load file + struct section *table; + int fd; + int len; + char *file; + fd = open(infile, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "parsergen: cannot open %s: %s\n", + infile, strerror(errno)); + exit(1); + } + len = lseek(fd, 0, 2); + file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); + table = code_extract(file, file+len, pr_err); + + struct code_node *hdr = NULL; + struct code_node *code = NULL; + struct code_node *gram = NULL; + for (s = table; s; s = s->next) { + if (text_is(s->section, "header")) + hdr = s->code; + else if (text_is(s->section, "code")) + code = s->code; + else if (text_is(s->section, "grammar")) + gram = s->code; + else { + fprintf(stderr, "Unknown content section: %.*s\n", + s->section.len, s->section.txt); + rv |= 2; + } + } + +### Processing the input + +As we need to append an extention to a filename and then open it for +writing, and we need to do this twice, it helps to have a separate function. + +###### functions + + static FILE *open_ext(char *base, char *ext) + { + char *fn = malloc(strlen(base) + strlen(ext) + 1); + FILE *f; + strcat(strcpy(fn, base), ext); + f = fopen(fn, "w"); + free(fn); + return f; + } + +If we can read the grammar, then we analyse and optionally report on it. If +the report finds conflicts we will exit with an error status. + +###### process input + struct grammar *g = NULL; + if (gram == NULL) { + fprintf(stderr, "No grammar section provided\n"); + rv |= 2; + } else { + g = grammar_read(gram); + if (!g) { + fprintf(stderr, "Failure to parse grammar\n"); + rv |= 2; + } + } + if (g) { + grammar_analyse(g, type); + if (report) + if (grammar_report(g, type)) + rv |= 1; + } + +If a headers section is defined, we write it out and include a declaration +for the `parse_XX` function so it can be used from separate code. 
+ + if (rv == 0 && hdr && outfile) { + FILE *f = open_ext(outfile, ".h"); + if (f) { + code_node_print(f, hdr, infile); + fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n", + name); + fclose(f); + } else { + fprintf(stderr, "Cannot create %s.h\n", + outfile); + rv |= 4; + } + } + +And if all goes well and an output file was provided, we create the `.c` +file with the code section (if any) and the parser tables and function. + + if (rv == 0 && outfile) { + FILE *f = open_ext(outfile, ".c"); + if (f) { + if (code) + code_node_print(f, code, infile); + gen_parser(f, g, infile, name); + fclose(f); + } else { + fprintf(stderr, "Cannot create %s.c\n", + outfile); + rv |= 4; + } + } + +And that about wraps it up. We need to set the locale so that UTF-8 is +recognised properly, and link with `libicuuc` is `libmdcode` requires that. + +###### File: parsergen.mk + parsergen : parsergen.o libscanner.o libmdcode.o + $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc + +###### includes + #include + +###### main + + int main(int argc, char *argv[]) + { + struct section *s; + int rv = 0; + + setlocale(LC_ALL,""); + + ## process arguments + ## load file + ## process input + + return rv; + } + +## The SHIFT/REDUCE parser + +Having analysed the grammar and generated all the table, we only need the +shift/reduce engine to bring it all together. + +### Goto table lookup + +The parser generator has nicely provided us with goto tables sorted by +symbol number. We need a binary search function to find a symbol in the +table. + +###### parser + + static int search(const struct state *l, int sym) + { + int lo = 0; + int hi = l->go_to_cnt; + + if (hi == 0) + return -1; + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + if (l->go_to[mid].sym <= sym) + lo = mid; + else + hi = mid; + } + if (l->go_to[lo].sym == sym) + return l->go_to[lo].state; + else + return -1; + } + +### The state stack. 
+
+The core data structure for the parser is the stack. This tracks all the
+symbols that have been recognised or partially recognised.
+
+The stack usually won't grow very large - maybe a few tens of entries. So
+we dynamically resize an array as required but never bother to shrink it
+down again.
+
+We keep the stack as two separate allocations. One, `asn_stack` stores the
+"abstract syntax nodes" which are created by each reduction. When we call
+`do_reduce` we need to pass an array of the `asn`s of the body of the
+production, and by keeping a separate `asn` stack, we can just pass a
+pointer into this stack.
+
+The other allocation stores all other stack fields of which there are two.
+The `state` is the most important one and guides the parsing process. The
+`sym` is nearly unnecessary. However when we want to free entries from the
+`asn_stack`, it helps to know what type they are so we can call the right
+freeing function. The symbol leads us to the right free function through
+`do_free`.
+
+###### parser
+
+	struct parser {
+		int state;
+		struct frame {
+			short state;
+			short sym;
+		} *stack;
+		void **asn_stack;
+		int stack_size;
+		int tos;
+	};
+
+#### Shift and pop
+
+Two operations are needed on the stack - shift (which is like push) and pop.
+
+Shift applies not only to terminals but also to non-terminals. When we
+reduce a production we will pop off entries corresponding to the body
+symbols, then push on an item for the head of the production. This last is
+exactly the same process as shifting in a terminal so we use the same
+function for both.
+
+To simplify other code we arrange for `shift` to fail if there is no `goto`
+state for the symbol. This is useful in basic parsing due to our design
+that we shift when we can, and reduce when we cannot. So the `shift`
+function reports if it could.
+
+So `shift` finds the next state. If that succeeds it extends the allocations
+if needed and pushes all the information onto the stacks.
+ +###### parser + + static int shift(struct parser *p, + int sym, void *asn, + const struct state states[]) + { + // Push an entry onto the stack + int newstate = search(&states[p->state], sym); + if (newstate < 0) + return 0; + if (p->tos >= p->stack_size) { + p->stack_size += 10; + p->stack = realloc(p->stack, p->stack_size + * sizeof(p->stack[0])); + p->asn_stack = realloc(p->asn_stack, p->stack_size + * sizeof(p->asn_stack[0])); + } + p->stack[p->tos].state = p->state; + p->stack[p->tos].sym = sym; + p->asn_stack[p->tos] = asn; + p->tos++; + p->state = newstate; + return 1; + } + +`pop` simply moves the top of stack (`tos`) back down the required amount +and frees and `asn` entries that need to be freed. It is called _after_ we +reduce a production, just before we `shift` the nonterminal in. + +###### parser + + static void pop(struct parser *p, int num, + void(*do_free)(short sym, void *asn)) + { + int i; + p->tos -= num; + for (i = 0; i < num; i++) + do_free(p->stack[p->tos+i].sym, + p->asn_stack[p->tos+i]); + + p->state = p->stack[p->tos].state; + } + +### Memory allocation + +The `scanner` returns tokens in a local variable - we want them in allocated +memory so they can live in the `asn_stack`. Similarly the `asn` produce by +a reduce is in a large buffer. Both of these require some allocation and +copying, hence `memdup` and `tokcopy`. + +###### parser includes + #include + +###### parser + + void *memdup(void *m, int len) + { + void *ret; + if (len == 0) + return NULL; + ret = malloc(len); + memcpy(ret, m, len); + return ret; + } + + static struct token *tok_copy(struct token tk) + { + struct token *new = malloc(sizeof(*new)); + *new = tk; + return new; + } + +### The heart of the parser. + +Now we have the parser. If we can shift, we do. If not and we can reduce +we do. If the production we reduced was production zero, then we have +accepted the input and can finish. + +If we can neither shift nor reduce we have an error to handle. 
We pop +single entries off the stack until we can shift the `TK_error` symbol, the +drop input tokens until we find one we can shift into the new error state. + +We return whatever `asn` was returned by reducing production zero. + +###### parser includes + #include "parser.h" +###### parser + void *parser_run(struct token_state *tokens, + const struct state states[], + int (*do_reduce)(int, int, void**, void*, FILE*), + void (*do_free)(short, void*), + FILE *trace) + { + struct parser p = { 0 }; + struct token *tk; + int accepted = 0; + void *ret; + + tk = tok_copy(token_next(tokens)); + while (!accepted) { + if (shift(&p, tk->num, tk, states)) { + if (trace) { + fputs("Shift ", trace); + text_dump(trace, tk->txt, 20); + fputs("\n", trace); + } + tk = tok_copy(token_next(tokens)); + continue; + } + if (states[p.state].reduce_prod >= 0) { + void **body; + int prod = states[p.state].reduce_prod; + int size = states[p.state].reduce_size; + int sym = states[p.state].reduce_sym; + int bufsize; + static char buf[16*1024]; + + body = p.asn_stack + + (p.tos - states[p.state].reduce_size); + + bufsize = do_reduce(prod, p.tos, body, + buf, trace); + if (trace) + fputs("\n", trace); + + pop(&p, size, do_free); + shift(&p, sym, memdup(buf, bufsize), states); + if (prod == 0) + accepted = 1; + continue; + } + /* Error. we walk up the stack until we + * find a state which will accept TK_error. + * We then shift in TK_error and see what state + * that takes us too. + * Then we discard input tokens until + * we find one that is acceptable. 
+ */ + while (shift(&p, TK_error, tk, states) < 0 + && p.tos > 0) + // discard this state + pop(&p, 1, do_free); + tk = tok_copy(*tk); + while (search(&states[p.state], tk->num) < 0 && + tk->num != TK_eof) { + free(tk); + tk = tok_copy(token_next(tokens)); + } + if (p.tos == 0 && tk->num == TK_eof) + break; + } + free(tk); + if (accepted) + ret = p.asn_stack[0]; + else + pop(&p, p.tos, do_free); + free(p.asn_stack); + free(p.stack); + return ret; + } + +###### exported functions + void *parser_run(struct token_state *tokens, + const struct state states[], + int (*do_reduce)(int, int, void**, void*, FILE*), + void (*do_free)(short, void*), + FILE *trace); + + +# A Worked Example + +The obvious example for a parser is a calculator. + +As `scanner` provides number parsing function using `libgmp` is it not much +work to perform arbitrary rational number calculations. + +This calculator takes one expression, or an equality test per line. The +results are printed and in any equality test fails, the program exits with +an error. + +Embedding mdcode inside mdcode is rather horrible. I'd like to find a +better approach, but as the grammar file must have 3 components I need +something like this. + +###### File: parsergen.mk + calc.c : parsergen calc.cgm + ./parsergen -o calc calc.cgm + calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o + $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp + +###### demo grammar + + # header + ~~~~~ + + #include "number.h" + // what do we use for a demo-grammar? A calculator of course. 
+ struct number { + mpq_t val; + char tail[2]; + int err; + }; + + ~~~~~ + # code + ~~~~~ + + #include + #include + #include + #include + #include + #include + #include + #include "mdcode.h" + #include "scanner.h" + #include "number.h" + #include "parser.h" + + #include "calc.h" + + static void free_number(struct number *n) + { + mpq_clear(n->val); + free(n); + } + + int main(int argc, char *argv[]) + { + int fd = open(argv[1], O_RDONLY); + int len = lseek(fd, 0, 2); + char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); + struct section *s = code_extract(file, file+len, NULL); + struct token_config config = { + .ignored = (1 << TK_line_comment) + | (1 << TK_block_comment) + | (1 << TK_indent) + | (1 << TK_undent), + .number_chars = ".,_+-", + .word_start = "", + .word_cont = "", + }; + parse_calc(s->code, &config, argc > 2 ? stderr : NULL); + exit(0); + } + + ~~~~~ + # grammar + ~~~~~ + + Session -> Session Line + | Line + + Line -> Expression NEWLINE ${ gmp_printf( "Answer = %Qd\n", $1.val); + { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val); + gmp_printf( " or as a decimal: %Fg\n", fl); + mpf_clear(fl); + } + }$ + | Expression = Expression NEWLINE ${ + if (mpq_equal($1.val, $3.val)) + gmp_printf( "Both equal %Qd\n", $1.val); + else { + gmp_printf( "NOT EQUAL: %Qd\n != : %Qd\n", + $1.val, $3.val); + exit(1); + } + }$ + | NEWLINE ${ printf( "Blank line\n"); }$ + | ERROR NEWLINE ${ printf( "Skipped a bad line\n"); }$ + + $number + Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$ + | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ + | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ + + Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ + | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ + | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ + + Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); 
}$ + | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ -- 2.43.0