--- /dev/null
+# LR(1) Parser Generator #
+
+This parser generator takes a grammar description combined with code
+fragments, analyses it, and can report details about the analysis and
+write out C code files which can be compiled to make a parser.
+
+There are several distinct sections.
+
+1. `grammar_read` will read a grammar from a literate-code file and
+ store the productions, symbols, and code fragments.
+2. `grammar_analyse` will take that grammar and build LR parsing
+ states and look-ahead tables.
+3. `grammar_report` will present the details of the analysis
+ in a readable format and will report any conflicts.
+4. `parser_generate` will write out C code files with various
+ tables and with the code fragments arranged in useful places.
+5. `parser_run` is a library function intended to be linked together
+ with the generated parser tables to complete the implementation of
+ a parser.
+6. `calc.cgm` is a test grammar for a simple calculator.
+
+###### File: parsergen.c
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ ## includes
+ ## forward declarations
+ ## declarations
+ ## functions
+ ## grammar_read
+ ## grammar_analyse
+ ## grammar_report
+ ## parser_generate
+ ## main
+###### File: parser.h
+ ## exported types
+ ## exported functions
+###### File: libparser.c
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ ## parser includes
+ ## parser
+###### File: calc.cgm
+ ## demo grammar
+###### File: parsergen.mk
+ CFLAGS += -Wall -g
+ all :: parsergen calc
+ parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
+ ./md2c parsergen.mdc
+
+## Reading the grammar
+
+The grammar must be stored in a markdown literate code file as parsed
+by "[mdcode][]". It must have three top level (i.e. unreferenced)
+sections: `header`, `code`, and `grammar`. The first two will be
+literally copied into the generated `.c` and `.h` files. The last
+contains the grammar. This is tokenised with "[scanner][]".
+
+[mdcode]: mdcode.html
+[scanner]: scanner.html
+
+###### includes
+ #include "mdcode.h"
+ #include "scanner.h"
+
+###### parser includes
+ #include "mdcode.h"
+ #include "scanner.h"
+
+The grammar contains production sets, precedence/associativity
+declarations, and data type declarations. These are all parsed with
+_ad hoc_ parsing as we don't have a parser generator yet.
+
+The precedence and associativity can be set for each production, but
+can be inherited from symbols. The data type is potentially defined
+for each non-terminal and describes what C structure is used to store
+information about each symbol.
+
+###### declarations
+ enum assoc {Left, Right, Non};
+ char *assoc_names[] = {"Left","Right","Non"};
+
+ struct symbol {
+ struct text struct_name;
+ enum assoc assoc;
+ unsigned short precedence;
+ ## symbol fields
+ };
+ struct production {
+ unsigned short precedence;
+ enum assoc assoc;
+ ## production fields
+ };
+ struct grammar {
+ ## grammar fields
+ };
+
+The strings reported by `mdcode` and `scanner` are `struct text` which have
+a length rather than being null terminated. To help with printing and
+comparing we define `text_is` and `prtxt`, which should possibly go in
+`mdcode`. `scanner` does provide `text_dump` which is useful for strings
+which might contain control characters.
+
+###### functions
+ static int text_is(struct text t, char *s)
+ {
+ return (strlen(s) == t.len &&
+ strncmp(s, t.txt, t.len) == 0);
+ }
+ static void prtxt(struct text t)
+ {
+ printf("%.*s", t.len, t.txt);
+ }
+
+### Symbols
+
+Productions are composed primarily of symbols - terminal and
+non-terminal. We do not make a syntactic distinction between the two,
+though non-terminals must be identifiers. Non-terminal symbols are
+those which appear in the head of a production, terminal symbols are
+those which don't. There are also "virtual" symbols used for precedence
+marking discussed later, and sometimes we won't know what type a symbol
+is yet.
+
+###### forward declarations
+ enum symtype { Unknown, Virtual, Terminal, Nonterminal };
+ char *symtypes = "UVTN";
+###### symbol fields
+ enum symtype type;
+
+Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
+table of known symbols and the resulting parser will report them as
+`TK_reserved + N`. A small set of identifiers are reserved for the
+different token types that `scanner` can report.
+
+###### declarations
+ static char *reserved_words[] = {
+ [TK_error] = "ERROR",
+ [TK_number] = "NUMBER",
+ [TK_ident] = "IDENTIFIER",
+ [TK_mark] = "MARK",
+ [TK_string] = "STRING",
+ [TK_multi_string] = "MULTI_STRING",
+ [TK_indent] = "INDENT",
+ [TK_undent] = "UNDENT",
+ [TK_newline] = "NEWLINE",
+ [TK_eof] = "$eof",
+ };
+###### symbol fields
+ short num;
+
+Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
+recognised. The former is automatically expected at the end of the text
+being parsed. The latter are always ignored by the parser.
+
+All of these symbols are stored in a simple symbol table. We use the
+`struct text` from `mdcode` to store the name and link them together
+into a sorted list using an insertion sort.
+
+We don't have separate `find` and `insert` functions as any symbol we
+find needs to be remembered. We simply expect `find` to always return a
+symbol, but its type might be `Unknown`.
+
+###### includes
+ #include <string.h>
+
+###### symbol fields
+ struct text name;
+ struct symbol *next;
+
+###### grammar fields
+ struct symbol *syms;
+ int num_syms;
+
+###### functions
+ static int text_cmp(struct text a, struct text b)
+ {
+ int len = a.len;
+ if (a.len > b.len)
+ len = b.len;
+ int cmp = strncmp(a.txt, b.txt, len);
+ if (cmp)
+ return cmp;
+ else
+ return a.len - b.len;
+ }
+
+ static struct symbol *sym_find(struct grammar *g, struct text s)
+ {
+ struct symbol **l = &g->syms;
+ struct symbol *n;
+ int cmp = 1;
+
+ while (*l &&
+ (cmp = text_cmp((*l)->name, s)) < 0)
+ l = & (*l)->next;
+ if (cmp == 0)
+ return *l;
+ n = calloc(1, sizeof(*n));
+ n->name = s;
+ n->type = Unknown;
+ n->next = *l;
+ n->num = -1;
+ *l = n;
+ return n;
+ }
+
+ static void symbols_init(struct grammar *g)
+ {
+ int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
+ int i;
+ for (i = 0; i < entries; i++) {
+ struct text t;
+ struct symbol *s;
+ t.txt = reserved_words[i];
+ if (!t.txt)
+ continue;
+ t.len = strlen(t.txt);
+ s = sym_find(g, t);
+ s->type = Terminal;
+ s->num = i;
+ }
+ }
+
+### Data types and precedence.
+
+Data type specification and precedence specification are both
+introduced by a dollar sign at the start of the line. If the next
+word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, otherwise it specifies a data type.
+
+The data type name is simply stored and applied to the head of all
+subsequent productions. It must be the name of a structure, so `$expr`
+maps to `struct expr`.
+
+The precedence line must contain a list of symbols - typically
+terminal symbols, but not necessarily. It can only contain symbols
+that have not been seen yet, so precedence declarations must precede
+the productions that they affect.
+
+A precedence line may also contain a "virtual" symbol which is an
+identifier preceded by `$$`. Virtual terminals carry precedence
+information but are not included in the grammar. A production can
+declare that it inherits the precedence of a given virtual symbol.
+
+This use for `$$` precludes it from being used as a symbol in the
+described language. Two other symbols: `${` and `}$` are also
+unavailable.
+
+Each new precedence line introduces a new precedence level and
+declares how it associates. This level is stored in each symbol
+listed and may be inherited by any production which uses the symbol. A
+production inherits from the last symbol which has a precedence.
+
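+As an illustrative sketch (not taken from the demo grammar), the first
+line below sets the data type for the heads of subsequent productions to
+`struct expression`, and the next two lines each introduce a new
+left-associative precedence level for the marks they list:
+
+###### Example: dollar lines (sketch)
+    $expression
+    $LEFT + -
+    $LEFT * /
+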
+###### grammar fields
+ struct text current_type;
+ int prec_levels;
+
+###### declarations
+ enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
+ static const char *known[] = { "$$", "${", "}$" };
+
+###### functions
+ static char *dollar_line(struct token_state *ts, struct grammar *g)
+ {
+ struct token t = token_next(ts);
+ char *err;
+ enum assoc assoc;
+ int found;
+
+ if (t.num != TK_ident) {
+ err = "type or assoc expected after '$'";
+ goto abort;
+ }
+ if (text_is(t.txt, "LEFT"))
+ assoc = Left;
+ else if (text_is(t.txt, "RIGHT"))
+ assoc = Right;
+ else if (text_is(t.txt, "NON"))
+ assoc = Non;
+ else {
+ g->current_type = t.txt;
+ t = token_next(ts);
+ if (t.num != TK_newline) {
+ err = "Extra tokens after type name";
+ goto abort;
+ }
+ return NULL;
+ }
+
+ // This is a precedence line, need some symbols.
+ found = 0;
+ g->prec_levels += 1;
+ t = token_next(ts);
+ while (t.num != TK_newline) {
+ enum symtype type = Terminal;
+ struct symbol *s;
+ if (t.num == TK_virtual) {
+ type = Virtual;
+ t = token_next(ts);
+ if (t.num != TK_ident) {
+ err = "$$ must be followed by a word";
+ goto abort;
+ }
+ } else if (t.num != TK_ident &&
+ t.num != TK_mark) {
+ err = "Illegal token in precedence line";
+ goto abort;
+ }
+ s = sym_find(g, t.txt);
+ if (s->type != Unknown) {
+ err = "Symbols in precedence line must not already be known.";
+ goto abort;
+ }
+ s->type = type;
+ s->precedence = g->prec_levels;
+ s->assoc = assoc;
+            found += 1;
+            t = token_next(ts);
+        }
+        if (found == 0) {
+            err = "No symbols given on precedence line";
+            goto abort;
+        }
+        return NULL;
+ abort:
+ while (t.num != TK_newline && t.num != TK_eof)
+ t = token_next(ts);
+ return err;
+ }
+
+### Productions
+
+A production either starts with an identifier which is the head
+non-terminal, or a vertical bar (`|`) in which case this production
+uses the same head as the previous one. The identifier must be
+followed by a `->` mark. All productions for a given non-terminal must
+be grouped together with the `nonterminal ->` given only once.
+
+After this a (possibly empty) sequence of identifiers and marks form
+the body of the production. A virtual symbol may be given after the
+body by preceding it with `$$`. If a virtual symbol is given, the
+precedence of the production is that for the virtual symbol. If none
+is given, the precedence is inherited from the last symbol in the
+production which has a precedence specified.
+
+After the optional precedence may come the `${` mark. This indicates
+the start of a code fragment. If present, this must be on the same
+line as the start of the production.
+
+All of the text from the `${` through to the matching `}$` is
+collected and forms the code-fragment for the production. It must all
+be in one `code_node` of the literate code. The `}$` must be
+at the end of a line.
+
+Text in the code fragment will undergo substitutions where `$N` for
+some numeric `N` will be replaced with a variable holding the parse
+information for the particular symbol in the production. `$0` is the
+head of the production, `$1` is the first symbol of the body, etc.
+The type of `$N` for a terminal symbol is `struct token`. For
+non-terminal, it is whatever has been declared for that symbol.
+
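+The following sketch (again not the demo grammar) shows two productions for
+a hypothetical `expression` non-terminal, assuming it was given the data
+type `$expression` above and that `struct expression` has a `val` field.
+`$0` refers to the head, `$1` and `$3` to the two operand symbols:
+
+###### Example: productions with code fragments (sketch)
+    expression -> expression + expression ${
+            $0.val = $1.val + $3.val;
+        }$
+    | expression * expression ${
+            $0.val = $1.val * $3.val;
+        }$
+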
+While building productions we will need to append to arrays which need to
+grow dynamically.
+
+###### functions
+ static void array_add(void *varray, int *cnt, void *new)
+ {
+ void ***array = varray;
+ int current = 0;
+ const int step = 8;
+ current = ((*cnt-1) | (step-1))+1;
+ if (*cnt == current) {
+ /* must grow */
+ current += step;
+ *array = realloc(*array, current * sizeof(void*));
+ }
+ (*array)[*cnt] = new;
+ (*cnt) += 1;
+ }
+
+Collecting the code fragment simply involves reading tokens until we
+hit the end token `}$`, and noting the character position of the start and
+the end.
+
+###### functions
+ static struct text collect_code(struct token_state *state,
+ struct token start)
+ {
+ struct text code;
+ struct token t;
+ code.txt = start.txt.txt + start.txt.len;
+ do
+ t = token_next(state);
+ while (t.node == start.node &&
+ t.num != TK_close && t.num != TK_error &&
+ t.num != TK_eof);
+ if (t.num == TK_close && t.node == start.node)
+ code.len = t.txt.txt - code.txt;
+ else
+ code.txt = NULL;
+ return code;
+ }
+
+Now we have all the bits we need to parse a full production.
+
+###### includes
+ #include <memory.h>
+
+###### grammar fields
+ struct production **productions;
+ int production_count;
+
+###### production fields
+ struct symbol *head;
+ struct symbol **body;
+ int body_size;
+ struct text code;
+
+###### symbol fields
+ int first_production;
+
+###### functions
+ static char *parse_production(struct grammar *g,
+ struct symbol *head,
+ struct token_state *state)
+ {
+ /* Head has already been parsed. */
+ struct token tk;
+ char *err;
+ struct production p, *pp;
+
+ memset(&p, 0, sizeof(p));
+ p.head = head;
+ tk = token_next(state);
+ while (tk.num == TK_ident || tk.num == TK_mark) {
+ struct symbol *bs = sym_find(g, tk.txt);
+ if (bs->type == Unknown)
+ bs->type = Terminal;
+ if (bs->type == Virtual) {
+ err = "Virtual symbol not permitted in production";
+ goto abort;
+ }
+ if (bs->precedence) {
+ p.precedence = bs->precedence;
+ p.assoc = bs->assoc;
+ }
+ array_add(&p.body, &p.body_size, bs);
+ tk = token_next(state);
+ }
+ if (tk.num == TK_virtual) {
+ struct symbol *vs;
+ tk = token_next(state);
+ if (tk.num != TK_ident) {
+ err = "word required after $$";
+ goto abort;
+ }
+ vs = sym_find(g, tk.txt);
+ if (vs->type != Virtual) {
+ err = "symbol after $$ must be virtual";
+ goto abort;
+ }
+ p.precedence = vs->precedence;
+ p.assoc = vs->assoc;
+ tk = token_next(state);
+ }
+ if (tk.num == TK_open) {
+ p.code = collect_code(state, tk);
+ if (p.code.txt == NULL) {
+ err = "code fragment not closed properly";
+ goto abort;
+ }
+ tk = token_next(state);
+ }
+ if (tk.num != TK_newline && tk.num != TK_eof) {
+ err = "stray tokens at end of line";
+ goto abort;
+ }
+ pp = malloc(sizeof(*pp));
+ *pp = p;
+ array_add(&g->productions, &g->production_count, pp);
+ return NULL;
+ abort:
+ while (tk.num != TK_newline && tk.num != TK_eof)
+ tk = token_next(state);
+ return err;
+ }
+
+With the ability to parse productions and dollar-lines, we have nearly all
+that we need to parse a grammar from a `code_node`.
+
+The head of the first production will effectively be the `start` symbol of
+the grammar. However it won't _actually_ be so. Processing the grammar is
+greatly simplified if the real start symbol only has a single production,
+and expects `$eof` as the final terminal. So when we find the first explicit
+production we insert an extra production as production zero which looks like
+
+###### Example: production 0
+ $start -> START $eof
+
+where `START` is the first non-terminal given.
+
+###### create production zero
+ struct production *p = calloc(1,sizeof(*p));
+ struct text start = {"$start",6};
+ struct text eof = {"$eof",4};
+ p->head = sym_find(g, start);
+ p->head->type = Nonterminal;
+ array_add(&p->body, &p->body_size, head);
+ array_add(&p->body, &p->body_size, sym_find(g, eof));
+ g->start = p->head->num;
+ p->head->first_production = g->production_count;
+ array_add(&g->productions, &g->production_count, p);
+
+Now we are ready to read in the grammar.
+
+###### grammar fields
+ int start; // the 'start' symbol.
+
+###### grammar_read
+ static struct grammar *grammar_read(struct code_node *code)
+ {
+ struct token_config conf = {
+ .word_start = "",
+ .word_cont = "",
+ .words_marks = known,
+ .known_count = sizeof(known)/sizeof(known[0]),
+ .number_chars = "",
+ .ignored = (1 << TK_line_comment)
+ | (1 << TK_block_comment)
+ | (0 << TK_number)
+ | (0 << TK_string)
+ | (1 << TK_multi_string)
+ | (1 << TK_indent)
+ | (1 << TK_undent),
+ };
+
+ struct token_state *state = token_open(code, &conf);
+        struct token tk;
+ struct symbol *head = NULL;
+ struct grammar *g;
+ char *err = NULL;
+
+ g = calloc(1, sizeof(*g));
+ symbols_init(g);
+
+ for (tk = token_next(state); tk.num != TK_eof;
+ tk = token_next(state)) {
+ if (tk.num == TK_newline)
+ continue;
+ if (tk.num == TK_ident) {
+ // new non-terminal
+ head = sym_find(g, tk.txt);
+ if (head->type == Nonterminal)
+                err = "This non-terminal has already been used.";
+ else if (head->type == Virtual)
+ err = "Virtual symbol not permitted in head of production";
+ else {
+ head->type = Nonterminal;
+ head->struct_name = g->current_type;
+ if (g->start == 0) {
+ ## create production zero
+ }
+ head->first_production = g->production_count;
+ tk = token_next(state);
+ if (tk.num == TK_mark &&
+ text_is(tk.txt, "->"))
+ err = parse_production(g, head, state);
+ else
+ err = "'->' missing in production";
+ }
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "|")) {
+ // another production for same non-term
+ if (head)
+ err = parse_production(g, head, state);
+ else
+ err = "First production must have a head";
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "$")) {
+ err = dollar_line(state, g);
+ } else {
+ err = "Unrecognised token at start of line.";
+ }
+ if (err)
+ goto abort;
+ }
+ token_close(state);
+ return g;
+ abort:
+ fprintf(stderr, "Error at line %d: %s\n",
+ tk.line, err);
+ token_close(state);
+ free(g);
+ return NULL;
+ }
+
+## Analysing the grammar
+
+The central task in analysing the grammar is to determine a set of
+states to drive the parsing process. This will require finding
+various sets of symbols and of "items". Some of these sets will need
+to attach information to each element in the set, so they are more
+like maps.
+
+Each "item" may have a 'look-ahead' or `LA` set associated with
+it. Many of these will be the same as each other. To avoid memory
+wastage, and to simplify some comparisons of sets, these sets will be
+stored in a list of unique sets, each assigned a number.
+
+Once we have the data structures in hand to manage these sets and
+lists, we can start setting the 'nullable' flag, build the 'FIRST'
+sets, and then create the item sets which define the various states.
+
+### Symbol sets.
+
+Though we don't only store symbols in these sets, they are the main
+things we store, so they are called symbol sets or "symsets".
+
+A symset has a size, an array of shorts, and an optional array of data
+values, which are also shorts. If the array of data is not present,
+we store an impossible pointer, as `NULL` is used to indicate that no
+memory has been allocated yet.
+
+###### declarations
+ struct symset {
+ short cnt;
+ unsigned short *syms, *data;
+ };
+ #define NO_DATA ((unsigned short *)1)
+ const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
+ const struct symset INIT_DATASET = { 0, NULL, NULL };
+
+The arrays of shorts are allocated in blocks of 8 and are kept sorted
+by using an insertion sort. We don't explicitly record the amount of
+allocated space as it can be derived directly from the current `cnt` using
+`((cnt - 1) | 7) + 1`.
+
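+For example, evaluating that expression for a few values of `cnt` gives the
+following allocation sizes (the array always grows in steps of 8):
+
+###### Example: symset allocation sizes
+    cnt:        1   7   8   9   16   17
+    allocated:  8   8   8   16  16   24
+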
+###### functions
+ static void symset_add(struct symset *s, int key, int val)
+ {
+ int i;
+ int current = ((s->cnt-1) | 7) + 1;
+ if (current == s->cnt) {
+ current += 8;
+ s->syms = realloc(s->syms, sizeof(*s->syms) * current);
+ if (s->data != NO_DATA)
+ s->data = realloc(s->data, sizeof(*s->data) * current);
+ }
+ i = s->cnt;
+ while (i > 0 && s->syms[i-1] > key) {
+ s->syms[i] = s->syms[i-1];
+ if (s->data != NO_DATA)
+ s->data[i] = s->data[i-1];
+ i--;
+ }
+ s->syms[i] = key;
+ if (s->data != NO_DATA)
+ s->data[i] = val;
+ s->cnt += 1;
+ }
+
+Finding a symbol (or item) in a `symset` uses a simple binary search.
+We return the index where the value was found (so data can be accessed),
+or `-1` to indicate failure.
+
+ static int symset_find(struct symset *ss, int key)
+ {
+ int lo = 0;
+ int hi = ss->cnt;
+
+ if (hi == 0)
+ return -1;
+ while (lo + 1 < hi) {
+ int mid = (lo + hi) / 2;
+ if (ss->syms[mid] <= key)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (ss->syms[lo] == key)
+ return lo;
+ return -1;
+ }
+
+We will often want to form the union of two symsets. When we do, we
+will often want to know if anything changed (as that might mean there
+is more work to do). So `symset_union` reports whether anything was
+added to the first set. We use a slow quadratic approach as these
+sets don't really get very big. If profiling shows this to be a problem
+it can be optimised later.
+
+ static int symset_union(struct symset *a, struct symset *b)
+ {
+ int i;
+ int added = 0;
+ for (i = 0; i < b->cnt; i++)
+ if (symset_find(a, b->syms[i]) < 0) {
+ int data = 0;
+ if (b->data != NO_DATA)
+ data = b->data[i];
+ symset_add(a, b->syms[i], data);
+ added++;
+ }
+ return added;
+ }
+
+And of course we must be able to free a symset.
+
+ static void symset_free(struct symset ss)
+ {
+ free(ss.syms);
+ if (ss.data != NO_DATA)
+ free(ss.data);
+ }
+
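+As a minimal sketch of how these functions combine (this snippet is not
+part of parsergen itself or of any generated parser):
+
+###### Example: using a symset (sketch)
+    struct symset ss = INIT_SYMSET;
+    int pos;
+
+    symset_add(&ss, 5, 0);       /* values are ignored when data is NO_DATA */
+    symset_add(&ss, 2, 0);
+    pos = symset_find(&ss, 2);   /* 0, as 2 sorts before 5 */
+    symset_free(ss);
+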
+### Symset Storage
+
+Some symsets are simply stored somewhere appropriate in the `grammar`
+data structure, others need a bit of indirection. This applies
+particularly to "LA" sets. These will be paired with "items" in an
+"itemset". As an itemset is itself stored as a symset, the "LA" set needs
+to be stored in the `data` field, so we need a mapping from a small (short)
+number to an LA `symset`.
+
+As mentioned earlier this involves creating a list of unique symsets.
+
+For now, we just use a linear list sorted by insertion. A skiplist
+would be more efficient and may be added later.
+
+###### declarations
+
+ struct setlist {
+ struct symset ss;
+ int num;
+ struct setlist *next;
+ };
+
+###### grammar fields
+ struct setlist *sets;
+ int nextset;
+
+###### functions
+
+ static int ss_cmp(struct symset a, struct symset b)
+ {
+ int i;
+ int diff = a.cnt - b.cnt;
+ if (diff)
+ return diff;
+ for (i = 0; i < a.cnt; i++) {
+ diff = (int)a.syms[i] - (int)b.syms[i];
+ if (diff)
+ return diff;
+ }
+ return 0;
+ }
+
+ static int save_set(struct grammar *g, struct symset ss)
+ {
+ struct setlist **sl = &g->sets;
+ int cmp = 1;
+ struct setlist *s;
+
+ while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
+ sl = & (*sl)->next;
+ if (cmp == 0) {
+ symset_free(ss);
+ return (*sl)->num;
+ }
+
+ s = malloc(sizeof(*s));
+ s->ss = ss;
+ s->num = g->nextset;
+ g->nextset += 1;
+ s->next = *sl;
+ *sl = s;
+ return s->num;
+ }
+
+Finding a set by number is currently performed by a simple linear search.
+If this turns out to hurt performance, we can store the sets in a dynamic
+array like the productions.
+
+ static struct symset set_find(struct grammar *g, int num)
+ {
+ struct setlist *sl = g->sets;
+ while (sl && sl->num != num)
+ sl = sl->next;
+ return sl->ss;
+ }
+
+
+### Setting `nullable`
+
+We set `nullable` on the head symbol for any production for which all
+the body symbols (if any) are nullable. As this is a recursive
+definition, any change in the `nullable` setting means that we need to
+re-evaluate where it needs to be set.
+
+We simply loop around performing the same calculations until no more
+changes happen.
+
+###### symbol fields
+ int nullable;
+
+###### functions
+ static void set_nullable(struct grammar *g)
+ {
+ int check_again = 1;
+ while (check_again) {
+ int p;
+ check_again = 0;
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int s;
+
+ if (pr->head->nullable)
+ continue;
+ for (s = 0; s < pr->body_size; s++)
+ if (! pr->body[s]->nullable)
+ break;
+ if (s == pr->body_size) {
+ pr->head->nullable = 1;
+ check_again = 1;
+ }
+ }
+ }
+ }
+
+### Building the `first` sets
+
+When calculating what can follow a particular non-terminal, we will need to
+know what the "first" terminal in a subsequent non-terminal might be. So
+we calculate the `first` set for every non-terminal and store them in an
+array. We don't bother recording the "first" set for terminals as they are
+trivial.
+
+As the "first" for one symbol might depend on the "first" of another,
+we repeat the calculations until no changes happen, just like with
+`set_nullable`. This makes use of the fact that `symset_union`
+reports if any change happens.
+
+The core of this which finds the "first" of part of a production body
+will be reused for computing the "follow" sets, so we split it out
+into a separate function.
+
+###### grammar fields
+ struct symset *first;
+
+###### functions
+
+ static int add_first(struct production *pr, int start,
+ struct symset *target, struct grammar *g,
+ int *to_end)
+ {
+ int s;
+ int changed = 0;
+ for (s = start; s < pr->body_size; s++) {
+ struct symbol *bs = pr->body[s];
+ if (bs->type == Terminal) {
+ if (symset_find(target, bs->num) < 0) {
+ symset_add(target, bs->num, 0);
+ changed = 1;
+ }
+ break;
+ } else if (symset_union(target, &g->first[bs->num]))
+ changed = 1;
+ if (!bs->nullable)
+ break;
+ }
+ if (to_end)
+ *to_end = (s == pr->body_size);
+ return changed;
+ }
+
+ static void build_first(struct grammar *g)
+ {
+ int check_again = 1;
+ int s;
+ g->first = calloc(g->num_syms, sizeof(g->first[0]));
+ for (s = 0; s < g->num_syms; s++)
+ g->first[s] = INIT_SYMSET;
+
+ while (check_again) {
+ int p;
+ check_again = 0;
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ struct symset *head = &g->first[pr->head->num];
+
+ if (add_first(pr, 0, head, g, NULL))
+ check_again = 1;
+ }
+ }
+ }
+
+### Building the `follow` sets.
+
+There are two different situations when we will want to generate "follow"
+sets. If we are doing an SLR analysis, we want to generate a single
+"follow" set for each non-terminal in the grammar. That is what is
+happening here. If we are doing an LALR or LR analysis we will want
+to generate a separate "LA" set for each item. We do that later
+in state generation.
+
+There are two parts to generating a "follow" set. Firstly we look at
+every place that any non-terminal appears in the body of any
+production, and we find the set of possible "first" symbols after
+there. This is done using `add_first` above and only needs to be done
+once as the "first" sets are now stable and will not change.
+
+###### follow code
+
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int b;
+
+ for (b = 0; b < pr->body_size - 1; b++) {
+ struct symbol *bs = pr->body[b];
+ if (bs->type == Terminal)
+ continue;
+ add_first(pr, b+1, &g->follow[bs->num], g, NULL);
+ }
+ }
+
+The second part is to add the "follow" set of the head of a production
+to the "follow" sets of the final symbol in the production, and any
+other symbol which is followed only by `nullable` symbols. As this
+depends on "follow" itself we need to repeatedly perform the process
+until no further changes happen.
+
+###### follow code
+
+ for (again = 0, p = 0;
+ p < g->production_count;
+ p < g->production_count-1
+ ? p++ : again ? (again = 0, p = 0)
+ : p++) {
+ struct production *pr = g->productions[p];
+ int b;
+
+ for (b = pr->body_size - 1; b >= 0; b--) {
+ struct symbol *bs = pr->body[b];
+ if (bs->type == Terminal)
+ break;
+ if (symset_union(&g->follow[bs->num],
+ &g->follow[pr->head->num]))
+ again = 1;
+ if (!bs->nullable)
+ break;
+ }
+ }
+
+We now just need to create and initialise the `follow` list to get a
+complete function.
+
+###### grammar fields
+ struct symset *follow;
+
+###### functions
+ static void build_follow(struct grammar *g)
+ {
+ int s, again, p;
+ g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
+ for (s = 0; s < g->num_syms; s++)
+ g->follow[s] = INIT_SYMSET;
+ ## follow code
+ }
+
+### Building itemsets and states
+
+There are three different levels of detail that can be chosen for
+building the itemsets and states for the LR grammar. They are:
+
+1. LR(0), LR(0.5), or SLR(1), where no per-item look-ahead is considered.
+2. LALR(1) where we build look-ahead sets with each item and merge
+ the LA sets when we find two paths to the same "kernel" set of items.
+3. LR(1) where a different look-ahead for any item in the itemset means
+   a different state must be created.
+
+###### forward declarations
+ enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
+
+We need to be able to look through existing states to see if a newly
+generated state already exists. For now we use a simple sorted linked
+list.
+
+An item is a pair of numbers: the production number and the position of
+"DOT", which is an index into the body of the production.
+As the numbers are not enormous we can combine them into a single "short"
+and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
+production number (so 2000 productions with a maximum of 31 symbols in the
+body).
+
+Comparing the itemsets will be a little different to comparing symsets
+as we want to do the lookup after generating the "kernel" of an
+itemset, so we need to ignore the offset=zero items which are added during
+completion.
+
+To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
+the list in the symset, and then only compare items before the first "0".
+
+###### declarations
+ static inline unsigned short item_num(int production, int index)
+ {
+ return production | (((index-1)&0x1f) << 11);
+ }
+ static inline int item_prod(unsigned short item)
+ {
+ return item & 0x7ff;
+ }
+ static inline int item_index(unsigned short item)
+ {
+ return ((item >> 11)+1) & 0x1f;
+ }
+
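+A quick worked example of the encoding (the numbers are arbitrary):
+
+###### Example: item encoding (sketch)
+    unsigned short itm = item_num(3, 2); /* production 3, DOT after 2 symbols */
+    int p  = item_prod(itm);             /* 3 */
+    int bp = item_index(itm);            /* 2 */
+    /* index 0 is stored as all ones, so completion items sort last:
+     * item_num(3, 0) > item_num(3, 2) is true. */
+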
+For LR(1) analysis we need to compare not just the itemset in a state
+but also the LA sets. As we assign each unique LA set a number, we
+can just compare the symset and the data values together.
+
+###### functions
+ static int itemset_cmp(struct symset a, struct symset b,
+ enum grammar_type type)
+ {
+ int i;
+ int av, bv;
+
+ for (i = 0;
+ i < a.cnt && i < b.cnt &&
+ item_index(a.syms[i]) > 0 &&
+ item_index(b.syms[i]) > 0;
+ i++) {
+ int diff = a.syms[i] - b.syms[i];
+ if (diff)
+ return diff;
+ if (type == LR1) {
+ diff = a.data[i] - b.data[i];
+ if (diff)
+ return diff;
+ }
+ }
+ if (i == a.cnt || item_index(a.syms[i]) == 0)
+ av = -1;
+ else
+ av = a.syms[i];
+ if (i == b.cnt || item_index(b.syms[i]) == 0)
+ bv = -1;
+ else
+ bv = b.syms[i];
+ if (av - bv)
+ return av - bv;
+ if (type < LR1 || av == -1)
+ return 0;
+ return
+ a.data[i] - b.data[i];
+ }
+
+And now we can build the list of itemsets. The lookup routine returns
+both a success flag and a pointer to where in the list an insert
+should happen, so we don't need to search a second time.
+
+###### declarations
+ struct itemset {
+ struct itemset *next;
+ short state;
+ struct symset items;
+ struct symset go_to;
+ char completed;
+ };
+
+###### grammar fields
+ struct itemset *items;
+ int states;
+
+###### functions
+ static int itemset_find(struct grammar *g, struct itemset ***where,
+ struct symset kernel, enum grammar_type type)
+ {
+ struct itemset **ip;
+
+ for (ip = &g->items; *ip ; ip = & (*ip)->next) {
+ struct itemset *i = *ip;
+ int diff;
+ diff = itemset_cmp(i->items, kernel, type);
+ if (diff < 0)
+ continue;
+ if (diff > 0)
+ break;
+ /* found */
+ *where = ip;
+ return 1;
+ }
+ *where = ip;
+ return 0;
+ }
+
+Adding an itemset may require merging the LA sets if LALR analysis is
+happening. If any new LA set adds symbols that weren't in the old LA set, we
+clear the `completed` flag so that the dependants of this itemset will be
+recalculated and their LA sets updated.
+
+`add_itemset` must consume the symsets it is passed, either by adding
+them to a data structure, or freeing them.
+
+ static int add_itemset(struct grammar *g, struct symset ss,
+ enum grammar_type type)
+ {
+ struct itemset **where, *is;
+ int i;
+ int found = itemset_find(g, &where, ss, type);
+ if (!found) {
+ is = calloc(1, sizeof(*is));
+ is->state = g->states;
+ g->states += 1;
+ is->items = ss;
+ is->next = *where;
+ is->go_to = INIT_DATASET;
+ *where = is;
+ return is->state;
+ }
+ is = *where;
+ if (type != LALR) {
+ symset_free(ss);
+ return is->state;
+ }
+ for (i = 0; i < ss.cnt; i++) {
+ struct symset temp = INIT_SYMSET, s;
+ if (ss.data[i] == is->items.data[i])
+ continue;
+ s = set_find(g, is->items.data[i]);
+ symset_union(&temp, &s);
+ s = set_find(g, ss.data[i]);
+ if (symset_union(&temp, &s)) {
+ is->items.data[i] = save_set(g, temp);
+ is->completed = 0;
+ } else
+ symset_free(temp);
+ }
+ symset_free(ss);
+ return is->state;
+ }
+
+#### The build
+
+To build all the itemsets, we first insert the initial itemset made from the
+start symbol, complete each itemset, and then generate new itemsets from old
+until no new ones can be made.
+
+Completing an itemset means finding all the items where "DOT" is followed by
+a nonterminal and adding "DOT=0" items for every production from that
+non-terminal - providing each item hasn't already been added.
+
+If LA sets are needed, the LA set for each new item is found using
+`add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
+be supplemented by the LA set for the item which produces the new item.
+
+We also collect a set of all symbols which follow "DOT" (in `done`) as this
+is used in the next stage.
+
+NOTE: precedence handling should happen here - I haven't written this yet
+though.
+
+###### complete itemset
+ for (i = 0; i < is->items.cnt; i++) {
+ int p = item_prod(is->items.syms[i]);
+ int bs = item_index(is->items.syms[i]);
+ struct production *pr = g->productions[p];
+ int p2;
+ struct symbol *s;
+ struct symset LA = INIT_SYMSET;
+ int sn = 0;
+
+ if (bs == pr->body_size)
+ continue;
+ s = pr->body[bs];
+ if (symset_find(&done, s->num) < 0)
+ symset_add(&done, s->num, 0);
+ if (s->type != Nonterminal)
+ continue;
+ again = 1;
+ if (type >= LALR) {
+ // Need the LA set.
+ int to_end;
+ add_first(pr, bs+1, &LA, g, &to_end);
+ if (to_end) {
+ struct symset ss = set_find(g, is->items.data[i]);
+ symset_union(&LA, &ss);
+ }
+ sn = save_set(g, LA);
+ LA = set_find(g, sn);
+ }
+ /* Add productions for this symbol */
+ for (p2 = s->first_production;
+ p2 < g->production_count &&
+ g->productions[p2]->head == s;
+ p2++) {
+ int itm = item_num(p2, 0);
+ int pos = symset_find(&is->items, itm);
+ if (pos < 0) {
+ symset_add(&is->items, itm, sn);
+ /* Will have re-ordered, so start
+ * from beginning again */
+ i = -1;
+ } else if (type >= LALR) {
+ struct symset ss = set_find(g, is->items.data[pos]);
+ struct symset tmp = INIT_SYMSET;
+
+ symset_union(&tmp, &ss);
+ if (symset_union(&tmp, &LA)) {
+ is->items.data[pos] = save_set(g, tmp);
+ i = -1;
+ }else
+ symset_free(tmp);
+ }
+ }
+ }
+
+For each symbol we found (and placed in `done`) we collect all the items for
+which this symbol is next, and create a new itemset with "DOT" advanced over
+the symbol. This is then added to the collection of itemsets (or merged
+with a pre-existing itemset).
+
+###### derive itemsets
+    // Now we have a completed itemset, so we need to
+    // create all the 'next' itemsets and add them to the
+    // collection if they don't already exist.
+ for (i = 0; i < done.cnt; i++) {
+ int j;
+ int state;
+ struct symset newitemset = INIT_SYMSET;
+ if (type >= LALR)
+ newitemset = INIT_DATASET;
+
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+ int la = 0;
+ int pos;
+
+ if (bp == pr->body_size)
+ continue;
+ if (pr->body[bp]->num != done.syms[i])
+ continue;
+ if (type >= LALR)
+ la = is->items.data[j];
+            pos = symset_find(&newitemset, item_num(p, bp+1));
+ if (pos < 0)
+ symset_add(&newitemset, item_num(p, bp+1), la);
+ else if (type >= LALR) {
+ // Need to merge la set.
+ int la2 = newitemset.data[pos];
+ if (la != la2) {
+ struct symset ss = set_find(g, la2);
+ struct symset LA = INIT_SYMSET;
+ symset_union(&LA, &ss);
+ ss = set_find(g, la);
+ if (symset_union(&LA, &ss))
+ newitemset.data[pos] = save_set(g, LA);
+ else
+ symset_free(LA);
+ }
+ }
+ }
+ state = add_itemset(g, newitemset, type);
+ if (symset_find(&is->go_to, done.syms[i]) < 0)
+ symset_add(&is->go_to, done.syms[i], state);
+ }
+
+All that is left is to create the initial itemset from production zero,
+with `TK_eof` as the LA set.
+
+###### functions
+ static void build_itemsets(struct grammar *g, enum grammar_type type)
+ {
+ struct symset first = INIT_SYMSET;
+ struct itemset *is;
+ int again;
+ int la = 0;
+ if (type >= LALR) {
+ // LA set just has eof
+ struct symset eof = INIT_SYMSET;
+ symset_add(&eof, TK_eof, 0);
+ la = save_set(g, eof);
+ first = INIT_DATASET;
+ }
+ // production 0, offset 0 (with no data)
+ symset_add(&first, item_num(0, 0), la);
+ add_itemset(g, first, type);
+ for (again = 0, is = g->items;
+ is;
+ is = is->next ?: again ? (again = 0, g->items) : NULL) {
+ int i;
+ struct symset done = INIT_SYMSET;
+ if (is->completed)
+ continue;
+ is->completed = 1;
+ ## complete itemset
+ ## derive itemsets
+ symset_free(done);
+ }
+ }
+
+### Completing the analysis.
+
+The exact process of analysis depends on which level was requested. For
+`LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
+`LR1` we need first, but not follow. For `SLR` we need both.
+
+We don't build the "action" tables that you might expect as the parser
+doesn't use them. They will be calculated to some extent if needed for
+a report.
+
+Once we have built everything we allocate arrays for the two lists:
+symbols and itemsets. This allows more efficient access during reporting.
+The symbols are grouped as terminals and non-terminals and we record the
+changeover point in `first_nonterm`.
+
+###### grammar fields
+ struct symbol **symtab;
+ struct itemset **statetab;
+ int first_nonterm;
+
+###### grammar_analyse
+
+ static void grammar_analyse(struct grammar *g, enum grammar_type type)
+ {
+ struct symbol *s;
+ struct itemset *is;
+ int snum = TK_reserved;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0 && s->type == Terminal) {
+ s->num = snum;
+ snum++;
+ }
+ g->first_nonterm = snum;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0) {
+ s->num = snum;
+ snum++;
+ }
+ g->num_syms = snum;
+ g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
+ for (s = g->syms; s; s = s->next)
+ g->symtab[s->num] = s;
+
+ if (type >= SLR) {
+ set_nullable(g);
+ build_first(g);
+ }
+ if (type == SLR)
+ build_follow(g);
+
+ build_itemsets(g, type);
+
+ g->statetab = calloc(g->states, sizeof(g->statetab[0]));
+ for (is = g->items; is ; is = is->next)
+ g->statetab[is->state] = is;
+ }
+
+## Reporting on the Grammar
+
+The purpose of the report is to give the grammar developer insight into
+how the grammar parser will work. It is basically a structured dump of
+all the tables that have been generated, plus a description of any conflicts.
+
+###### grammar_report
+ static int grammar_report(struct grammar *g, enum grammar_type type)
+ {
+ report_symbols(g);
+ if (g->follow)
+ report_follow(g);
+ report_itemsets(g);
+ return report_conflicts(g, type);
+ }
+
+Firstly we have the complete list of symbols, together with the "FIRST"
+set if that was generated.
+
+###### functions
+
+ static void report_symbols(struct grammar *g)
+ {
+ int n;
+ if (g->first)
+ printf("SYMBOLS + FIRST:\n");
+ else
+ printf("SYMBOLS:\n");
+
+ for (n = 0; n < g->num_syms; n++) {
+ struct symbol *s = g->symtab[n];
+ if (!s)
+ continue;
+
+ printf(" %c%3d%c: ",
+ s->nullable ? '*':' ',
+ s->num, symtypes[s->type]);
+ prtxt(s->name);
+ if (s->precedence)
+ printf(" (%d%s)", s->precedence,
+ assoc_names[s->assoc]);
+
+ if (g->first && s->type == Nonterminal) {
+ int j;
+ char c = ':';
+ for (j = 0; j < g->first[n].cnt; j++) {
+ printf("%c ", c);
+ c = ',';
+ prtxt(g->symtab[g->first[n].syms[j]]->name);
+ }
+ }
+ printf("\n");
+ }
+ }
+
+Then we have the follow sets, if they were computed.
+
+ static void report_follow(struct grammar *g)
+ {
+ int n;
+ printf("FOLLOW:\n");
+ for (n = 0; n < g->num_syms; n++)
+ if (g->follow[n].cnt) {
+ int j;
+ char c = ':';
+ printf(" ");
+ prtxt(g->symtab[n]->name);
+ for (j = 0; j < g->follow[n].cnt; j++) {
+ printf("%c ", c);
+ c = ',';
+ prtxt(g->symtab[g->follow[n].syms[j]]->name);
+ }
+ printf("\n");
+ }
+ }
+
+And finally the item sets. These include the GO TO tables and, for
+LALR and LR1, the LA set for each item. Lots of stuff, so we break
+it up a bit. First the items, with production number, precedence and associativity.
+
+ static void report_item(struct grammar *g, int itm)
+ {
+ int p = item_prod(itm);
+ int dot = item_index(itm);
+ struct production *pr = g->productions[p];
+ int i;
+
+ printf(" ");
+ prtxt(pr->head->name);
+ printf(" ->");
+ for (i = 0; i < pr->body_size; i++) {
+ printf(" %s", (dot == i ? ". ": ""));
+ prtxt(pr->body[i]->name);
+ }
+ if (dot == pr->body_size)
+ printf(" .");
+ printf(" [%d]", p);
+ if (pr->precedence)
+ printf(" (%d%s)", pr->precedence,
+ assoc_names[pr->assoc]);
+ printf("\n");
+ }
+
+The LA sets which are (possibly) reported with each item:
+
+ static void report_la(struct grammar *g, int lanum)
+ {
+ struct symset la = set_find(g, lanum);
+ int i;
+ char c = ':';
+
+ printf(" LOOK AHEAD(%d)", lanum);
+ for (i = 0; i < la.cnt; i++) {
+ printf("%c ", c);
+ c = ',';
+ prtxt(g->symtab[la.syms[i]]->name);
+ }
+ printf("\n");
+ }
+
+Then the go to sets:
+
+
+ static void report_goto(struct grammar *g, struct symset gt)
+ {
+ int i;
+ printf(" GOTO:\n");
+
+ for (i = 0; i < gt.cnt; i++) {
+ printf(" ");
+ prtxt(g->symtab[gt.syms[i]]->name);
+ printf(" -> %d\n", gt.data[i]);
+ }
+ }
+
+Now we can report all the item sets complete with items, LA sets, and GO TO.
+
+ static void report_itemsets(struct grammar *g)
+ {
+ int s;
+ printf("ITEM SETS(%d)\n", g->states);
+ for (s = 0; s < g->states; s++) {
+ int j;
+ struct itemset *is = g->statetab[s];
+ printf(" Itemset %d:\n", s);
+ for (j = 0; j < is->items.cnt; j++) {
+ report_item(g, is->items.syms[j]);
+ if (is->items.data != NO_DATA)
+ report_la(g, is->items.data[j]);
+ }
+ report_goto(g, is->go_to);
+ }
+ }
+
+### Reporting conflicts
+
+Conflict detection varies a lot among different analysis levels. However
+LR0 and LR05 are quite similar - having no follow sets - and SLR, LALR and
+LR1 are also similar as they have FOLLOW or LA sets.
+
+###### functions
+
+ ## conflict functions
+
+ static int report_conflicts(struct grammar *g, enum grammar_type type)
+ {
+ int cnt = 0;
+ printf("Conflicts:\n");
+ if (type < SLR)
+ cnt = conflicts_lr0(g, type);
+ else
+ cnt = conflicts_slr(g, type);
+ if (cnt == 0)
+ printf(" - no conflicts\n");
+ return cnt;
+ }
+
+LR0 conflicts occur in any state which has both a reducible item and
+a shiftable item.
+
+LR05 conflicts only occur if two possible reductions exist,
+as shifts always over-ride reductions.
+
+###### conflict functions
+ static int conflicts_lr0(struct grammar *g, enum grammar_type type)
+ {
+ int i;
+ int cnt = 0;
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ int last_reduce = -1;
+ int prev_reduce = -1;
+ int last_shift = -1;
+ int j;
+ if (!is)
+ continue;
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp == pr->body_size) {
+ prev_reduce = last_reduce;
+ last_reduce = j;
+ continue;
+ }
+ if (pr->body[bp]->type == Terminal)
+ last_shift = j;
+ }
+ if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
+ printf(" State %d has both SHIFT and REDUCE:\n", i);
+ report_item(g, is->items.syms[last_shift]);
+ report_item(g, is->items.syms[last_reduce]);
+ cnt++;
+ }
+ if (prev_reduce >= 0) {
+ printf(" State %d has 2 (or more) reducible items\n", i);
+ report_item(g, is->items.syms[prev_reduce]);
+ report_item(g, is->items.syms[last_reduce]);
+ cnt++;
+ }
+ }
+ return cnt;
+ }
+
+SLR, LALR, and LR1 conflicts happen if two reducible items have overlapping
+look-ahead, or if a symbol in a look-ahead can be shifted. They differ only
+in the source of the look-ahead set.
+
+We build a dataset mapping terminal to item for possible SHIFTs and then
+another for possible REDUCE operations. We report when we get conflicts
+between the two.
+
+ static int conflicts_slr(struct grammar *g, enum grammar_type type)
+ {
+ int i;
+ int cnt = 0;
+
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ struct symset shifts = INIT_DATASET;
+ struct symset reduce = INIT_DATASET;
+ int j;
+ if (!is)
+ continue;
+ /* First collect the shifts */
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp < pr->body_size &&
+ pr->body[bp]->type == Terminal) {
+ /* shiftable */
+ int sym = pr->body[bp]->num;
+ if (symset_find(&shifts, sym) < 0)
+ symset_add(&shifts, sym, itm);
+ }
+ }
+ /* Now look for reduction and conflicts */
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp < pr->body_size)
+ continue;
+ /* reducible */
+ struct symset la;
+ if (type == SLR)
+ la = g->follow[pr->head->num];
+ else
+ la = set_find(g, is->items.data[j]);
+ int k;
+ for (k = 0; k < la.cnt; k++) {
+ int pos = symset_find(&shifts, la.syms[k]);
+ if (pos >= 0) {
+ printf(" State %d has SHIFT/REDUCE conflict on ", i);
+ prtxt(g->symtab[la.syms[k]]->name);
+ printf(":\n");
+ report_item(g, shifts.data[pos]);
+ report_item(g, itm);
+ cnt++;
+ }
+ pos = symset_find(&reduce, la.syms[k]);
+ if (pos < 0) {
+ symset_add(&reduce, la.syms[k], itm);
+ continue;
+ }
+ printf(" State %d has REDUCE/REDUCE conflict on ", i);
+ prtxt(g->symtab[la.syms[k]]->name);
+ printf(":\n");
+ report_item(g, itm);
+ report_item(g, reduce.data[pos]);
+ cnt++;
+ }
+ }
+ symset_free(shifts);
+ symset_free(reduce);
+ }
+ return cnt;
+ }
+
+
+## Generating the parser
+
+The exported part of the parser is the `parse_XX` function, where the name
+`XX` is based on the name of the parser files.
+
+This takes a `code_node`, a partially initialized `token_config`, and an
+optional `FILE` to send tracing to. The `token_config` gets the list of
+known words added and then is used with the `code_node` to initialize the
+scanner.
+
+`parse_XX` then calls the library function `parser_run` to actually complete
+the parse. This needs the `states` table and functions to call the various
+pieces of code provided in the grammar file, so they are generated first.
+
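+As an illustration only, if the output file was named `calc` the generated
+entry point is `parse_calc()`, declared in `calc.h`. Here `code` is assumed
+to be a `struct code_node *` extracted by `mdcode`, and the `token_config`
+fields shown are placeholders which a real caller would set up for the
+language being parsed:
+
+###### Example: calling a generated parser (sketch)
+    struct token_config config = {
+        .word_start = "", .word_cont = "", .number_chars = "",
+    };
+    void *result = parse_calc(code, &config, stderr); /* stderr enables tracing */
+    /* ... use result, then free it with the matching free_XX() ... */
+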
+###### parser_generate
+
+ static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
+ {
+ gen_known(f, g);
+ gen_goto(f, g);
+ gen_states(f, g);
+ gen_reduce(f, g, file);
+ gen_free(f, g);
+
+ fprintf(f, "#line 0 \"gen_parser\"\n");
+ fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
+ name);
+ fprintf(f, "{\n");
+ fprintf(f, "\tstruct token_state *tokens;\n");
+ fprintf(f, "\tconfig->words_marks = known;\n");
+ fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
+ fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
+ fprintf(f, "\ttokens = token_open(code, config);\n");
+ fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
+ fprintf(f, "\ttoken_close(tokens);\n");
+ fprintf(f, "\treturn rv;\n");
+ fprintf(f, "}\n\n");
+ }
+
+### Known words table
+
+The table of known words is simply an array of the terminal symbols.
+
+###### functions
+
+ static void gen_known(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_known\"\n");
+ fprintf(f, "static const char *known[] = {\n");
+ for (i = TK_reserved;
+ i < g->num_syms && g->symtab[i]->type == Terminal;
+ i++)
+ fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
+ g->symtab[i]->name.txt);
+ fprintf(f, "};\n\n");
+ }
+
+### States and the goto tables.
+
+For each state we record the goto table and the reducible production if
+there is one.
+Some of the details of the reducible production are stored in the
+`do_reduce` function to come later. Here we store the production number,
+the body size (useful for stack management) and the resulting symbol (useful
+for knowing how to free data later).
+
+The go to table is stored in a simple array of `sym` and corresponding
+`state`.
+
+###### exported types
+
+ struct lookup {
+ short sym;
+ short state;
+ };
+ struct state {
+ short go_to_cnt;
+ const struct lookup * go_to;
+ short reduce_prod;
+ short reduce_size;
+ short reduce_sym;
+ };
+
+
+###### functions
+
+ static void gen_goto(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_goto\"\n");
+ for (i = 0; i < g->states; i++) {
+ int j;
+ fprintf(f, "static const struct lookup goto_%d[] = {\n",
+ i);
+ struct symset gt = g->statetab[i]->go_to;
+ for (j = 0; j < gt.cnt; j++)
+ fprintf(f, "\t{ %d, %d },\n",
+ gt.syms[j], gt.data[j]);
+ fprintf(f, "};\n");
+ }
+ }
+
+###### functions
+
+ static void gen_states(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_states\"\n");
+ fprintf(f, "static const struct state states[] = {\n");
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ int j, prod = -1;
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp < pr->body_size)
+ continue;
+ /* This is what we reduce */
+ prod = p;
+ break;
+ }
+
+ fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
+ i, is->go_to.cnt, i, prod,
+ prod < 0 ? -1 : g->productions[prod]->body_size,
+ prod < 0 ? -1 : g->productions[prod]->head->num);
+ }
+ fprintf(f, "};\n\n");
+ }
+
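+The output produced by `gen_goto` and `gen_states` might look roughly like
+the following (the symbol and state numbers here are invented):
+
+###### Example: generated goto and state tables (sketch)
+    static const struct lookup goto_0[] = {
+        { 4, 1 },   /* on symbol 4, go to state 1 */
+        { 7, 3 },   /* on symbol 7, go to state 3 */
+    };
+    static const struct state states[] = {
+        [0] = { 2, goto_0, -1, -1, -1 },   /* 2 gotos, nothing to reduce */
+        ...
+    };
+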
+### The `do_reduce` function and the code
+
+When the parser engine decides to reduce a production, it calls `do_reduce`.
+This has two functions.
+
+Firstly, if a non-NULL `trace` file is passed, it prints out details of the
+production being reduced. Secondly it runs the code that was included with
+the production if any.
+
+This code needs to be able to store data somewhere. Rather than requiring
+`do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
+`do_reduce` return the size to be saved.
+
+The code fragment requires translation when written out. Any `$N` needs to
+be converted to a reference either to that buffer (if `$0`) or to the
+structure returned by a previous reduction. These pointers need to be cast
+to the appropriate type for each access. All this is handled in
+`gen_code`.
+
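+As a concrete but hypothetical example, for a production
+`expr -> expr + expr` whose head was declared with type `$expr`, a fragment
+in the grammar and the code that `gen_code` writes for it look roughly like:
+
+###### Example: gen_code translation (sketch)
+    /* fragment as written in the grammar: */
+    $0.val = $1.val + $3.val;
+    /* roughly what gen_code emits ($2, the '+', would be a struct token): */
+    (*(struct expr*)ret).val = (*(struct expr*)body[0]).val +
+        (*(struct expr*)body[2]).val;
+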
+
+###### functions
+
+ static void gen_code(struct production *p, FILE *f, struct grammar *g)
+ {
+ char *c;
+ fprintf(f, "\t\t\t");
+ for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
+ int n;
+ if (*c != '$') {
+ fputc(*c, f);
+ if (*c == '\n')
+ fputs("\t\t\t", f);
+ continue;
+ }
+ c++;
+ if (*c < '0' || *c > '9') {
+ fputc(*c, f);
+ continue;
+ }
+ n = *c - '0';
+ while (c[1] >= '0' && c[1] <= '9') {
+ c += 1;
+ n = n * 10 + *c - '0';
+ }
+ if (n == 0)
+ fprintf(f, "(*(struct %.*s*)ret)",
+ p->head->struct_name.len,
+ p->head->struct_name.txt);
+ else if (n > p->body_size)
+ fprintf(f, "$%d", n);
+ else if (p->body[n-1]->type == Terminal)
+ fprintf(f, "(*(struct token *)body[%d])",
+ n-1);
+ else if (p->body[n-1]->struct_name.txt == NULL)
+ fprintf(f, "$%d", n);
+ else
+ fprintf(f, "(*(struct %.*s*)body[%d])",
+ p->body[n-1]->struct_name.len,
+ p->body[n-1]->struct_name.txt, n-1);
+ }
+ fputs("\n", f);
+ }
+
+###### functions
+
+ static void gen_reduce(FILE *f, struct grammar *g, char *file)
+ {
+ int i, j;
+ fprintf(f, "#line 0 \"gen_reduce\"\n");
+ fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
+ fprintf(f, " void *ret, FILE *trace)\n");
+ fprintf(f, "{\n");
+ fprintf(f, "\tint ret_size = 0;\n");
+
+ fprintf(f, "\tswitch(prod) {\n");
+ for (i = 0; i < g->production_count; i++) {
+ struct production *p = g->productions[i];
+ fprintf(f, "\tcase %d:\n", i);
+
+ if (p->code.txt)
+ gen_code(p, f, g);
+
+ fprintf(f, "\t\tif (trace) {\n");
+ fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
+ p->head->name.len, p->head->name.txt);
+ for (j = 0; j < p->body_size; j++) {
+ if (p->body[j]->type == Terminal) {
+ fputs("\t\t\tfputs(\" \", trace);\n", f);
+ fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
+ } else {
+ fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
+ p->body[j]->name.len,
+ p->body[j]->name.txt);
+ }
+ }
+ fprintf(f, "\t\t}\n");
+
+ if (p->head->struct_name.txt)
+ fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
+ p->head->struct_name.len,
+ p->head->struct_name.txt);
+
+ fprintf(f, "\t\tbreak;\n");
+ }
+ fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
+ }
+
+### `do_free`
+
+As each non-terminal can potentially cause a different type of data
+structure to be allocated and filled in, we need to be able to free it when
+done.
+
+It is particularly important to have fine control over freeing during error
+recovery where individual stack frames might need to be freed.
+
+For this, the grammar author is required to define a `free_XX` function for
+each structure that is used by a non-terminal. `do_free` will call whichever
+is appropriate given a symbol number, and will call `free` (as is
+appropriate for tokens) on any terminal symbol.
+
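+A minimal sketch of such a function, assuming some non-terminal was given
+the type `$expr` and that `struct expr` owns no other allocations:
+
+###### Example: a free_XX function (sketch)
+    static void free_expr(struct expr *e)
+    {
+        if (!e)
+            return;
+        /* free any fields that own memory first, then the node itself */
+        free(e);
+    }
+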
+###### functions
+
+ static void gen_free(FILE *f, struct grammar *g)
+ {
+ int i;
+
+ fprintf(f, "#line 0 \"gen_free\"\n");
+ fprintf(f, "static void do_free(short sym, void *asn)\n");
+ fprintf(f, "{\n");
+ fprintf(f, "\tif (!asn) return;\n");
+ fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
+ fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
+ fprintf(f, "\tswitch(sym) {\n");
+
+ for (i = 0; i < g->num_syms; i++) {
+ struct symbol *s = g->symtab[i];
+ if (!s ||
+ s->type != Nonterminal ||
+ s->struct_name.txt == NULL)
+ continue;
+
+ fprintf(f, "\tcase %d:\n", s->num);
+ fprintf(f, "\t\tfree_%.*s(asn);\n",
+ s->struct_name.len,
+ s->struct_name.txt);
+ fprintf(f, "\t\tbreak;\n");
+ }
+ fprintf(f, "\t}\n}\n\n");
+ }
+
+## The main routine.
+
+There are three key parts to the "main" function of parsergen: processing
+the arguments, loading the grammar file, and dealing with the grammar.
+
+### Argument processing.
+
+Command line options allow the selection of analysis mode, name of output
+file, and whether or not a report should be generated. By default we create
+a report only if no code output was requested.
+
+The `parse_XX` function name uses the basename of the output file which
+should not have a suffix (`.c`). `.c` is added to the given name for the
+code, and `.h` is added for the header (if header text is specified in the
+grammar file).
+
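+For example, one plausible invocation would be:
+
+###### Example: invoking parsergen (sketch)
+    ./parsergen --LALR --report -o calc calc.cgm
+
+which performs LALR analysis on `calc.cgm`, prints the report, and writes
+`calc.c` (and `calc.h` if a `header` section is present) containing
+`parse_calc()`.
+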
+###### includes
+ #include <getopt.h>
+
+###### declarations
+ static const struct option long_options[] = {
+ { "LR0", 0, NULL, '0' },
+ { "LR05", 0, NULL, '5' },
+ { "SLR", 0, NULL, 'S' },
+ { "LALR", 0, NULL, 'L' },
+ { "LR1", 0, NULL, '1' },
+ { "report", 0, NULL, 'R' },
+ { "output", 1, NULL, 'o' },
+ { NULL, 0, NULL, 0 }
+ };
+ const char *options = "05SL1Ro:";
+
+###### process arguments
+ int opt;
+ char *outfile = NULL;
+ char *infile;
+ char *name;
+ int report = 1;
+ enum grammar_type type = LR05;
+ while ((opt = getopt_long(argc, argv, options,
+ long_options, NULL)) != -1) {
+ switch(opt) {
+ case '0':
+ type = LR0; break;
+ case '5':
+ type = LR05; break;
+ case 'S':
+ type = SLR; break;
+ case 'L':
+ type = LALR; break;
+ case '1':
+ type = LR1; break;
+ case 'R':
+ report = 2; break;
+ case 'o':
+ outfile = optarg; break;
+ default:
+ fprintf(stderr, "Usage: parsergen ...\n");
+ exit(1);
+ }
+ }
+ if (optind < argc)
+ infile = argv[optind++];
+ else {
+ fprintf(stderr, "No input file given\n");
+ exit(1);
+ }
+ if (outfile && report == 1)
+ report = 0;
+ name = outfile;
+ if (name && strchr(name, '/'))
+ name = strrchr(name, '/')+1;
+
+ if (optind < argc) {
+ fprintf(stderr, "Excess command line arguments\n");
+ exit(1);
+ }
+
+### Loading the grammar file
+
+To be able to run `mdcode` and `scanner` on the grammar we need to memory
+map it.
+
+Once we have extracted the code (with `mdcode`) we expect to find three
+sections: header, code, and grammar. Anything else is an error.
+
+"header" and "code" are optional, though it is hard to build a working
+parser with neither. "grammar" must be provided.
+
+###### includes
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <errno.h>
+
+###### functions
+ static int errs;
+ static void pr_err(char *msg)
+ {
+ errs++;
+ fprintf(stderr, "%s\n", msg);
+ }
+
+###### load file
+ struct section *table;
+ int fd;
+ int len;
+ char *file;
+ fd = open(infile, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "parsergen: cannot open %s: %s\n",
+ infile, strerror(errno));
+ exit(1);
+ }
+    len = lseek(fd, 0, SEEK_END);
+ file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ table = code_extract(file, file+len, pr_err);
+
+ struct code_node *hdr = NULL;
+ struct code_node *code = NULL;
+ struct code_node *gram = NULL;
+ for (s = table; s; s = s->next) {
+ if (text_is(s->section, "header"))
+ hdr = s->code;
+ else if (text_is(s->section, "code"))
+ code = s->code;
+ else if (text_is(s->section, "grammar"))
+ gram = s->code;
+ else {
+ fprintf(stderr, "Unknown content section: %.*s\n",
+ s->section.len, s->section.txt);
+ rv |= 2;
+ }
+ }
+
+### Processing the input
+
+As we need to append an extension to a filename and then open it for
+writing, and we need to do this twice, it helps to have a separate function.
+
+###### functions
+
+ static FILE *open_ext(char *base, char *ext)
+ {
+ char *fn = malloc(strlen(base) + strlen(ext) + 1);
+ FILE *f;
+ strcat(strcpy(fn, base), ext);
+ f = fopen(fn, "w");
+ free(fn);
+ return f;
+ }
+
+If we can read the grammar, then we analyse and optionally report on it. If
+the report finds conflicts we will exit with an error status. Separate bits
+in the exit status distinguish conflicts, grammar errors, and failures to
+create output files.
+
+###### process input
+ struct grammar *g = NULL;
+ if (gram == NULL) {
+ fprintf(stderr, "No grammar section provided\n");
+ rv |= 2;
+ } else {
+ g = grammar_read(gram);
+ if (!g) {
+ fprintf(stderr, "Failure to parse grammar\n");
+ rv |= 2;
+ }
+ }
+ if (g) {
+ grammar_analyse(g, type);
+ if (report)
+ if (grammar_report(g, type))
+ rv |= 1;
+ }
+
+If a header section is defined, we write it out and include a declaration
+for the `parse_XX` function so it can be used from separate code.
+
+ if (rv == 0 && hdr && outfile) {
+ FILE *f = open_ext(outfile, ".h");
+ if (f) {
+ code_node_print(f, hdr, infile);
+ fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
+ name);
+ fclose(f);
+ } else {
+ fprintf(stderr, "Cannot create %s.h\n",
+ outfile);
+ rv |= 4;
+ }
+ }
+
+And if all goes well and an output file was provided, we create the `.c`
+file with the code section (if any) and the parser tables and function.
+
+ if (rv == 0 && outfile) {
+ FILE *f = open_ext(outfile, ".c");
+ if (f) {
+ if (code)
+ code_node_print(f, code, infile);
+ gen_parser(f, g, infile, name);
+ fclose(f);
+ } else {
+ fprintf(stderr, "Cannot create %s.c\n",
+ outfile);
+ rv |= 4;
+ }
+ }
+
+And that about wraps it up. We need to set the locale so that UTF-8 is
+recognised properly, and link with `libicuuc` as `libmdcode` requires that.
+
+###### File: parsergen.mk
+ parsergen : parsergen.o libscanner.o libmdcode.o
+ $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
+
+###### includes
+ #include <locale.h>
+
+###### main
+
+ int main(int argc, char *argv[])
+ {
+ struct section *s;
+ int rv = 0;
+
+ setlocale(LC_ALL,"");
+
+ ## process arguments
+ ## load file
+ ## process input
+
+ return rv;
+ }
+
+## The SHIFT/REDUCE parser
+
+Having analysed the grammar and generated all the tables, we only need the
+shift/reduce engine to bring it all together.
+
+### Goto table lookup
+
+The parser generator has nicely provided us with goto tables sorted by
+symbol number. We need a binary search function to find a symbol in the
+table.
+
+###### parser
+
+ static int search(const struct state *l, int sym)
+ {
+ int lo = 0;
+ int hi = l->go_to_cnt;
+
+ if (hi == 0)
+ return -1;
+ while (lo + 1 < hi) {
+ int mid = (lo + hi) / 2;
+ if (l->go_to[mid].sym <= sym)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (l->go_to[lo].sym == sym)
+ return l->go_to[lo].state;
+ else
+ return -1;
+ }
+
+### The state stack.
+
+The core data structure for the parser is the stack. This tracks all the
+symbols that have been recognised or partially recognised.
+
+The stack usually won't grow very large - maybe a few tens of entries. So
+we dynamically resize the array as required but never bother to shrink it
+down again.
+
+We keep the stack as two separate allocations. One, `asn_stack`, stores the
+"abstract syntax nodes" which are created by each reduction. When we call
+`do_reduce` we need to pass an array of the `asn`s of the body of the
+production, and by keeping a separate `asn` stack, we can just pass a
+pointer into this stack.
+
+The other allocation stores all other stack fields, of which there are two.
+The `state` is the most important one and guides the parsing process. The
+`sym` is nearly unnecessary. However when we want to free entries from the
+`asn_stack`, it helps to know what type they are so we can call the right
+freeing function. The symbol leads us to the right free function through
+`do_free`.
+
+###### parser
+
+	struct parser {
+		int state;		// current parser state: index into the states array
+		struct frame {
+			short state;	// state that was active when this entry was pushed
+			short sym;	// symbol number, lets do_free pick the right free function
+		} *stack;
+		void **asn_stack;	// abstract syntax node for each stack entry
+		int stack_size;		// allocated entries in both stacks
+		int tos;		// first unused entry: the top of stack
+	};
+
+#### Shift and pop
+
+Two operations are needed on the stack - shift (which is like push) and pop.
+
+Shift applies not only to terminals but also to non-terminals. When we
+reduce a production we will pop off entries corresponding to the body
+symbols, then push on an item for the head of the production. This last is
+exactly the same process as shifting in a terminal so we use the same
+function for both.
+
+To simplify other code we arrange for `shift` to fail if there is no `goto`
+state for the symbol. This is useful in basic parsing due to our design
+that we shift when we can, and reduce when we cannot. So the `shift`
+function reports whether it could.
+
+So `shift` finds the next state. If that succeeds it extends the
+allocations if needed and pushes all the information onto the stacks.
+
+###### parser
+
+ static int shift(struct parser *p,
+ int sym, void *asn,
+ const struct state states[])
+ {
+ // Push an entry onto the stack
+ int newstate = search(&states[p->state], sym);
+ if (newstate < 0)
+ return 0;
+ if (p->tos >= p->stack_size) {
+ p->stack_size += 10;
+ p->stack = realloc(p->stack, p->stack_size
+ * sizeof(p->stack[0]));
+ p->asn_stack = realloc(p->asn_stack, p->stack_size
+ * sizeof(p->asn_stack[0]));
+ }
+ p->stack[p->tos].state = p->state;
+ p->stack[p->tos].sym = sym;
+ p->asn_stack[p->tos] = asn;
+ p->tos++;
+ p->state = newstate;
+ return 1;
+ }
+
+`pop` simply moves the top of stack (`tos`) back down the required amount
+and frees any `asn` entries that need to be freed. It is called _after_ we
+reduce a production, just before we `shift` the nonterminal in.
+
+###### parser
+
+ static void pop(struct parser *p, int num,
+ void(*do_free)(short sym, void *asn))
+ {
+ int i;
+ p->tos -= num;
+ for (i = 0; i < num; i++)
+ do_free(p->stack[p->tos+i].sym,
+ p->asn_stack[p->tos+i]);
+
+ p->state = p->stack[p->tos].state;
+ }
+
+### Memory allocation
+
+The `scanner` returns tokens in a local variable - we want them in allocated
+memory so they can live in the `asn_stack`. Similarly the `asn` produced by
+a reduce is in a large buffer. Both of these require some allocation and
+copying, hence `memdup` and `tok_copy`.
+
+###### parser includes
+ #include <memory.h>
+
+###### parser
+
+ void *memdup(void *m, int len)
+ {
+ void *ret;
+ if (len == 0)
+ return NULL;
+ ret = malloc(len);
+ memcpy(ret, m, len);
+ return ret;
+ }
+
+ static struct token *tok_copy(struct token tk)
+ {
+ struct token *new = malloc(sizeof(*new));
+ *new = tk;
+ return new;
+ }
+
+### The heart of the parser.
+
+Now we have the parser. If we can shift, we do. If not, and we can reduce,
+we do. If the production we reduced was production zero, then we have
+accepted the input and can finish.
+
+If we can neither shift nor reduce we have an error to handle. We pop
+single entries off the stack until we can shift the `TK_error` symbol, then
+drop input tokens until we find one we can shift in the new error state.
+
+We return whatever `asn` was returned by reducing production zero.
+
+###### parser includes
+ #include "parser.h"
+###### parser
+ void *parser_run(struct token_state *tokens,
+ const struct state states[],
+ int (*do_reduce)(int, int, void**, void*, FILE*),
+ void (*do_free)(short, void*),
+ FILE *trace)
+ {
+ struct parser p = { 0 };
+ struct token *tk;
+ int accepted = 0;
+		void *ret = NULL;
+
+ tk = tok_copy(token_next(tokens));
+ while (!accepted) {
+ if (shift(&p, tk->num, tk, states)) {
+ if (trace) {
+ fputs("Shift ", trace);
+ text_dump(trace, tk->txt, 20);
+ fputs("\n", trace);
+ }
+ tk = tok_copy(token_next(tokens));
+ continue;
+ }
+ if (states[p.state].reduce_prod >= 0) {
+ void **body;
+ int prod = states[p.state].reduce_prod;
+ int size = states[p.state].reduce_size;
+ int sym = states[p.state].reduce_sym;
+ int bufsize;
+ static char buf[16*1024];
+
+ body = p.asn_stack +
+ (p.tos - states[p.state].reduce_size);
+
+ bufsize = do_reduce(prod, p.tos, body,
+ buf, trace);
+ if (trace)
+ fputs("\n", trace);
+
+ pop(&p, size, do_free);
+ shift(&p, sym, memdup(buf, bufsize), states);
+ if (prod == 0)
+ accepted = 1;
+ continue;
+ }
+			/* Error. We walk up the stack until we
+			 * find a state which will accept TK_error.
+			 * We then shift in TK_error and see what state
+			 * that takes us to.
+			 * Then we discard input tokens until
+			 * we find one that is acceptable.
+			 */
+			while (shift(&p, TK_error, tk, states) == 0
+			       && p.tos > 0)
+				// discard this state
+				pop(&p, 1, do_free);
+ tk = tok_copy(*tk);
+ while (search(&states[p.state], tk->num) < 0 &&
+ tk->num != TK_eof) {
+ free(tk);
+ tk = tok_copy(token_next(tokens));
+ }
+ if (p.tos == 0 && tk->num == TK_eof)
+ break;
+ }
+ free(tk);
+ if (accepted)
+ ret = p.asn_stack[0];
+ else
+ pop(&p, p.tos, do_free);
+ free(p.asn_stack);
+ free(p.stack);
+ return ret;
+ }
+
+###### exported functions
+ void *parser_run(struct token_state *tokens,
+ const struct state states[],
+ int (*do_reduce)(int, int, void**, void*, FILE*),
+ void (*do_free)(short, void*),
+ FILE *trace);
+
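+To show how these pieces fit together, here is a rough sketch of the sort
+of wrapper the generated code is expected to provide. The table name
+`calc_states` and the callbacks `do_reduce` and `do_free` stand in for
+whatever `gen_parser` actually emits, and `token_open`/`token_close` are
+assumed to be the `scanner` routines for creating and discarding a token
+stream; this is an illustration only, not part of the generated files.
+
+	void *parse_calc(struct code_node *code, struct token_config *config,
+	                 FILE *trace)
+	{
+		// Illustration: turn the extracted code into a token stream
+		// and let parser_run() drive the generated tables and callbacks.
+		struct token_state *tokens = token_open(code, config);
+		void *ret = parser_run(tokens, calc_states,
+				       do_reduce, do_free, trace);
+		token_close(tokens);
+		return ret;
+	}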
+
+# A Worked Example
+
+The obvious example for a parser is a calculator.
+
+As `scanner` provides a number parsing function using `libgmp`, it is not
+much work to perform arbitrary rational number calculations.
+
+This calculator takes one expression, or one equality test, per line. The
+results are printed and if any equality test fails, the program exits with
+an error.
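+
+For example, an input line `3 * (4 + 5)` prints `Answer = 27` (followed by
+a decimal rendering of the same value), while `355/113 = 22/7` reports that
+the two sides are not equal and makes the program exit with an error.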
+
+Embedding mdcode inside mdcode is rather horrible. I'd like to find a
+better approach, but as the grammar file must have 3 components I need
+something like this.
+
+###### File: parsergen.mk
+ calc.c : parsergen calc.cgm
+ ./parsergen -o calc calc.cgm
+ calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
+ $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+
+###### demo grammar
+
+ # header
+ ~~~~~
+
+ #include "number.h"
+ // what do we use for a demo-grammar? A calculator of course.
+ struct number {
+ mpq_t val;
+ char tail[2];
+ int err;
+ };
+
+ ~~~~~
+ # code
+ ~~~~~
+
+ #include <stdlib.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <stdio.h>
+ #include <malloc.h>
+ #include <gmp.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ #include "number.h"
+ #include "parser.h"
+
+ #include "calc.h"
+
+ static void free_number(struct number *n)
+ {
+ mpq_clear(n->val);
+ free(n);
+ }
+
+ int main(int argc, char *argv[])
+ {
+ int fd = open(argv[1], O_RDONLY);
+		int len = lseek(fd, 0, SEEK_END);
+ char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ struct section *s = code_extract(file, file+len, NULL);
+ struct token_config config = {
+ .ignored = (1 << TK_line_comment)
+ | (1 << TK_block_comment)
+ | (1 << TK_indent)
+ | (1 << TK_undent),
+ .number_chars = ".,_+-",
+ .word_start = "",
+ .word_cont = "",
+ };
+ parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+ exit(0);
+ }
+
+ ~~~~~
+ # grammar
+ ~~~~~
+
+ Session -> Session Line
+ | Line
+
+ Line -> Expression NEWLINE ${ gmp_printf( "Answer = %Qd\n", $1.val);
+ { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
+ gmp_printf( " or as a decimal: %Fg\n", fl);
+ mpf_clear(fl);
+ }
+ }$
+ | Expression = Expression NEWLINE ${
+ if (mpq_equal($1.val, $3.val))
+ gmp_printf( "Both equal %Qd\n", $1.val);
+ else {
+ gmp_printf( "NOT EQUAL: %Qd\n != : %Qd\n",
+ $1.val, $3.val);
+ exit(1);
+ }
+ }$
+ | NEWLINE ${ printf( "Blank line\n"); }$
+ | ERROR NEWLINE ${ printf( "Skipped a bad line\n"); }$
+
+ $number
+ Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
+ | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
+ | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+
+ Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
+ | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+ | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+
+ Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
+ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$