parsergen: move EOL handling out of shift()

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 2da1d65bb7b2f962af024ef6b7d7f20a9e2e1f3e..d9467442eeab1dfec4720f7a2b4f5c6666d43867 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -1,9 +1,14 @@
-# LR(1) Parser Generator #
+# LR(1) Parser Generator with 2D support #
  
  This parser generator takes a grammar description combined with code
  fragments, analyses it, and can report details about the analysis and
  write out C code files which can be compiled to make a parser.
  
+"2D support" means that indentation and line breaks can be significant.
+Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
+be used to describe the grammar and the parser can selectively ignore
+these where they aren't relevant.
+
  There are several distinct sections.
  
  1. `grammar_read` will read a grammar from a literate-code file and
@@ -52,18 +57,20 @@ There are several distinct sections.
  
  ## Reading the grammar
  
-The grammar must be stored in a markdown literate code file as parsed
-by "[mdcode][]".  It must have three top level (i.e. unreferenced)
-sections: `header`, `code`, and `grammar`.  The first two will be
-literally copied into the generated `.c` and `.h`. files.  The last
-contains the grammar.  This is tokenised with "[scanner][]".
+The grammar must be stored in a markdown literate code file as parsed by
+"[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
+sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
+present, will be literally copied into the generated `.c` and `.h`
+files.  The third is required and contains the grammar which is
+tokenised with "[scanner][]".  `reduce` can provide variable
+declarations and common intialization code for all the reduction code
+fragments in the grammar.
  
  If the `--tag` option is given, then any top level header that doesn't
  start with the tag is ignored, and the tag is striped from the rest.  So
-`--tag Foo`
-means that the three needed sections must be `Foo: header`, `Foo: code`,
-and `Foo: grammar`.  The tag `calc` is used to extract the same calculator
-from this file.
+`--tag Foo` means that the four sections expected are `Foo: header`,
+`Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
+to extract the sample calculator from this file.
  
  [mdcode]: mdcode.html
  [scanner]: scanner.html
@@ -77,10 +84,11 @@ from this file.
         #include "scanner.h"
  
  The grammar contains production sets, precedence/associativity
-declarations, and data type declarations.  These are all parsed with
-_ad hoc_ parsing as we don't have a parser generator yet.
+declarations, general terminal declarations, and data type declarations.
+These are all parsed with _ad hoc_ parsing as we don't have a parser
+generator yet.
  
-The precedence and associativity can be set for each production, but
+The precedence and associativity can be set for each production, and
  can be inherited from symbols.  The data type (either structure or a
  reference to a structure) is potentially defined for each non-terminal
  and describes what C structure is used to store information about each
@@ -100,18 +108,17 @@ symbol.
         struct production {
                 unsigned short precedence;
                 enum assoc assoc;
-               char line_like;
                 ## production fields
         };
         struct grammar {
                 ## grammar fields
         };
  
-The strings reported by `mdcode` and `scanner` are `struct text` which have
-length rather than being null terminated.  To help with printing and
+The strings reported by `mdcode` and `scanner` are `struct text` which
+have length rather than being nul terminated.  To help with printing and
  comparing we define `text_is` and `prtxt`, which should possibly go in
-`mdcode`.  `scanner` does provide `text_dump` which is useful for strings
-which might contain control characters.
+`mdcode`.  `scanner` does provide `text_dump` which is useful for
+strings which might contain control characters.
  
  `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
  because that is what we need to detect tags.
@@ -145,15 +152,16 @@ because that is what we need to detect tags.
  
  Productions are comprised primarily of symbols - terminal and
  non-terminal.  We do not make a syntactic distinction between the two,
-though non-terminals must be identifiers.  Non-terminal symbols are
-those which appear in the head of a production, terminal symbols are
-those which don't.  There are also "virtual" symbols used for precedence
-marking discussed later, and sometimes we won't know what type a symbol
-is yet.
+though non-terminals must be identifiers while terminals can also be
+marks.  Non-terminal symbols are those which appear in the head of a
+production, terminal symbols are those which don't.  There are also
+"virtual" symbols used for precedence marking discussed later, and
+sometimes we won't know what type a symbol is yet.
  
  To help with code safety it is possible to declare the terminal symbols.
  If this is done, then any symbol used in a production that does not
-appear in a head and is not declared is treated as an error.
+appear in the head of any production and is not declared as a terminal
+is treated as an error.
  
  ###### forward declarations
         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
@@ -166,7 +174,11 @@ appear in a head and is not declared is treated as an error.
  Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
  table of known symbols and the resulting parser will report them as
  `TK_reserved + N`.  A small set of identifiers are reserved for the
-different token types that `scanner` can report.
+different token types that `scanner` can report, and an even smaller set
+are reserved for a special token that the parser can generate (`EOL`) as
+will be described later.  This latter set cannot use predefined numbers,
+so they are marked as `isspecial` for now and will get assigned a number
+with the non-terminals later.
  
  ###### declarations
  
@@ -181,9 +193,12 @@ different token types that `scanner` can report.
                 { TK_out,          "OUT" },
                 { TK_newline,      "NEWLINE" },
                 { TK_eof,          "$eof" },
+               { -1,              "EOL" },
         };
+
  ###### symbol fields
         short num;
+       unsigned int isspecial:1;
  
  Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
  recognised.  The former is automatically expected at the end of the text
@@ -241,6 +256,7 @@ symbol, but its type might be `Unknown`.
                         s = sym_find(g, t);
                         s->type = Terminal;
                         s->num = reserved_words[i].num;
+                       s->isspecial = 1;
                 }
         }
  
@@ -249,7 +265,7 @@ symbol, but its type might be `Unknown`.
  Data type specification, precedence specification, and declaration of
  terminals are all introduced by a dollar sign at the start of the line.
  If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
-precedence, if it is `TERM` the the line declares terminals without
+precedence, if it is `TERM` then the line declares terminals without
  precedence, otherwise it specifies a data type.
  
  The data type name is simply stored and applied to the head of all
@@ -262,20 +278,20 @@ Any productions given before the first data type declaration will have
  no data type associated with them and can carry no information.  In
  order to allow other non-terminals to have no type, the data type
  `$void` can be given.  This does *not* mean that `struct void` will be
-used, but rather than no type will be associated with future
+used, but rather than no type will be associated with subsequent
  non-terminals.
  
-The precedence line must contain a list of symbols - typically
-terminal symbols, but not necessarily.  It can only contain symbols
-that have not been seen yet, so precedence declaration must precede
-the productions that they affect.
+The precedence line must contain a list of symbols, either terminal
+symbols or virtual symbols.  It can only contain symbols that have not
+been seen yet, so precedence declaration must precede the productions
+that they affect.
  
-A precedence line may also contain a "virtual" symbol which is an
-identifier preceded by `$$`.  Virtual terminals carry precedence
-information but are not included in the grammar.  A production can
-declare that it inherits the precedence of a given virtual symbol.
+A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
+carry precedence information but are not included in the grammar.  A
+production can declare that it inherits the precedence of a given
+virtual symbol.
  
-This use for `$$` precludes it from being used as a symbol in the
+This use of `$$` precludes it from being used as a symbol in the
  described language.  Two other symbols: `${` and `}$` are also
  unavailable.
  
@@ -285,7 +301,12 @@ listed and may be inherited by any production which uses the symbol.  A
  production inherits from the last symbol which has a precedence.
  
  The symbols on the first precedence line have the lowest precedence.
-Subsequent lines introduce symbols with higher precedence.
+Subsequent lines introduce symbols with higher precedence and so bind
+more tightly.
+
+Note that a declaration line (or "dollar line") can start with either of
+two different marks: "$" or "$*".  The `dollar_line()` function defined
+here is told which was found in the `isref` argument.
  
  ###### grammar fields
         struct text current_type;
@@ -372,9 +393,10 @@ Subsequent lines introduce symbols with higher precedence.
                         found += 1;
                         t = token_next(ts);
                 }
-               if (found == 0)
+               if (found == 0) {
                         err = "No symbols given on precedence/TERM line";
                         goto abort;
+               }
                 return NULL;
         abort:
                 while (t.num != TK_newline && t.num != TK_eof)
@@ -415,7 +437,7 @@ for a terminal symbol is `struct token`.  For a non-terminal, it is
  whatever has been declared for that symbol.  The `<` may be included and
  means that the value (usually a reference) is being moved out, so it
  will not automatically be freed.  The effect of using '<' is that the
-variable is cleareed to all-zeros.
+variable is cleared to all-zeros.
  
  While building productions we will need to add to an array which needs to
  grow dynamically.
@@ -493,10 +515,6 @@ Now we have all the bits we need to parse a full production.
                 tk = token_next(state);
                 while (tk.num == TK_ident || tk.num == TK_mark) {
                         struct symbol *bs = sym_find(g, tk.txt);
-                       if (bs->type == Unknown) {
-                               if (!g->terminals_declared)
-                                       bs->type = Terminal;
-                       }
                         if (bs->type == Virtual) {
                                 err = "Virtual symbol not permitted in production";
                                 goto abort;
@@ -516,11 +534,7 @@ Now we have all the bits we need to parse a full production.
                                 goto abort;
                         }
                         vs = sym_find(g, tk.txt);
-                       if (vs->num == TK_newline)
-                               p.line_like = 1;
-                       else if (vs->num == TK_out)
-                               p.line_like = 2;
-                       else if (vs->precedence == 0) {
+                       if (vs->precedence == 0) {
                                 err = "symbol after $$ must have precedence";
                                 goto abort;
                         } else {
@@ -550,18 +564,19 @@ Now we have all the bits we need to parse a full production.
         abort:
                 while (tk.num != TK_newline && tk.num != TK_eof)
                         tk = token_next(state);
+               free(p.body);
                 return err;
         }
  
-With the ability to parse production and dollar-lines, we have nearly all
+With the ability to parse productions and dollar-lines, we have nearly all
  that we need to parse a grammar from a `code_node`.
  
-The head of the first production will effectively be the `start` symbol of
-the grammar.  However it won't _actually_ be so.  Processing the grammar is
-greatly simplified if the real start symbol only has a single production,
-and expects `$eof` as the final terminal.  So when we find the first
-explicit production we insert an extra production as production zero which
-looks like
+The head of the first production will effectively be the `start` symbol
+of the grammar.  However it won't _actually_ be so.  Processing the
+grammar is greatly simplified if the real start symbol only has a single
+production, and expects `$eof` as the final terminal.  So when we find
+the first explicit production we insert an extra implicit production as
+production zero which looks like
  
  ###### Example: production 0
         $start -> START $eof
@@ -589,6 +604,11 @@ and strings so that the marks which introduce them can be
  used as terminals in the grammar.  We don't ignore numbers
  even though we don't allow them as that causes the scanner
  to produce errors that the parser is better positioned to handle.
+We also ignore indentation, but we expect and use newlines.
+
+To allow comments in the grammar, we explicitly check for "//" as the
+first token on the line and those lines are skipped.  "//" can still be
+used as a terminal anywhere that a terminal is expected.
  
  ###### grammar_read
         static struct grammar *grammar_read(struct code_node *code)
@@ -668,45 +688,46 @@ to produce errors that the parser is better positioned to handle.
                                 goto abort;
                 }
                 token_close(state);
-               if (g->terminals_declared) {
-                       struct symbol *s;
-                       int errs = 0;
-                       for (s = g->syms; s; s = s->next) {
-                               if (s->type != Unknown)
-                                       continue;
-                               errs += 1;
-                               fprintf(stderr, "Token %.*s not declared\n",
-                                       s->name.len, s->name.txt);
-                       }
-                       if (errs) {
-                               free(g);
-                               g = NULL;
+
+               struct symbol *s;
+               for (s = g->syms; s; s = s->next) {
+                       if (s->type != Unknown)
+                               continue;
+                       if (!g->terminals_declared) {
+                               s->type = Terminal;
+                               continue;
                         }
+                       err = "not declared";
+                       fprintf(stderr, "Token %.*s not declared\n",
+                               s->name.len, s->name.txt);
+               }
+               if (err) {
+                       free(g); // FIXME free content
+                       g = NULL;
                 }
                 return g;
         abort:
                 fprintf(stderr, "Error at line %d: %s\n",
                         tk.line, err);
                 token_close(state);
-               free(g);
+               free(g); // FIXME free content
                 return NULL;
         }
  
  ## Analysing the grammar
  
  The central task in analysing the grammar is to determine a set of
-states to drive the parsing process.  This will require finding
-various sets of symbols and of "items".  Some of these sets will need
-to attach information to each element in the set, so they are more
-like maps.
-
-Each "item" may have a 'look-ahead' or `LA` set associated with
-it.  Many of these will be the same as each other.  To avoid memory
-wastage, and to simplify some comparisons of sets, these sets will be
-stored in a list of unique sets, each assigned a number.
-
-Once we have the data structures in hand to manage these sets and
-lists, we can start setting the 'nullable' flag, build the 'FIRST'
+states to drive the parsing process.  This will require finding various
+sets of symbols and of "items".  Some of these sets will need to attach
+information to each element in the set, so they are more like maps.
+
+Each "item" may have a 'look-ahead' or `LA` set associated with it.
+Many of these will be the same as each other.  To avoid memory wastage,
+and to simplify some comparisons of sets, these sets will be stored in a
+list of unique sets, each assigned a number.
+
+Once we have the data structures in hand to manage these sets and lists,
+we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
  sets, and then create the item sets which define the various states.
  
  ### Symbol sets.
@@ -715,8 +736,8 @@ Though we don't only store symbols in these sets, they are the main
  things we store, so they are called symbol sets or "symsets".
  
  A symset has a size, an array of shorts, and an optional array of data
-values, which are also shorts.  If the array of data is not present,
-we store an impossible pointer, as `NULL` is used to indicate that no
+values, which are also shorts.  If the array of data is not present, we
+store an impossible pointer, as `NULL` is used to indicate that no
  memory has been allocated yet;
  
  ###### declarations
@@ -728,10 +749,10 @@ memory has been allocated yet;
         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
         const struct symset INIT_DATASET = { 0, NULL, NULL };
  
-The arrays of shorts are allocated in blocks of 8 and are kept sorted
-by using an insertion sort.  We don't explicitly record the amount of
-allocated space as it can be derived directly from the current `cnt` using
-`((cnt - 1) | 7) + 1`.
+The arrays of shorts are allocated in blocks of 8 and are kept sorted by
+using an insertion sort.  We don't explicitly record the amount of
+allocated space as it can be derived directly from the current `cnt`
+using `((cnt - 1) | 7) + 1`.
  
  ###### functions
         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
@@ -781,11 +802,11 @@ or `-1` to indicate failure.
         }
  
  We will often want to form the union of two symsets.  When we do, we
-will often want to know if anything changed (as that might mean there
-is more work to do).  So `symset_union` reports whether anything was
-added to the first set.  We use a slow quadratic approach as these
-sets don't really get very big.  If profiles shows this to be a problem it
-can be optimised later.
+will often want to know if anything changed (as that might mean there is
+more work to do).  So `symset_union` reports whether anything was added
+to the first set.  We use a slow quadratic approach as these sets are
+rarely large.  If profiling shows this to be a problem it can be
+optimised later.
  
         static int symset_union(struct symset *a, struct symset *b)
         {
@@ -816,9 +837,9 @@ And of course we must be able to free a symset.
  Some symsets are simply stored somewhere appropriate in the `grammar`
  data structure, others needs a bit of indirection.  This applies
  particularly to "LA" sets.  These will be paired with "items" in an
-"itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
-stored in the `data` field, so we need a mapping from a small (short)
-number to an LA `symset`.
+"itemset".  As itemsets will be stored in a symset, the "LA" set needs
+to be stored in the `data` field, so we need a mapping from a small
+(short) number to an LA `symset`.
  
  As mentioned earlier this involves creating a list of unique symsets.
  
@@ -875,9 +896,9 @@ would be more efficient and may be added later.
                 return s->num;
         }
  
-Finding a set by number is currently performed by a simple linear search.
-If this turns out to hurt performance, we can store the sets in a dynamic
-array like the productions.
+Finding a set by number is currently performed by a simple linear
+search.  If this turns out to hurt performance, we can store the sets in
+a dynamic array like the productions.
  
         static struct symset set_find(struct grammar *g, int num)
         {
@@ -924,70 +945,24 @@ changes happen.
                 }
         }
  
-### Setting `line_like`
-
-In order to be able to ignore newline tokens when not relevant, but
-still include them in the parse when needed, we will need to know
-which states can start a "line-like" section of code.  We ignore
-newlines when there is an indent since the most recent start of a
-line-like symbol.
-
-A "line_like" symbol is simply any symbol that can derive a NEWLINE.
-If a symbol cannot derive a NEWLINE, then it is only part of a line -
-so is word-like.  If it can derive a NEWLINE, then we consider it to
-be like a line.
-
-Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
-is the head of a production that contains a line_like symbol is also a
-line-like symbol.  We use a new field `line_like` to record this
-attribute of symbols, and compute it in a repetitive manner similar to
-`set_nullable`.
-
-###### symbol fields
-       int line_like;
-
-###### functions
-       static void set_line_like(struct grammar *g)
-       {
-               int check_again = 1;
-               g->symtab[TK_newline]->line_like = 1;
-               while (check_again) {
-                       int p;
-                       check_again = 0;
-                       for (p = 0; p < g->production_count; p++) {
-                               struct production *pr = g->productions[p];
-                               int s;
-
-                               if (pr->head->line_like)
-                                       continue;
-
-                               for (s = 0 ; s < pr->body_size; s++) {
-                                       if (pr->body[s]->line_like) {
-                                               pr->head->line_like = 1;
-                                               check_again = 1;
-                                               break;
-                                       }
-                               }
-                       }
-               }
-       }
-
  ### Building the `first` sets
  
-When calculating what can follow a particular non-terminal, we will need to
-know what the "first" terminal in any subsequent non-terminal might be.  So
-we calculate the `first` set for every non-terminal and store them in an
-array.  We don't bother recording the "first" set for terminals as they are
-trivial.
+When calculating what can follow a particular non-terminal, we will need
+to know what the "first" terminal in any subsequent non-terminal might
+be.  So we calculate the `first` set for every non-terminal and store
+them in an array.  We don't bother recording the "first" set for
+terminals as they are trivial.
  
-As the "first" for one symbol might depend on the "first" of another,
-we repeat the calculations until no changes happen, just like with
-`set_nullable`.  This makes use of the fact that `symset_union`
-reports if any change happens.
+As the "first" for one symbol might depend on the "first" of another, we
+repeat the calculations until no changes happen, just like with
+`set_nullable`.  This makes use of the fact that `symset_union` reports
+if any change happens.
  
  The core of this, which finds the "first" of part of a production body,
-will be reused for computing the "follow" sets, so we split it out
-into a separate function.
+will be reused for computing the "follow" sets, so we split that out
+into a separate function, `add_first`, which also reports if it got all
+the way to the end of the production without finding a non-nullable
+symbol.
  
  ###### grammar fields
         struct symset *first;
@@ -1041,18 +1016,21 @@ into a separate function.
  
  ### Building the `follow` sets.
  
-There are two different situations when we will want to generate "follow"
-sets.  If we are doing an SLR analysis, we want to generate a single
-"follow" set for each non-terminal in the grammar.  That is what is
-happening here.  If we are doing an LALR or LR analysis we will want
-to generate a separate "LA" set for each item.  We do that later
-in state generation.
+There are two different situations when we will want to generate
+"follow" sets.  If we are doing an SLR analysis, we want to generate a
+single "follow" set for each non-terminal in the grammar.  That is what
+is happening here.  If we are doing an LALR or LR analysis we will want
+to generate a separate "LA" set for each item.  We do that later in
+state generation.
  
  There are two parts to generating a "follow" set.  Firstly we look at
-every place that any non-terminal appears in the body of any
-production, and we find the set of possible "first" symbols after
-there.  This is done using `add_first` above and only needs to be done
-once as the "first" sets are now stable and will not change.
+every place that any non-terminal appears in the body of any production,
+and we find the set of possible "first" symbols after there.  This is
+done using `add_first` above and only needs to be done once as the
+"first" sets are now stable and will not change.
+
+###### grammar fields
+       struct symset *follow;
  
  ###### follow code
  
@@ -1074,12 +1052,16 @@ other symbol which is followed only by `nullable` symbols.  As this
  depends on "follow" itself we need to repeatedly perform the process
  until no further changes happen.
  
+Rather than 2 nested loops, one that repeats the whole process until
+there is no change, and one that iterates of the set of productions, we
+combine these two functions into a single loop.
+
  ###### follow code
  
-       for (again = 0, p = 0;
+       for (check_again = 0, p = 0;
              p < g->production_count;
              p < g->production_count-1
-               ? p++ : again ? (again = 0, p = 0)
+               ? p++ : check_again ? (check_again = 0, p = 0)
                               : p++) {
                 struct production *pr = g->productions[p];
                 int b;
@@ -1090,7 +1072,7 @@ until no further changes happen.
                                 break;
                         if (symset_union(&g->follow[bs->num],
                                          &g->follow[pr->head->num]))
-                               again = 1;
+                               check_again = 1;
                         if (!bs->nullable)
                                 break;
                 }
@@ -1099,13 +1081,10 @@ until no further changes happen.
  We now just need to create and initialise the `follow` list to get a
  complete function.
  
-###### grammar fields
-       struct symset *follow;
-
  ###### functions
         static void build_follow(struct grammar *g)
         {
-               int s, again, p;
+               int s, check_again, p;
                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
                 for (s = 0; s < g->num_syms; s++)
                         g->follow[s] = INIT_SYMSET;
@@ -1117,11 +1096,13 @@ complete function.
  There are three different levels of detail that can be chosen for
  building the itemsets and states for the LR grammar.  They are:
  
-1. LR(0) or SLR(1), where no look-ahead is considered.
+1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
+   itemsets - look-ahead, if used at all, is simply the 'follow' sets
+   already computed,
  2. LALR(1) where we build look-ahead sets with each item and merge
     the LA sets when we find two paths to the same "kernel" set of items.
  3. LR(1) where different look-ahead for any item in the set means
-   a different state must be created.
+   a different item set must be created.
  
  ###### forward declarations
         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
@@ -1131,10 +1112,10 @@ generated state already exists.  For now we use a simple sorted linked
  list.
  
  An item is a pair of numbers: the production number and the position of
-"DOT", which is an index into the body of the production.
-As the numbers are not enormous we can combine them into a single "short"
-and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
-production number (so 4000 productions with maximum of 15 symbols in the
+"DOT", which is an index into the body of the production.  As the
+numbers are not enormous we can combine them into a single "short" and
+store them in a `symset` - 5 bits for "DOT" and 11 bits for the
+production number (so 2000 productions with maximum of 31 symbols in the
  body).
  
  Comparing the itemsets will be a little different to comparing symsets
@@ -1203,27 +1184,12 @@ can just compare the symset and the data values together.
  
  It will be helpful to know if an itemset has been "completed" or not,
  particularly for LALR where itemsets get merged, at which point they
-need to be consider for completion again.  So  a `completed` flag is needed.
-
-For correct handling of `TK_newline` when parsing, we will need to
-know which states (itemsets) can occur at the start of a line, so we
-will record a `starts_line` flag too whenever DOT is at the start of a
-`line_like` symbol.
-
-Finally, for handling `TK_out` we need to know whether productions in the
-current state started *before* the most recent indent.  A state
-doesn't usually keep details of individual productions, so we need to
-add one extra detail. `min_prefix` is the smallest non-zero number of
-symbols *before* DOT in any production in an itemset.  This will allow
-us to determine if the the most recent indent is sufficiently recent
-to cancel it against a `TK_out`.  If it was seen longer ago than the
-`min_prefix`, and if the current state cannot be reduced, then the
-indented section must have ended in the middle of a syntactic unit, so
-an error must be signaled.
+need to be consider for completion again.  So a `completed` flag is
+needed.
  
  And now we can build the list of itemsets.  The lookup routine returns
-both a success flag and a pointer to where in the list an insert
-should happen, so we don't need to search a second time.
+both a success flag and a pointer to where in the list an insert should
+happen, so we don't need to search a second time.
  
  ###### declarations
         struct itemset {
@@ -1234,8 +1200,6 @@ should happen, so we don't need to search a second time.
                 enum assoc assoc;
                 unsigned short precedence;
                 char completed;
-               char starts_line;
-               int min_prefix;
         };
  
  ###### grammar fields
@@ -1265,9 +1229,9 @@ should happen, so we don't need to search a second time.
         }
  
  Adding an itemset may require merging the LA sets if LALR analysis is
-happening. If any new LA set adds any symbols that weren't in the old LA set, we
-clear the `completed` flag so that the dependants of this itemset will be
-recalculated and their LA sets updated.
+happening.  If any new LA set adds any symbols that weren't in the old
+LA set, we clear the `completed` flag so that the dependants of this
+itemset will be recalculated and their LA sets updated.
  
  `add_itemset` must consume the symsets it is passed, either by adding
  them to a data structure, of freeing them.
@@ -1315,32 +1279,34 @@ them to a data structure, of freeing them.
  
  #### The build
  
-To build all the itemsets, we first insert the initial itemset made
-from production zero, complete each itemset, and then generate new
-itemsets from old until no new ones can be made.
+To build all the itemsets, we first insert the initial itemset made from
+production zero, complete each itemset, and then generate new itemsets
+from old until no new ones can be made.
  
-Completing an itemset means finding all the items where "DOT" is followed by
-a nonterminal and adding "DOT=0" items for every production from that
-non-terminal - providing each item hasn't already been added.
+Completing an itemset means finding all the items where "DOT" is
+followed by a nonterminal and adding "DOT=0" items for every production
+from that non-terminal - providing each item hasn't already been added.
+When we add such an item it might get inserted before the current
+one, so to make sure we process it we reset the loop counter to the
+start.
  
  If LA sets are needed, the LA set for each new item is found using
-`add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
-be supplemented by the LA set for the item which produce the new item.
-
-We also collect a set of all symbols which follow "DOT" (in `done`) as this
-is used in the next stage.
-If any of these symbols are flagged as `line_like`, then this
-state must be a `starts_line` state so now is a good time to record that.
-
-When itemsets are created we assign a precedence to the itemset from
-the complete item, if there is one.  We ignore the possibility of
-there being two and don't (currently) handle precedence in such
-grammars.  When completing a grammar we ignore any item where DOT is
-followed by a terminal with a precedence lower than that for the
-itemset.  Unless the terminal has right associativity, we also ignore
-items where the terminal has the same precedence.  The result is that
-unwanted items are still in the itemset, but the terminal doesn't get
-into the go to set, so the item is ineffective.
+`add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
+may be supplemented by the LA set for the item which produced the new
+item.
+
+We also collect a set of all symbols which follow "DOT" (in `done`) as
+this is used in the next stage.
+
+When itemsets are created we assign a precedence to the itemset from the
+complete item, if there is one.  We ignore the possibility of there
+being two and don't (currently) handle precedence in such grammars.
+When completing a grammar we ignore any item where DOT is followed by a
+terminal with a precedence lower than that for the itemset.  Unless the
+terminal has right associativity, we also ignore items where the
+terminal has the same precedence.  The result is that unwanted items are
+still in the itemset, but the terminal doesn't get into the go to set,
+so the item is ineffective.
  
  ###### complete itemset
         for (i = 0; i < is->items.cnt; i++) {
@@ -1351,12 +1317,7 @@ into the go to set, so the item is ineffective.
                 struct symbol *s;
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
-               struct symset LAnl = INIT_SYMSET;
-               unsigned short snnl = 0;
  
-               if (is->min_prefix == 0 ||
-                   (bs > 0 && bs < is->min_prefix))
-                       is->min_prefix = bs;
                 if (bs == pr->body_size)
                         continue;
                 s = pr->body[bs];
@@ -1372,14 +1333,12 @@ into the go to set, so the item is ineffective.
                          * not Right-associative, so we mustn't shift it.
                          */
                         continue;
-               if (symset_find(&done, s->num) < 0) {
+               if (symset_find(&done, s->num) < 0)
                         symset_add(&done, s->num, 0);
-               }
+
                 if (s->type != Nonterminal)
                         continue;
-               if (s->line_like)
-                       is->starts_line = 1;
-               again = 1;
+               check_again = 1;
                 if (type >= LALR) {
                         // Need the LA set.
                         int to_end;
@@ -1390,10 +1349,6 @@ into the go to set, so the item is ineffective.
                         }
                         sn = save_set(g, LA);
                         LA = set_find(g, sn);
-                       if (symset_find(&LA, TK_newline))
-                               symset_add(&LAnl, TK_newline, 0);
-                       snnl = save_set(g, LAnl);
-                       LAnl = set_find(g, snnl);
                 }
  
                 /* Add productions for this symbol */
@@ -1404,10 +1359,7 @@ into the go to set, so the item is ineffective.
                         int itm = item_num(p2, 0);
                         int pos = symset_find(&is->items, itm);
                         if (pos < 0) {
-                               if (g->productions[p2]->line_like)
-                                       symset_add(&is->items, itm, snnl);
-                               else
-                                       symset_add(&is->items, itm, sn);
+                               symset_add(&is->items, itm, sn);
                                 /* Will have re-ordered, so start
                                  * from beginning again */
                                 i = -1;
@@ -1416,8 +1368,6 @@ into the go to set, so the item is ineffective.
                                 struct symset tmp = INIT_SYMSET;
                                 struct symset *la = &LA;
  
-                               if (g->productions[p2]->line_like)
-                                       la = &LAnl;
                                 symset_union(&tmp, &ss);
                                 if (symset_union(&tmp, la)) {
                                         is->items.data[pos] = save_set(g, tmp);
@@ -1428,10 +1378,10 @@ into the go to set, so the item is ineffective.
                 }
         }
  
-For each symbol we found (and placed in `done`) we collect all the items for
-which this symbol is next, and create a new itemset with "DOT" advanced over
-the symbol.  This is then added to the collection of itemsets (or merged
-with a pre-existing itemset).
+For each symbol we found (and placed in `done`) we collect all the items
+for which this symbol is next, and create a new itemset with "DOT"
+advanced over the symbol.  This is then added to the collection of
+itemsets (or merged with a pre-existing itemset).
  
  ###### derive itemsets
         // Now we have a completed itemset, so we need to
@@ -1459,18 +1409,21 @@ with a pre-existing itemset).
                                 continue;
                         if (pr->body[bp] != sym)
                                 continue;
+
+                       bp += 1;
                         if (type >= LALR)
                                 la = is->items.data[j];
-                       pos = symset_find(&newitemset, pr->head->num);
-                       if (bp + 1 == pr->body_size &&
+                       if (bp == pr->body_size &&
                             pr->precedence > 0 &&
                             pr->precedence > precedence) {
                                 // new itemset is reducible and has a precedence.
                                 precedence = pr->precedence;
                                 assoc = pr->assoc;
                         }
+                       pos = symset_find(&newitemset, item_num(p, bp));
+
                         if (pos < 0)
-                               symset_add(&newitemset, item_num(p, bp+1), la);
+                               symset_add(&newitemset, item_num(p, bp), la);
                         else if (type >= LALR) {
                                 // Need to merge la set.
                                 int la2 = newitemset.data[pos];
@@ -1499,7 +1452,7 @@ with `TK_eof` as the LA set.
         {
                 struct symset first = INIT_SYMSET;
                 struct itemset *is;
-               int again;
+               int check_again;
                 unsigned short la = 0;
                 if (type >= LALR) {
                         // LA set just has eof
@@ -1511,15 +1464,15 @@ with `TK_eof` as the LA set.
                 // production 0, offset 0 (with no data)
                 symset_add(&first, item_num(0, 0), la);
                 add_itemset(g, first, Non, 0, type);
-               for (again = 0, is = g->items;
+               for (check_again = 0, is = g->items;
                      is;
-                    is = is->next ?: again ? (again = 0, g->items) : NULL) {
+                    is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
                         int i;
                         struct symset done = INIT_SYMSET;
                         if (is->completed)
                                 continue;
                         is->completed = 1;
-                       again = 1;
+                       check_again = 1;
                         ## complete itemset
                         ## derive itemsets
                         symset_free(done);
@@ -1537,9 +1490,11 @@ doesn't use them.  They will be calculated to some extent if needed for
  a report.
  
  Once we have built everything we allocate arrays for the two lists:
-symbols and itemsets.  This allows more efficient access during reporting.
-The symbols are grouped as terminals and non-terminals and we record the
-changeover point in `first_nonterm`.
+symbols and itemsets.  This allows more efficient access during
+reporting.  The symbols are grouped as terminals, then non-terminals,
+then virtual, with the start of non-terminals recorded as `first_nonterm`.
+Special terminals -- meaning just EOL -- are included with the
+non-terminals so that they are not expected by the scanner.
  
  ###### grammar fields
         struct symbol **symtab;
@@ -1554,7 +1509,7 @@ changeover point in `first_nonterm`.
                 struct itemset *is;
                 int snum = TK_reserved;
                 for (s = g->syms; s; s = s->next)
-                       if (s->num < 0 && s->type == Terminal) {
+                       if (s->num < 0 && s->type == Terminal && !s->isspecial) {
                                 s->num = snum;
                                 snum++;
                         }
@@ -1575,7 +1530,6 @@ changeover point in `first_nonterm`.
                         g->symtab[s->num] = s;
  
                 set_nullable(g);
-               set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
  
@@ -1607,8 +1561,7 @@ all the tables that have been generated, plus a description of any conflicts.
  
  Firstly we have the complete list of symbols, together with the
  "FIRST" set if that was generated.  We add a mark to each symbol to
-show if it can end in a newline (`>`), if it is considered to be
-"line-like" (`<`), or if it is nullable (`.`).
+show if it is nullable (`.`).
  
  ###### functions
  
@@ -1625,9 +1578,8 @@ show if it can end in a newline (`>`), if it is considered to be
                         if (!s)
                                 continue;
  
-                       printf(" %c%c%3d%c: ",
+                       printf(" %c%3d%c: ",
                                s->nullable ? '.':' ',
-                              s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
                         if (s->precedence)
@@ -1698,10 +1650,6 @@ it up a bit.  First the items, with production number and associativity.
                         printf(" [%d%s]", s->precedence,
                                assoc_names[s->assoc]);
                 }
-               if (pr->line_like == 1)
-                       printf(" $$NEWLINE");
-               else if (pr->line_like)
-                       printf(" $$OUT");
                 printf("\n");
         }
  
@@ -1745,8 +1693,7 @@ Now we can report all the item sets complete with items, LA sets, and GO TO.
                 for (s = 0; s < g->states; s++) {
                         int j;
                         struct itemset *is = g->statetab[s];
-                       printf("  Itemset %d:%s min prefix=%d",
-                              s, is->starts_line?" (startsline)":"", is->min_prefix);
+                       printf("  Itemset %d:", s);
                         if (is->precedence)
                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
                         printf("\n");
@@ -1762,7 +1709,7 @@ Now we can report all the item sets complete with items, LA sets, and GO TO.
  ### Reporting conflicts
  
  Conflict detection varies a lot among different analysis levels.  However
-LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
+LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
  LR1 are also similar as they have FOLLOW or LA sets.
  
  ###### functions
@@ -1840,11 +1787,6 @@ terminals to items where that terminal could be shifted and another
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
-As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
-terminal, we ignore it.  NEWLINES are handled specially with its own
-rules for when to shift and when to reduce.  Conflicts are expected,
-but handled internally.
-
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
                 int i;
@@ -1896,7 +1838,7 @@ but handled internally.
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
-                                       if (pos >= 0 && la.syms[k] != TK_newline) {
+                                       if (pos >= 0) {
                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
                                                 cnt++;
                                                         prtxt(g->symtab[la.syms[k]]->name);
@@ -1934,9 +1876,10 @@ optional `FILE` to send tracing to.  The `token_config` gets the list of
  known words added and then is used with the `code_node` to initialize the
  scanner.
  
-`parse_XX` then calls the library function `parser_run` to actually complete
-the parse.  This needs the `states` table and function to call the various
-pieces of code provided in the grammar file, so they are generated first.
+`parse_XX` then calls the library function `parser_run` to actually
+complete the parse.  This needs the `states` table, the `reductions`
+table and functions to call the various pieces of code provided in the
+grammar file, so they are generated first.
  
  ###### parser_generate
  
@@ -1947,6 +1890,7 @@ pieces of code provided in the grammar file, so they are generated first.
                 gen_non_term(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
+               gen_reductions(f, g);
                 gen_reduce(f, g, file, pre_reduce);
                 gen_free(f, g);
  
@@ -1958,7 +1902,7 @@ pieces of code provided in the grammar file, so they are generated first.
                 fprintf(f, "\tconfig->words_marks = known;\n");
                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                 fprintf(f, "\ttokens = token_open(code, config);\n");
-               fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
+               fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n");
                 fprintf(f, "\ttoken_close(tokens);\n");
                 fprintf(f, "\treturn rv;\n");
                 fprintf(f, "}\n\n");
@@ -1967,9 +1911,7 @@ pieces of code provided in the grammar file, so they are generated first.
  ### Known words table
  
  The known words table is simply an array of terminal symbols.
-The table of nonterminals used for tracing is a similar array.  We
-include virtual symbols in the table of non_terminals to keep the
-numbers right.
+The table of nonterminals used for tracing is a similar array.
  
  ###### functions
  
@@ -1994,20 +1936,23 @@ numbers right.
                 for (i = TK_reserved;
                      i < g->num_syms;
                      i++)
-                       if (g->symtab[i]->type == Nonterminal)
+                       if (g->symtab[i]->type == Nonterminal ||
+                           g->symtab[i]->isspecial)
                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                         g->symtab[i]->name.txt);
                 fprintf(f, "};\n\n");
         }
  
-### States and the goto tables.
+### States, reductions, and the go to tables.
  
-For each state we record the goto table, the reducible production if
-there is one, or a symbol to shift for error recovery.
-Some of the details of the reducible production are stored in the
-`do_reduce` function to come later.  Here we store the production number,
-the body size (useful for stack management) and the resulting symbol (useful
-for knowing how to free data later).
+For each state we record the go to table and the reducible production if
+there is one, the details of which are in a separate table of
+reductions.  Some of the details of the reducible production are stored
+in the `do_reduce` function to come later.  In the go to table we store
+the production number and in the reductions table: the body size (useful
+for stack management), the resulting symbol (useful for knowing how to
+free data later), and the size of the resulting asn object (useful for
+preallocation space.
  
  The go to table is stored in a simple array of `sym` and corresponding
  `state`.
@@ -2018,15 +1963,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short sym;
                 short state;
         };
+       struct reduction {
+               short size;
+               short sym;
+               short result_size;
+       };
         struct state {
+               short reduce_prod;
                 short go_to_cnt;
                 const struct lookup * go_to;
-               short reduce_prod;
-               short reduce_size;
-               short reduce_sym;
-               char starts_line;
-               char newline_only;
-               short min_prefix;
         };
  
  ###### functions
@@ -2036,10 +1981,13 @@ The go to table is stored in a simple array of `sym` and corresponding
                 int i;
                 fprintf(f, "#line 0 \"gen_goto\"\n");
                 for (i = 0; i < g->states; i++) {
+                       struct symset gt = g->statetab[i]->go_to;
                         int j;
+
+                       if (gt.cnt == 0)
+                               continue;
                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
                                 i);
-                       struct symset gt = g->statetab[i]->go_to;
                         for (j = 0; j < gt.cnt; j++)
                                 fprintf(f, "\t{ %d, %d },\n",
                                         gt.syms[j], gt.data[j]);
@@ -2047,7 +1995,25 @@ The go to table is stored in a simple array of `sym` and corresponding
                 }
         }
  
-###### functions
+       static void gen_reductions(FILE *f, struct grammar *g)
+       {
+               int i;
+               fprintf(f, "#line 0 \"gen_reductions\"\n");
+               fprintf(f, "static const struct reduction reductions[] = {\n");
+               for (i = 0; i < g->production_count; i++) {
+                       struct production *pr = g->productions[i];
+                       struct symbol *hd = pr->head;
+                       fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num);
+                       if (hd->struct_name.txt == NULL)
+                               fprintf(f, "0 },\n");
+                       else
+                               fprintf(f, "sizeof(struct %.*s%s) },\n",
+                                       hd->struct_name.len,
+                                       hd->struct_name.txt,
+                                       hd->isref ? "*" : "");
+               }
+               fprintf(f, "};\n\n");
+       }
  
         static void gen_states(FILE *f, struct grammar *g)
         {
@@ -2066,56 +2032,48 @@ The go to table is stored in a simple array of `sym` and corresponding
  
                                 if (bp < pr->body_size)
                                         continue;
-                               /* This is what we reduce */
+                               /* This is what we reduce - choose longest */
                                 if (prod < 0 || prod_len < pr->body_size) {
                                         prod = p;
                                         prod_len = pr->body_size;
                                 }
                         }
-
-                       if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
-                                       i, is->go_to.cnt, i, prod,
-                                       g->productions[prod]->body_size,
-                                       g->productions[prod]->head->num,
-                                       is->starts_line,
-                                       g->productions[prod]->line_like,
-                                       is->min_prefix);
+                       if (is->go_to.cnt)
+                               fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
+                                       i, prod, is->go_to.cnt, i);
                         else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
-                                       i, is->go_to.cnt, i,
-                                       is->starts_line, is->min_prefix);
+                               fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
                 }
                 fprintf(f, "};\n\n");
         }
  
  ### The `do_reduce` function and the code
  
-When the parser engine decides to reduce a production, it calls `do_reduce`.
-This has two functions.
-
-Firstly, if a non-NULL `trace` file is passed, it prints out details of the
-production being reduced.  Secondly it runs the code that was included with
-the production if any.
+When the parser engine decides to reduce a production, it calls
+`do_reduce` which runs the code that was included with the production,
+if any.
  
-This code needs to be able to store data somewhere.  Rather than requiring
-`do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
-`do_reduce` return the size to be saved.
+This code needs to be able to store data somewhere so we record the size
+of the data expected with each state so it can be allocated before
+`do_reduce` is called.
  
  In order for the code to access "global" context, we pass in the
-"config" pointer that was passed to parser function.  If the `struct
+"config" pointer that was passed to the parser function.  If the `struct
  token_config` is embedded in some larger structure, the reducing code
-can access the larger structure using pointer manipulation.
-
-The code fragment requires translation when written out.  Any `$N` needs to
-be converted to a reference either to that buffer (if `$0`) or to the
-structure returned by a previous reduction.  These pointers need to be cast
-to the appropriate type for each access.  All this is handled in
+can access the larger structure using pointer manipulation.  Performing
+that pointer manipulation, and any other common code or variables for
+all reduce actions, can be provided in code node called "reduce" which
+is passed around in `pre_reduce` which you might have already noticed.
+
+The code fragments require translation when written out.  Any `$N` needs
+to be converted to a reference either to that buffer (if `$0`) or to the
+structure returned by a previous reduction.  These pointers need to be
+cast to the appropriate type for each access.  All this is handled in
  `gen_code`.
  
  `gen_code` also allows symbol references to contain a '`<`' as in
  '`$<2`'.  This is particularly useful for references (or pointers), but
-can be used with structures too.  The `<` implies that the value it
+can be used with structures too.  The `<` implies that the value is
  being moved out, so the object will not be automatically freed.  It is
  equivalent to assigning `NULL` to the pointer or filling a structure
  with zeros.
@@ -2123,13 +2081,13 @@ with zeros.
  Instead of a number `N`, the `$` or `$<` can be followed by some letters
  and possibly a number.  A number by itself (other than zero) selects a
  symbol from the body of the production.  A sequence of letters selects
-the shortest symbol in the body which contains those letters in the given
-order.  If a number follows the letters, then a later occurrence of
-that symbol is chosen.  So "`$AB2`" will refer to the structure attached
-to the second occurrence of the shortest symbol which contains an `A`
-followed by a `B`.  If there is no unique shortest system, or if the
-number given is too large, then the symbol reference is not transformed,
-and will cause an error when the code is compiled.
+the shortest symbol in the body which contains those letters in the
+given order.  If a number follows the letters, then a later occurrence
+of that symbol is chosen.  So "`$AB2`" will refer to the structure
+attached to the second occurrence of the shortest symbol which contains
+an `A` followed by a `B`.  If there is no unique shortest system, or if
+the number given is too large, then the symbol reference is not
+transformed, and will cause an error when the code is compiled.
  
  ###### functions
  
@@ -2180,7 +2138,7 @@ and will cause an error when the code is compiled.
                         c = *name;
                 }
                 if (namlen == 0) {
-                       if (name == *namep)
+                       if (name == *namep || n > p->body_size)
                                 return -1;
                         *namep = name;
                         return n;
@@ -2189,8 +2147,10 @@ and will cause an error when the code is compiled.
                 for (i = 0; i < p->body_size; i++) {
                         if (!subseq_match(nam, namlen, p->body[i]->name))
                                 continue;
-                       if (slen == 0 || p->body[i]->name.len < slen)
+                       if (slen == 0 || p->body[i]->name.len < slen) {
                                 s = i;
+                               slen = p->body[i]->name.len;
+                       }
                         if (s >= 0 && p->body[i] != p->body[s] &&
                             p->body[i]->name.len == p->body[s]->name.len)
                                 /* not unique, so s cannot be used */
@@ -2198,7 +2158,7 @@ and will cause an error when the code is compiled.
                 }
                 if (s < 0)
                         return -1;
-               if (n == 0);
+               if (n == 0)
                         n = 1;
                 for (i = 0; i < p->body_size; i++)
                         if (p->body[i] == p->body[s]) {
@@ -2206,7 +2166,7 @@ and will cause an error when the code is compiled.
                                 if (n == 0)
                                         break;
                         }
-               if (n > 1)
+               if (n > 0 || i == p->body_size)
                         return -1;
                 *namep = name;
                 return i + 1;
@@ -2277,15 +2237,15 @@ and will cause an error when the code is compiled.
  ###### functions
  
         static void gen_reduce(FILE *f, struct grammar *g, char *file,
-                              struct code_node *code)
+                              struct code_node *pre_reduce)
         {
                 int i;
                 fprintf(f, "#line 1 \"gen_reduce\"\n");
                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
                 fprintf(f, "{\n");
                 fprintf(f, "\tint ret_size = 0;\n");
-               if (code)
-                       code_node_print(f, code, file);
+               if (pre_reduce)
+                       code_node_print(f, pre_reduce, file);
  
                 fprintf(f, "#line 4 \"gen_reduce\"\n");
                 fprintf(f, "\tswitch(prod) {\n");
@@ -2333,7 +2293,15 @@ appropriate for tokens) on any terminal symbol.
                 fprintf(f, "static void do_free(short sym, void *asn)\n");
                 fprintf(f, "{\n");
                 fprintf(f, "\tif (!asn) return;\n");
-               fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
+               fprintf(f, "\tif (sym < %d", g->first_nonterm);
+               /* Need to handle special terminals too */
+               for (i = 0; i < g->num_syms; i++) {
+                       struct symbol *s = g->symtab[i];
+                       if (i >= g->first_nonterm && s->type == Terminal &&
+                           s->isspecial)
+                               fprintf(f, " || sym == %d", s->num);
+               }
+               fprintf(f, ") {\n");
                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
                 fprintf(f, "\tswitch(sym) {\n");
  
@@ -2420,7 +2388,7 @@ grammar file).
                 case 't':
                         tag = optarg; break;
                 default:
-                       fprintf(stderr, "Usage: parsergen ...\n");
+                       fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
                         exit(1);
                 }
         }
@@ -2447,11 +2415,11 @@ To be able to run `mdcode` and `scanner` on the grammar we need to memory
  map it.
  
  Once we have extracted the code (with `mdcode`) we expect to find three
-sections: header, code, and grammar.  Anything else that is not
-excluded by the `--tag` option is an error.
+or four sections: header, code, grammar, reduce.
+Anything else that is not excluded by the `--tag` option is an error.
  
-"header" and "code" are optional, though it is hard to build a working
-parser with neither. "grammar" must be provided.
+"header", "code", and "reduce" are optional, though it is hard to build
+a working parser without the first two.  "grammar" must be provided.
  
  ###### includes
         #include <fcntl.h>
@@ -2605,12 +2573,12 @@ recognised properly, and link with `libicuuc` as `libmdcode` requires that.
  
  ## The SHIFT/REDUCE parser
  
-Having analysed the grammar and generated all the tables, we only need the
-shift/reduce engine to bring it all together.
+Having analysed the grammar and generated all the tables, we only need
+the shift/reduce engine to bring it all together.
  
-### Goto table lookup
+### Go to table lookup
  
-The parser generator has nicely provided us with goto tables sorted by
+The parser generator has nicely provided us with go to tables sorted by
  symbol number.  We need a binary search function to find a symbol in the
  table.
  
@@ -2636,79 +2604,80 @@ table.
                         return -1;
         }
  
-### The state stack.
-
-The core data structure for the parser is the stack.  This tracks all the
-symbols that have been recognised or partially recognised.
+### Memory allocation
  
-The stack usually won't grow very large - maybe a few tens of entries.  So
-we dynamically resize and array as required but never bother to shrink it
-down again.
+The `scanner` returns tokens in a local variable - we want them in allocated
+memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
+make an allocated copy as required.
  
-We keep the stack as two separate allocations.  One, `asn_stack` stores the
-"abstract syntax nodes" which are created by each reduction.  When we call
-`do_reduce` we need to pass an array of the `asn`s of the body of the
-production, and by keeping a separate `asn` stack, we can just pass a
-pointer into this stack.
+###### parser includes
+       #include <memory.h>
  
-The other allocation stores all other stack fields of which there are six.
-The `state` is the most important one and guides the parsing process.  The
-`sym` is nearly unnecessary.  However when we want to free entries from the
-`asn_stack`, it helps to know what type they are so we can call the right
-freeing function.  The symbol leads us to the right free function through
-`do_free`.
+###### parser functions
  
-The `indents` count tracks the line indents with-in the symbol or
-immediately follow it.  These are used to allow indent information to
-guide parsing and error recovery.
+       static struct token *tok_copy(struct token tk)
+       {
+               struct token *new = malloc(sizeof(*new));
+               *new = tk;
+               return new;
+       }
  
-`since_newline` tracks how many stack frames since the last
-start-of-line (whether indented or not).  So if `since_newline` is
-zero, then this symbol is at the start of a line.  Similarly
-`since_indent` counts the number of states since an indent, it is zero
-precisely when `indents` is not zero.
+### The state stack.
  
-`newline_permitted` keeps track of whether newlines should be ignored
-or not.
+The core data structure for the parser is the stack.  This tracks all
+the symbols that have been recognised or partially recognised.
+
+The stack usually won't grow very large - maybe a few tens of entries.
+So we dynamically grow an array as required but never bother to
+shrink it down again.
+
+We keep the stack as two separate allocations.  One, `asn_stack`, stores
+the "abstract syntax nodes" which are created by each reduction.  When
+we call `do_reduce` we need to pass an array of the `asn`s of the body
+of the production, and by keeping a separate `asn` stack, we can just
+pass a pointer into this stack.
+
+The other allocation stores all other stack fields of which there are
+two.  The `state` is the most important one and guides the parsing
+process.  The `sym` is nearly unnecessary as it is implicit in the
+state.  However when we want to free entries from the `asn_stack`, it
+helps to know what type they are so we can call the right freeing
+function.  The symbol leads us to the right free function through
+`do_free`.
  
  The stack is most properly seen as alternating states and symbols -
  states, like the 'DOT' in items, are between symbols.  Each frame in
  our stack holds a state and the symbol that was before it.  The
  bottom of stack holds the start state but no symbol, as nothing came
-before the beginning.
+before the beginning.  As we need to store some value, `TK_eof` is used
+to mark the beginning of the file as well as the end.
  
  ###### parser functions
  
         struct parser {
                 struct frame {
                         short state;
-                       short newline_permitted;
-
                         short sym;
-                       short indents;
-                       short since_newline;
-                       short since_indent;
                 } *stack;
                 void **asn_stack;
                 int stack_size;
                 int tos;
+
+               ## parser state
         };
  
  #### Shift and pop
  
  Two operations are needed on the stack - shift (which is like push) and pop.
  
-Shift applies not only to terminals but also to non-terminals.  When
-we reduce a production we will pop off entries corresponding to the
-body symbols, then push on an item for the head of the production.
-This last is exactly the same process as shifting in a terminal so we
-use the same function for both.  In both cases we provide the symbol,
-the number of indents the symbol contains (which will be zero for a
-terminal symbol) and a flag indicating the the symbol was at (or was
-reduced from a symbol which was at) the start of a line.  The state is
+Shift applies not only to terminals but also to non-terminals.  When we
+reduce a production we will pop off frames corresponding to the body
+symbols, then push on a frame for the head of the production.  This last
+is exactly the same process as shifting in a terminal so we use the same
+function for both.  In both cases we provide the symbol.  The state is
  deduced from the current top-of-stack state and the new symbol.
  
-To simplify other code we arrange for `shift` to fail if there is no `goto`
+To simplify other code we arrange for `shift` to fail if there is no `go to`
  state for the symbol.  This is useful in basic parsing due to our design
  that we shift when we can, and reduce when we cannot.  So the `shift`
  function reports if it could.
@@ -2719,19 +2688,12 @@ stack is empty, it always chooses zero as the next state.
  So `shift` finds the next state.  If that succeeds it extends the
  allocations if needed and pushes all the information onto the stacks.
  
-Newlines are permitted after a `starts_line` state until an internal
-indent.  If the new frame has neither a `starts_line` state nor an
-indent, newlines are permitted if the previous stack frame permitted
-them.
-
  ###### parser functions
  
         static int shift(struct parser *p,
-                        short sym, short indents, short start_of_line,
-                        void *asn,
+                        short sym, void *asn,
                          const struct state states[])
         {
-               // Push an entry onto the stack
                 struct frame next = {0};
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
@@ -2739,6 +2701,7 @@ them.
                         : 0;
                 if (newstate < 0)
                         return 0;
+
                 if (p->tos >= p->stack_size) {
                         p->stack_size += 10;
                         p->stack = realloc(p->stack, p->stack_size
@@ -2747,30 +2710,7 @@ them.
                                            * sizeof(p->asn_stack[0]));
                 }
                 next.sym = sym;
-               next.indents = indents;
                 next.state = newstate;
-               if (states[newstate].starts_line)
-                       next.newline_permitted = 1;
-               else if (indents)
-                       next.newline_permitted = 0;
-               else if (p->tos)
-                       next.newline_permitted =
-                               p->stack[p->tos-1].newline_permitted;
-               else
-                       next.newline_permitted = 0;
-
-               if (!start_of_line) {
-                       if (p->tos)
-                               next.since_newline = p->stack[p->tos-1].since_newline + 1;
-                       else
-                               next.since_newline = 1;
-               }
-               if (indents)
-                       next.since_indent = 0;
-               else if (p->tos)
-                       next.since_indent = p->stack[p->tos-1].since_indent + 1;
-               else
-                       next.since_indent = 1;
  
                 p->stack[p->tos] = next;
                 p->asn_stack[p->tos] = asn;
@@ -2779,323 +2719,228 @@ them.
         }
  
  `pop` primarily moves the top of stack (`tos`) back down the required
-amount and frees any `asn` entries that need to be freed.  It also
-collects a summary of the indents and line starts in the symbols that
-are being removed. It is called _after_ we reduce a production, just
-before we `shift` the nonterminal in.
+amount and frees any `asn` entries that need to be freed.  It is called
+_after_ we reduce a production, just before we `shift` the nonterminal
+in.
  
  ###### parser functions
  
-       static int pop(struct parser *p, int num,
-                      short *start_of_line,
-                      void(*do_free)(short sym, void *asn))
+       static void pop(struct parser *p, int num,
+                       void(*do_free)(short sym, void *asn))
         {
                 int i;
-               short indents = 0;
-               int sol = 0;
  
                 p->tos -= num;
-               for (i = 0; i < num; i++) {
-                       sol |= !p->stack[p->tos+i].since_newline;
-                       indents += p->stack[p->tos+i].indents;
-                       do_free(p->stack[p->tos+i].sym,
-                               p->asn_stack[p->tos+i]);
-               }
-               if (start_of_line)
-                       *start_of_line = sol;
-               return indents;
+               for (i = 0; i < num; i++)
+                       do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
         }
  
-### Memory allocation
+### The heart of the parser.
  
-The `scanner` returns tokens in a local variable - we want them in allocated
-memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
-a reduce is in a large buffer.  Both of these require some allocation and
-copying, hence `memdup` and `tokcopy`.
+Now we have the parser.  For each token we might shift it, trigger a
+reduction, or start error handling.  2D tokens (IN, OUT, NEWLINE, EOL)
+might also be ignored.  Ignoring tokens is combined with shifting.
  
-###### parser includes
-       #include <memory.h>
+###### parser vars
  
-###### parser functions
+       struct parser p = { 0 };
+       struct token *tk = NULL;
+       int accepted = 0;
  
-       void *memdup(void *m, int len)
-       {
-               void *ret;
-               if (len == 0)
-                       return NULL;
-               ret = malloc(len);
-               memcpy(ret, m, len);
-               return ret;
+###### heart of parser
+
+       shift(&p, TK_eof, NULL, states);
+       while (!accepted && p.tos > 0) {
+               struct frame *tos = &p.stack[p.tos-1];
+               if (!tk)
+                       tk = tok_copy(token_next(tokens));
+               parser_trace(trace, &p,
+                            tk, states, non_term, config->known_count);
+
+               ## try shift or ignore
+               ## try reduce
+               ## handle error
         }
  
-       static struct token *tok_copy(struct token tk)
-       {
-               struct token *new = malloc(sizeof(*new));
-               *new = tk;
-               return new;
+Indents are ignored unless they can be shifted onto the stack
+immediately or nothing can be shifted (in which case we reduce, and try
+again).  The end of an indented section - the OUT token - is ignored
+precisely when the indent was ignored.  To keep track of this we need a
+small stack of flags, which is easily stored as bits in an `unsigned
+long`.  This will never overflow and the scanner only allows 20 levels
+of indentation.
+
+###### parser state
+       unsigned long ignored_indents;
+
+NEWLINE is ignored when in an indented section of text which was not
+explicitly expected by the grammar.  So if the most recent indent is
+ignored, so is any NEWLINE token.
+
+If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL
+token instead.  If that succeeds, we make a new copy of the NEWLINE
+token and continue.  This allows a NEWLINE to appear to be preceded by
+an indefinite number of EOL tokens.
+
+The token number for `EOL` cannot be statically declared, so when the
+parser starts we need to look through the array of non-terminals to find
+the EOL.
+
+###### parser state
+       int tk_eol;
+
+###### find eol
+       p.tk_eol = 0;
+       while (strcmp(non_term[p.tk_eol], "EOL") != 0)
+               p.tk_eol += 1;
+       p.tk_eol += TK_reserved + config->known_count;
+
+For other tokens, we shift the next token if that is possible, otherwise
+we try to reduce a production.
+
+###### try shift or ignore
+
+       if ((tk->num == TK_newline || tk->num == TK_out) &&
+           (p.ignored_indents & 1)) {
+               /* indented, so ignore OUT and NEWLINE */
+               if (tk->num == TK_out)
+                       p.ignored_indents >>= 1;
+               free(tk);
+               tk = NULL;
+               parser_trace_action(trace, "Ignore");
+               continue;
+       }
+
+       if (shift(&p, tk->num, tk, states)) {
+               if (tk->num == TK_out)
+                       p.ignored_indents >>= 1;
+               if (tk->num == TK_in)
+                       p.ignored_indents <<= 1;
+
+               parser_trace_action(trace, "Shift");
+               tk = NULL;
+               ## did shift
+               continue;
+       } else if (tk->num == TK_newline &&
+                  shift(&p, p.tk_eol, tk, states)) {
+               tk = tok_copy(*tk);
+               parser_trace_action(trace, "ShiftEOL");
+               continue;
+       }
+
+       if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
+               /* No indent expected here and reduce is not mandatory, so ignore IN */
+               free(tk);
+               tk = NULL;
+               p.ignored_indents <<= 1;
+               p.ignored_indents |= 1;
+               parser_trace_action(trace, "Ignore");
+               continue;
         }
  
-### The heart of the parser.
+We have already discussed the bulk of the handling of a "reduce" action,
+with the `pop()` and `shift()` functions doing much of the work.  There
+is a little more complexity needed to manage storage for the asn (Abstract
+Syntax Node), and also a test of whether the reduction is permitted.
  
-Now we have the parser.  If we can shift we do, though newlines and
-reducing indenting may block that.  If not and we can reduce we do
-that.  If the production we reduced was production zero, then we have
-accepted the input and can finish.
-
-We return whatever `asn` was returned by reducing production zero.
-
-If we can neither shift nor reduce we have an error to handle.  We pop
-single entries off the stack until we can shift the `TK_error` symbol,
-then drop input tokens until we find one we can shift into the new error
-state.  We need to ensure that something is dropped or shifted after an
-error, or we could get into an infinite loop, only shifting in
-`TK_error`, then reducing.  So we track if there has been a shift since
-the last error, and if not the next error always discards one token.
-
-When we find `TK_in` and `TK_out` tokens which report indents we need
-to handle them directly as the grammar cannot express what we want to
-do with them.
-
-`TK_in` tokens are easy: we simply update indent count in the top stack frame to
-record how many indents there are following the previous token.
-
-`TK_out` tokens must be canceled against an indent count
-within the stack.  If we can reduce some symbols that are all since
-the most recent indent, then we do that first.  If the minimum prefix
-of the current state then extends back before the most recent indent,
-that indent can be cancelled.  If the minimum prefix is shorter then
-the indent had ended prematurely and we must start error handling, which
-is still a work-in-progress.
-
-`TK_newline` tokens are ignored unless the top stack frame records
-that they are permitted.  In that case they will not be considered for
-shifting if it is possible to reduce some symbols that are all since
-the most recent start of line.  This is how a newline forcibly
-terminates any line-like structure - we try to reduce down to at most
-one symbol for each line where newlines are allowed.
-A consequence of this is that a rule like
-
-###### Example: newlines - broken
-
-       Newlines ->
-               | NEWLINE Newlines
-       IfStatement -> Newlines if ....
-
-cannot work, as the NEWLINE will never be shifted as the empty string
-will be reduced first.  Optional sets of newlines need to be include
-in the thing that preceed:
-
-###### Example: newlines - works
-
-       If -> if
-               | NEWLINE If
-       IfStatement -> If ....
-
-Here the NEWLINE will be shifted because nothing can be reduced until
-the `if` is seen.
-
-When during error handling we discard tokens read in, we want to keep
-discarding until we see one that is recognised.  If we had a full set
-of LR(1) grammar states, this would mean looking in the look-ahead set,
-but we don't keep a full look-ahead set.  We only record the subset
-that leads to SHIFT.  We can, however, deduce the look-ahead set by
-looking at the SHIFT subsets for all states that we can get to by
-reducing zero or more times.  So we need a little function which
-checks if a given token is in any of these look-ahead sets.
+When we try to shift the result of reducing production-zero, it will
+fail because there is no next state.  In this case the asn will not have
+been stored on the stack, so it get stored in the `ret` variable, and we
+report that that input has been accepted.
  
-###### parser includes
-       #include "parser.h"
+###### parser vars
  
-###### parser_run
+       void *ret = NULL;
  
-       static int in_lookahead(struct token *tk, const struct state *states, int state)
-       {
-               while (state >= 0) {
-                       if (search(&states[state], tk->num) >= 0)
-                               return 1;
-                       if (states[state].reduce_prod < 0)
-                               return 0;
-                       state = search(&states[state], states[state].reduce_sym);
+###### try reduce
+
+       if (states[tos->state].reduce_prod >= 0) {
+               void **body;
+               void *res;
+               const struct state *nextstate = &states[tos->state];
+               int prod = nextstate->reduce_prod;
+               int size = reductions[prod].size;
+               int res_size = reductions[prod].result_size;
+
+               body = p.asn_stack + (p.tos - size);
+               res = res_size ? calloc(1, res_size) : NULL;
+               res_size = do_reduce(prod, body, config, res);
+               if (res_size != reductions[prod].result_size)
+                       abort();
+               pop(&p, size, do_free);
+               if (!shift(&p, reductions[prod].sym, res, states)) {
+                       accepted = 1;
+                       ret = res;
+                       parser_trace_action(trace, "Accept");
+               } else
+                       parser_trace_action(trace, "Reduce");
+               continue;
+       }
+
+If we can neither shift nor reduce we have an error to handle.  There
+are two possible responses to an error: we can pop single frames off the
+stack until we find a frame that can shift `TK_error`, or we can discard
+the current look-ahead symbol.
+
+When we first see an error we do the first of these and set a flag to
+record that we are processing an error.  If the normal shift/reduce
+tests fail to find that anything can be done from that state, we start
+dropping tokens until either we manage to shift one, or reach end-of-file.
+
+###### parser vars
+
+       int in_err = 0;
+
+###### did shift
+
+       in_err = 0;
+
+###### handle error
+
+       if (in_err) {
+               parser_trace_action(trace, "DISCARD");
+               if (tk->num == TK_eof)
+                       break;
+               free(tk);
+               tk = NULL;
+       } else {
+               struct token *err_tk;
+
+               parser_trace_action(trace, "ERROR");
+
+               err_tk = tok_copy(*tk);
+               while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
+                       // discard this state
+                       pop(&p, 1, do_free);
+               if (p.tos == 0) {
+                       free(err_tk);
+                       // no state accepted TK_error
+                       break;
                 }
-               return 0;
+               in_err = 1;
         }
  
+###### parser includes
+       #include "parser.h"
+
+###### parser_run
+
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
+                        const struct reduction reductions[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
                          void (*do_free)(short, void*),
                          FILE *trace, const char *non_term[],
                          struct token_config *config)
         {
-               struct parser p = { 0 };
-               struct token *tk = NULL;
-               int accepted = 0;
-               int shift_since_err = 1;
-               void *ret = NULL;
-
-               shift(&p, TK_eof, 0, 1, NULL, states);
-               while (!accepted && p.tos > 0) {
-                       struct token *err_tk;
-                       struct frame *tos = &p.stack[p.tos-1];
-                       if (!tk)
-                               tk = tok_copy(token_next(tokens));
-                       parser_trace(trace, &p,
-                                    tk, states, non_term, config->known_count);
-
-                       if (tk->num == TK_in) {
-                               tos->indents += 1;
-                               tos->since_newline = 0;
-                               tos->since_indent = 0;
-                               if (!states[tos->state].starts_line)
-                                       tos->newline_permitted = 0;
-                               free(tk);
-                               tk = NULL;
-                               parser_trace_action(trace, "Record");
-                               continue;
-                       }
-                       if (tk->num == TK_out) {
-                               if (states[tos->state].reduce_size >= 0 &&
-                                   states[tos->state].reduce_size <= tos->since_indent)
-                                       goto force_reduce;
-                               if (states[tos->state].min_prefix >= tos->since_indent) {
-                                       // OK to cancel
-                                       struct frame *in = tos - tos->since_indent;
-                                       in->indents -= 1;
-                                       if (in->indents == 0) {
-                                               /* Reassess since_indent and newline_permitted */
-                                               if (in > p.stack) {
-                                                       in->since_indent = in[-1].since_indent + 1;
-                                                       in->newline_permitted = in[-1].newline_permitted;
-                                               } else {
-                                                       in->since_indent = 0;
-                                                       in->newline_permitted = 0;
-                                               }
-                                               if (states[in->state].starts_line)
-                                                       in->newline_permitted = 1;
-                                               while (in < tos) {
-                                                       in += 1;
-                                                       in->since_indent = in[-1].since_indent + 1;
-                                                       if (states[in->state].starts_line)
-                                                               in->newline_permitted = 1;
-                                                       else
-                                                               in->newline_permitted = in[-1].newline_permitted;
-                                               }
-                                       }
-                                       free(tk);
-                                       tk = NULL;
-                                       parser_trace_action(trace, "Cancel");
-                                       continue;
-                               }
-                               // fall through to error handling as both SHIFT and REDUCE
-                               // will fail.
-                       }
-                       if (tk->num == TK_newline) {
-                               if (!tos->newline_permitted) {
-                                       free(tk);
-                                       tk = NULL;
-                                       parser_trace_action(trace, "Discard");
-                                       continue;
-                               }
-                               if (tos->since_newline > 1 &&
-                                   states[tos->state].reduce_size >= 0 &&
-                                   states[tos->state].reduce_size <= tos->since_newline)
-                                       goto force_reduce;
-                       }
-                       if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
-                               shift_since_err = 1;
-                               tk = NULL;
-                               parser_trace_action(trace, "Shift");
-                               continue;
-                       }
-               force_reduce:
-                       if (states[tos->state].reduce_prod >= 0 &&
-                           states[tos->state].newline_only &&
-                           !(tk->num == TK_newline ||
-                             tk->num == TK_eof ||
-                             tk->num == TK_out ||
-                             (tos->indents == 0 && tos->since_newline == 0))) {
-                               /* Anything other than newline or out or eof
-                                * in an error unless we are already at start
-                                * of line, as this production must end at EOL.
-                                */
-                       } else if (states[tos->state].reduce_prod >= 0) {
-                               void **body;
-                               void *res;
-                               const struct state *nextstate = &states[tos->state];
-                               int prod = nextstate->reduce_prod;
-                               int size = nextstate->reduce_size;
-                               int bufsize;
-                               static char buf[16*1024];
-                               short indents, start_of_line;
-
-                               body = p.asn_stack + (p.tos - size);
-
-                               bufsize = do_reduce(prod, body, config, buf);
-
-                               indents = pop(&p, size, &start_of_line,
-                                             do_free);
-                               res = memdup(buf, bufsize);
-                               memset(buf, 0, bufsize);
-                               if (!shift(&p, nextstate->reduce_sym,
-                                          indents, start_of_line,
-                                          res, states)) {
-                                       if (prod != 0) abort();
-                                       accepted = 1;
-                                       ret = res;
-                               }
-                               parser_trace_action(trace, "Reduce");
-                               continue;
-                       }
-                       /* Error. We walk up the stack until we
-                        * find a state which will accept TK_error.
-                        * We then shift in TK_error and see what state
-                        * that takes us too.
-                        * Then we discard input tokens until
-                        * we find one that is acceptable.
-                        */
-                       parser_trace_action(trace, "ERROR");
-                       short indents = 0, start_of_line;
-
-                       err_tk = tok_copy(*tk);
-                       while (p.tos > 0 &&
-                              shift(&p, TK_error, 0, 0,
-                                    err_tk, states) == 0)
-                               // discard this state
-                               indents += pop(&p, 1, &start_of_line, do_free);
-                       if (p.tos == 0) {
-                               free(err_tk);
-                               // no state accepted TK_error
-                               break;
-                       }
-                       if (!shift_since_err) {
-                               /* must discard at least one token to avoid
-                                * infinite loop.
-                                */
-                               if (tk->num == TK_eof)
-                                       break;
-                               free(tk);
-                               tk = tok_copy(token_next(tokens));
-                       }
-                       shift_since_err = 0;
-                       tos = &p.stack[p.tos-1];
-                       while (!in_lookahead(tk, states, tos->state) &&
-                              tk->num != TK_eof) {
-                               free(tk);
-                               tk = tok_copy(token_next(tokens));
-                               shift_since_err = 1;
-                               if (tk->num == TK_in)
-                                       indents += 1;
-                               if (tk->num == TK_out) {
-                                       if (indents == 0)
-                                               break;
-                                       indents -= 1;
-                                       // FIXME update since_indent here
-                               }
-                       }
-                       tos->indents += indents;
-               }
+               ## parser vars
+
+               ## find eol
+
+               ## heart of parser
+
                 free(tk);
-               pop(&p, p.tos, NULL, do_free);
+               pop(&p, p.tos, do_free);
                 free(p.asn_stack);
                 free(p.stack);
                 return ret;
@@ -3104,6 +2949,7 @@ checks if a given token is in any of these look-ahead sets.
  ###### exported functions
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
+                        const struct reduction reductions[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
                          void (*do_free)(short, void*),
                          FILE *trace, const char *non_term[],
@@ -3135,15 +2981,6 @@ end inside square brackets.
                 [TK_newline]      = "NEWLINE",
                 [TK_eof]          = "$eof",
         };
-       static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
-       {
-               fprintf(trace, "(%d", f->state);
-               if (states[f->state].starts_line)
-                       fprintf(trace, "s");
-               if (f->newline_permitted)
-                       fprintf(trace, "n%d", f->since_newline);
-               fprintf(trace, ") ");
-       }
  
         void parser_trace(FILE *trace, struct parser *p,
                           struct token *tk, const struct state states[],
@@ -3165,13 +3002,9 @@ end inside square brackets.
                                 } else
                                         fputs(non_term[sym - TK_reserved - knowns],
                                               trace);
-                               if (f->indents)
-                                       fprintf(trace, ".%d", f->indents);
-                               if (f->since_newline == 0)
-                                       fputs("/", trace);
                                 fputs(" ", trace);
                         }
-                       parser_trace_state(trace, f, states);
+                       fprintf(trace, "(%d) ", f->state);
                 }
                 fprintf(trace, "[");
                 if (tk->num < TK_reserved &&
@@ -3192,12 +3025,12 @@ end inside square brackets.
  
  The obvious example for a parser is a calculator.
  
-As `scanner` provides number parsing function using `libgmp` is it not much
+As `scanner` provides number parsing function using `libgmp` it is not much
  work to perform arbitrary rational number calculations.
  
-This calculator takes one expression, or an equality test, per line.  The
-results are printed and if any equality test fails, the program exits with
-an error.
+This calculator takes one expression, or an equality test, per line.
+The results are printed and if any equality test fails, the program
+exits with an error.
  
  ###### File: parsergen.mk
         calc.c calc.h : parsergen parsergen.mdc