parsergen: allow $$OUT as well as $$NEWLINE

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index f99d318482d9f1b3568515a3c2d4440dc75b94aa..76fec3a70a779cefe4589a9d79e861a61acd726e 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -21,7 +21,6 @@ There are several distinct sections.
     `parsergen` program built from the C code in this file can extract
     that grammar directly from this file and process it.
  
     `parsergen` program built from the C code in this file can extract
     that grammar directly from this file and process it.
  
-
  ###### File: parsergen.c
         #include <unistd.h>
         #include <stdlib.h>
  ###### File: parsergen.c
         #include <unistd.h>
         #include <stdlib.h>
@@ -63,7 +62,8 @@ If the `--tag` option is given, then any top level header that doesn't
  start with the tag is ignored, and the tag is striped from the rest.  So
  `--tag Foo`
  means that the three needed sections must be `Foo: header`, `Foo: code`,
  start with the tag is ignored, and the tag is striped from the rest.  So
  `--tag Foo`
  means that the three needed sections must be `Foo: header`, `Foo: code`,
-and `Foo: grammar`.
+and `Foo: grammar`.  The tag `calc` is used to extract the same calculator
+from this file.
  
  [mdcode]: mdcode.html
  [scanner]: scanner.html
  
  [mdcode]: mdcode.html
  [scanner]: scanner.html
@@ -100,6 +100,7 @@ symbol.
         struct production {
                 unsigned short precedence;
                 enum assoc assoc;
         struct production {
                 unsigned short precedence;
                 enum assoc assoc;
+               char line_like;
                 ## production fields
         };
         struct grammar {
                 ## production fields
         };
         struct grammar {
@@ -277,6 +278,9 @@ declares how it associates.  This level is stored in each symbol
  listed and may be inherited by any production which uses the symbol.  A
  production inherits from the last symbol which has a precedence.
  
  listed and may be inherited by any production which uses the symbol.  A
  production inherits from the last symbol which has a precedence.
  
+The symbols on the first precedence line have the lowest precedence.
+Subsequent lines introduce symbols with higher precedence.
+
  ###### grammar fields
         struct text current_type;
         int type_isref;
  ###### grammar fields
         struct text current_type;
         int type_isref;
@@ -350,6 +354,7 @@ production inherits from the last symbol which has a precedence.
                         s->precedence = g->prec_levels;
                         s->assoc = assoc;
                         found += 1;
                         s->precedence = g->prec_levels;
                         s->assoc = assoc;
                         found += 1;
+                       t = token_next(ts);
                 }
                 if (found == 0)
                         err = "No symbols given on precedence line";
                 }
                 if (found == 0)
                         err = "No symbols given on precedence line";
@@ -437,7 +442,7 @@ the end.
                 return code;
         }
  
                 return code;
         }
  
-Now we have all the bit we need to parse a full production.
+Now we have all the bits we need to parse a full production.
  
  ###### includes
         #include <memory.h>
  
  ###### includes
         #include <memory.h>
@@ -451,6 +456,7 @@ Now we have all the bit we need to parse a full production.
         struct symbol **body;
         int             body_size;
         struct text     code;
         struct symbol **body;
         int             body_size;
         struct text     code;
+       int             code_line;
  
  ###### symbol fields
         int first_production;
  
  ###### symbol fields
         int first_production;
@@ -491,15 +497,21 @@ Now we have all the bit we need to parse a full production.
                                 goto abort;
                         }
                         vs = sym_find(g, tk.txt);
                                 goto abort;
                         }
                         vs = sym_find(g, tk.txt);
-                       if (vs->type != Virtual) {
-                               err = "symbol after $$ must be virtual";
+                       if (vs->num == TK_newline)
+                               p.line_like = 1;
+                       else if (vs->num == TK_out)
+                               p.line_like = 2;
+                       else if (vs->precedence == 0) {
+                               err = "symbol after $$ must have precedence";
                                 goto abort;
                                 goto abort;
+                       } else {
+                               p.precedence = vs->precedence;
+                               p.assoc = vs->assoc;
                         }
                         }
-                       p.precedence = vs->precedence;
-                       p.assoc = vs->assoc;
                         tk = token_next(state);
                 }
                 if (tk.num == TK_open) {
                         tk = token_next(state);
                 }
                 if (tk.num == TK_open) {
+                       p.code_line = tk.line;
                         p.code = collect_code(state, tk);
                         if (p.code.txt == NULL) {
                                 err = "code fragment not closed properly";
                         p.code = collect_code(state, tk);
                         if (p.code.txt == NULL) {
                                 err = "code fragment not closed properly";
@@ -552,7 +564,11 @@ where `START` is the first non-terminal given.
         p->head->first_production = g->production_count;
         array_add(&g->productions, &g->production_count, p);
  
         p->head->first_production = g->production_count;
         array_add(&g->productions, &g->production_count, p);
  
-Now we are ready to read in the grammar.
+Now we are ready to read in the grammar.  We ignore comments
+and strings so that the marks which introduce them can be
+used as terminals in the grammar.  We don't ignore numbers
+even though we don't allow them as that causes the scanner
+to produce errors that the parser is better positioned to handle.
  
  ###### grammar_read
         static struct grammar *grammar_read(struct code_node *code)
  
  ###### grammar_read
         static struct grammar *grammar_read(struct code_node *code)
@@ -725,10 +741,10 @@ or `-1` to indicate failure.
         }
  
  We will often want to form the union of two symsets.  When we do, we
         }
  
  We will often want to form the union of two symsets.  When we do, we
-will often want to know if anything changed (as they might mean there
+will often want to know if anything changed (as that might mean there
  is more work to do).  So `symset_union` reports whether anything was
  added to the first set.  We use a slow quadratic approach as these
  is more work to do).  So `symset_union` reports whether anything was
  added to the first set.  We use a slow quadratic approach as these
-sets don't really get very big.  If profiles shows this to be a problem is
+sets don't really get very big.  If profiles shows this to be a problem it
  can be optimised later.
  
         static int symset_union(struct symset *a, struct symset *b)
  can be optimised later.
  
         static int symset_union(struct symset *a, struct symset *b)
@@ -831,7 +847,6 @@ array like the productions.
                 return sl->ss;
         }
  
                 return sl->ss;
         }
  
-
  ### Setting `nullable`
  
  We set `nullable` on the head symbol for any production for which all
  ### Setting `nullable`
  
  We set `nullable` on the head symbol for any production for which all
@@ -869,7 +884,7 @@ changes happen.
                 }
         }
  
                 }
         }
  
-### Setting `can_eol` and `line_like`
+### Setting `line_like`
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
@@ -877,30 +892,25 @@ which states can start a "line-like" section of code.  We ignore
  newlines when there is an indent since the most recent start of a
  line-like symbol.
  
  newlines when there is an indent since the most recent start of a
  line-like symbol.
  
-To know which symbols are line-like, we first need to know which
-symbols start with a NEWLINE token.  Any symbol which is followed by a
-NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
-Certainly when trying to parse one of these we must take not of NEWLINEs.
-
-Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
-which is the head of a production that contains a starts-with-NEWLINE
-symbol preceeded only by nullable symbols is also a
-starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
-this attribute of symbols, and compute it in a repetitive manner
-similar to `set_nullable`.
+A "line_like" symbol is simply any symbol that can derive a NEWLINE.
+If a symbol cannot derive a NEWLINE, then it is only part of a line -
+so is word-like.  If it can derive a NEWLINE, then we consider it to
+be like a line.
  
  
-Once we have that, we can determine which symbols are `line_like` be
-seeing which are followed by a `can_eol` symbol in any production.
+Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
+is the head of a production that contains a line_like symbol is also a
+line-like symbol.  We use a new field `line_like` to record this
+attribute of symbols, and compute it in a repetitive manner similar to
+`set_nullable`.
  
  ###### symbol fields
  
  ###### symbol fields
-       int can_eol;
         int line_like;
  
  ###### functions
         int line_like;
  
  ###### functions
-       static void set_can_eol(struct grammar *g)
+       static void set_line_like(struct grammar *g)
         {
                 int check_again = 1;
         {
                 int check_again = 1;
-               g->symtab[TK_newline]->can_eol = 1;
+               g->symtab[TK_newline]->line_like = 1;
                 while (check_again) {
                         int p;
                         check_again = 0;
                 while (check_again) {
                         int p;
                         check_again = 0;
@@ -908,35 +918,20 @@ seeing which are followed by a `can_eol` symbol in any production.
                                 struct production *pr = g->productions[p];
                                 int s;
  
                                 struct production *pr = g->productions[p];
                                 int s;
  
-                               if (pr->head->can_eol)
+                               if (pr->head->line_like)
                                         continue;
  
                                 for (s = 0 ; s < pr->body_size; s++) {
                                         continue;
  
                                 for (s = 0 ; s < pr->body_size; s++) {
-                                       if (pr->body[s]->can_eol) {
-                                               pr->head->can_eol = 1;
+                                       if (pr->body[s]->line_like) {
+                                               pr->head->line_like = 1;
                                                 check_again = 1;
                                                 break;
                                         }
                                                 check_again = 1;
                                                 break;
                                         }
-                                       if (!pr->body[s]->nullable)
-                                               break;
                                 }
                         }
                 }
         }
  
                                 }
                         }
                 }
         }
  
-       static void set_line_like(struct grammar *g)
-       {
-               int p;
-               for (p = 0; p < g->production_count; p++) {
-                       struct production *pr = g->productions[p];
-                       int s;
-
-                       for (s = 1; s < pr->body_size; s++)
-                               if (pr->body[s]->can_eol)
-                                       pr->body[s-1]->line_like = 1;
-               }
-       }
-
  ### Building the `first` sets
  
  When calculating what can follow a particular non-terminal, we will need to
  ### Building the `first` sets
  
  When calculating what can follow a particular non-terminal, we will need to
@@ -950,7 +945,7 @@ we repeat the calculations until no changes happen, just like with
  `set_nullable`.  This makes use of the fact that `symset_union`
  reports if any change happens.
  
  `set_nullable`.  This makes use of the fact that `symset_union`
  reports if any change happens.
  
-The core of this which finds the "first" of part of a production body
+The core of this, which finds the "first" of part of a production body,
  will be reused for computing the "follow" sets, so we split it out
  into a separate function.
  
  will be reused for computing the "follow" sets, so we split it out
  into a separate function.
  
@@ -1166,6 +1161,26 @@ can just compare the symset and the data values together.
                         a.data[i] - b.data[i];
         }
  
                         a.data[i] - b.data[i];
         }
  
+It will be helpful to know if an itemset has been "completed" or not,
+particularly for LALR where itemsets get merged, at which point they
+need to be consider for completion again.  So  a `completed` flag is needed.
+
+For correct handling of `TK_newline` when parsing, we will need to
+know which states (itemsets) can occur at the start of a line, so we
+will record a `starts_line` flag too whenever DOT is at the start of a
+`line_like` symbol.
+
+Finally, for handling `TK_out` we need to know whether productions in the
+current state started *before* the most recent indent.  A state
+doesn't usually keep details of individual productions, so we need to
+add one extra detail. `min_prefix` is the smallest non-zero number of
+symbols *before* DOT in any production in an itemset.  This will allow
+us to determine if the the most recent indent is sufficiently recent
+to cancel it against a `TK_out`.  If it was seen longer ago than the
+`min_prefix`, and if the current state cannot be reduced, then the
+indented section must have ended in the middle of a syntactic unit, so
+an error must be signaled.
+
  And now we can build the list of itemsets.  The lookup routine returns
  both a success flag and a pointer to where in the list an insert
  should happen, so we don't need to search a second time.
  And now we can build the list of itemsets.  The lookup routine returns
  both a success flag and a pointer to where in the list an insert
  should happen, so we don't need to search a second time.
@@ -1176,6 +1191,8 @@ should happen, so we don't need to search a second time.
                 short state;
                 struct symset items;
                 struct symset go_to;
                 short state;
                 struct symset items;
                 struct symset go_to;
+               enum assoc assoc;
+               unsigned short precedence;
                 char completed;
                 char starts_line;
                 int min_prefix;
                 char completed;
                 char starts_line;
                 int min_prefix;
@@ -1216,6 +1233,7 @@ recalculated and their LA sets updated.
  them to a data structure, of freeing them.
  
         static int add_itemset(struct grammar *g, struct symset ss,
  them to a data structure, of freeing them.
  
         static int add_itemset(struct grammar *g, struct symset ss,
+                              enum assoc assoc, unsigned short precedence,
                                enum grammar_type type)
         {
                 struct itemset **where, *is;
                                enum grammar_type type)
         {
                 struct itemset **where, *is;
@@ -1226,6 +1244,8 @@ them to a data structure, of freeing them.
                         is->state = g->states;
                         g->states += 1;
                         is->items = ss;
                         is->state = g->states;
                         g->states += 1;
                         is->items = ss;
+                       is->assoc = assoc;
+                       is->precedence = precedence;
                         is->next = *where;
                         is->go_to = INIT_DATASET;
                         *where = is;
                         is->next = *where;
                         is->go_to = INIT_DATASET;
                         *where = is;
@@ -1269,11 +1289,18 @@ be supplemented by the LA set for the item which produce the new item.
  
  We also collect a set of all symbols which follow "DOT" (in `done`) as this
  is used in the next stage.
  
  We also collect a set of all symbols which follow "DOT" (in `done`) as this
  is used in the next stage.
-If any of these symbols are flagged as starting a line, then this
+If any of these symbols are flagged as `line_like`, then this
  state must be a `starts_line` state so now is a good time to record that.
  
  state must be a `starts_line` state so now is a good time to record that.
  
-NOTE: precedence handling should happen here - I haven't written this yet
-though.
+When itemsets are created we assign a precedence to the itemset from
+the complete item, if there is one.  We ignore the possibility of
+there being two and don't (currently) handle precedence in such
+grammars.  When completing a grammar we ignore any item where DOT is
+followed by a terminal with a precedence lower than that for the
+itemset.  Unless the terminal has right associativity, we also ignore
+items where the terminal has the same precedence.  The result is that
+unwanted items are still in the itemset, but the terminal doesn't get
+into the go to set, so the item is ineffective.
  
  ###### complete itemset
         for (i = 0; i < is->items.cnt; i++) {
  
  ###### complete itemset
         for (i = 0; i < is->items.cnt; i++) {
@@ -1284,6 +1311,8 @@ though.
                 struct symbol *s;
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
                 struct symbol *s;
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
+               struct symset LAnl = INIT_SYMSET;
+               unsigned short snnl = 0;
  
                 if (is->min_prefix == 0 ||
                     (bs > 0 && bs < is->min_prefix))
  
                 if (is->min_prefix == 0 ||
                     (bs > 0 && bs < is->min_prefix))
@@ -1291,6 +1320,18 @@ though.
                 if (bs == pr->body_size)
                         continue;
                 s = pr->body[bs];
                 if (bs == pr->body_size)
                         continue;
                 s = pr->body[bs];
+               if (s->precedence && is->precedence &&
+                   is->precedence > s->precedence)
+                       /* This terminal has a low precedence and
+                        * shouldn't be shifted
+                        */
+                       continue;
+               if (s->precedence && is->precedence &&
+                   is->precedence == s->precedence && s->assoc != Right)
+                       /* This terminal has a matching precedence and is
+                        * not Right-associative, so we mustn't shift it.
+                        */
+                       continue;
                 if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
                         if (s->line_like)
                 if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
                         if (s->line_like)
@@ -1309,6 +1350,10 @@ though.
                         }
                         sn = save_set(g, LA);
                         LA = set_find(g, sn);
                         }
                         sn = save_set(g, LA);
                         LA = set_find(g, sn);
+                       if (symset_find(&LA, TK_newline))
+                               symset_add(&LAnl, TK_newline, 0);
+                       snnl = save_set(g, LAnl);
+                       LAnl = set_find(g, snnl);
                 }
  
                 /* Add productions for this symbol */
                 }
  
                 /* Add productions for this symbol */
@@ -1319,19 +1364,25 @@ though.
                         int itm = item_num(p2, 0);
                         int pos = symset_find(&is->items, itm);
                         if (pos < 0) {
                         int itm = item_num(p2, 0);
                         int pos = symset_find(&is->items, itm);
                         if (pos < 0) {
-                               symset_add(&is->items, itm, sn);
+                               if (g->productions[p2]->line_like)
+                                       symset_add(&is->items, itm, snnl);
+                               else
+                                       symset_add(&is->items, itm, sn);
                                 /* Will have re-ordered, so start
                                  * from beginning again */
                                 i = -1;
                         } else if (type >= LALR) {
                                 struct symset ss = set_find(g, is->items.data[pos]);
                                 struct symset tmp = INIT_SYMSET;
                                 /* Will have re-ordered, so start
                                  * from beginning again */
                                 i = -1;
                         } else if (type >= LALR) {
                                 struct symset ss = set_find(g, is->items.data[pos]);
                                 struct symset tmp = INIT_SYMSET;
+                               struct symset *la = &LA;
  
  
+                               if (g->productions[p2]->line_like)
+                                       la = &LAnl;
                                 symset_union(&tmp, &ss);
                                 symset_union(&tmp, &ss);
-                               if (symset_union(&tmp, &LA)) {
+                               if (symset_union(&tmp, la)) {
                                         is->items.data[pos] = save_set(g, tmp);
                                         i = -1;
                                         is->items.data[pos] = save_set(g, tmp);
                                         i = -1;
-                               }else
+                               } else
                                         symset_free(tmp);
                         }
                 }
                                         symset_free(tmp);
                         }
                 }
@@ -1350,6 +1401,8 @@ with a pre-existing itemset).
                 int j;
                 unsigned short state;
                 struct symbol *sym = g->symtab[done.syms[i]];
                 int j;
                 unsigned short state;
                 struct symbol *sym = g->symtab[done.syms[i]];
+               enum assoc assoc = Non;
+               unsigned short precedence = 0;
                 struct symset newitemset = INIT_SYMSET;
                 if (type >= LALR)
                         newitemset = INIT_DATASET;
                 struct symset newitemset = INIT_SYMSET;
                 if (type >= LALR)
                         newitemset = INIT_DATASET;
@@ -1369,6 +1422,13 @@ with a pre-existing itemset).
                         if (type >= LALR)
                                 la = is->items.data[j];
                         pos = symset_find(&newitemset, pr->head->num);
                         if (type >= LALR)
                                 la = is->items.data[j];
                         pos = symset_find(&newitemset, pr->head->num);
+                       if (bp + 1 == pr->body_size &&
+                           pr->precedence > 0 &&
+                           pr->precedence > precedence) {
+                               // new itemset is reducible and has a precedence.
+                               precedence = pr->precedence;
+                               assoc = pr->assoc;
+                       }
                         if (pos < 0)
                                 symset_add(&newitemset, item_num(p, bp+1), la);
                         else if (type >= LALR) {
                         if (pos < 0)
                                 symset_add(&newitemset, item_num(p, bp+1), la);
                         else if (type >= LALR) {
@@ -1386,12 +1446,12 @@ with a pre-existing itemset).
                                 }
                         }
                 }
                                 }
                         }
                 }
-               state = add_itemset(g, newitemset, type);
+               state = add_itemset(g, newitemset, assoc, precedence, type);
                 if (symset_find(&is->go_to, done.syms[i]) < 0)
                         symset_add(&is->go_to, done.syms[i], state);
         }
  
                 if (symset_find(&is->go_to, done.syms[i]) < 0)
                         symset_add(&is->go_to, done.syms[i], state);
         }
  
-All that is left is to crate the initial itemset from production zero, and
+All that is left is to create the initial itemset from production zero, and
  with `TK_eof` as the LA set.
  
  ###### functions
  with `TK_eof` as the LA set.
  
  ###### functions
@@ -1410,7 +1470,7 @@ with `TK_eof` as the LA set.
                 }
                 // production 0, offset 0 (with no data)
                 symset_add(&first, item_num(0, 0), la);
                 }
                 // production 0, offset 0 (with no data)
                 symset_add(&first, item_num(0, 0), la);
-               add_itemset(g, first, type);
+               add_itemset(g, first, Non, 0, type);
                 for (again = 0, is = g->items;
                      is;
                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
                 for (again = 0, is = g->items;
                      is;
                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
@@ -1470,7 +1530,6 @@ changeover point in `first_nonterm`.
                         g->symtab[s->num] = s;
  
                 set_nullable(g);
                         g->symtab[s->num] = s;
  
                 set_nullable(g);
-               set_can_eol(g);
                 set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
                 set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
@@ -1503,8 +1562,8 @@ all the tables that have been generated, plus a description of any conflicts.
  
  Firstly we have the complete list of symbols, together with the
  "FIRST" set if that was generated.  We add a mark to each symbol to
  
  Firstly we have the complete list of symbols, together with the
  "FIRST" set if that was generated.  We add a mark to each symbol to
-show if it can end in a newline (`>`), if it implies the start of a
-line (`<`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it is considered to be
+"line-like" (`<`), or if it is nullable (`.`).
  
  ###### functions
  
  
  ###### functions
  
@@ -1521,9 +1580,8 @@ line (`<`), or if it is nullable (`.`).
                         if (!s)
                                 continue;
  
                         if (!s)
                                 continue;
  
-                       printf(" %c%c%c%3d%c: ",
+                       printf(" %c%c%3d%c: ",
                                s->nullable ? '.':' ',
                                s->nullable ? '.':' ',
-                              s->can_eol ? '>':' ',
                                s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
                                s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
@@ -1586,9 +1644,19 @@ it up a bit.  First the items, with production number and associativity.
                 if (dot == pr->body_size)
                         printf(" .");
                 printf(" [%d]", p);
                 if (dot == pr->body_size)
                         printf(" .");
                 printf(" [%d]", p);
-               if (pr->precedence)
+               if (pr->precedence && dot == pr->body_size)
                         printf(" (%d%s)", pr->precedence,
                                assoc_names[pr->assoc]);
                         printf(" (%d%s)", pr->precedence,
                                assoc_names[pr->assoc]);
+               if (dot < pr->body_size &&
+                   pr->body[dot]->precedence) {
+                       struct symbol *s = pr->body[dot];
+                       printf(" [%d%s]", s->precedence,
+                              assoc_names[s->assoc]);
+               }
+               if (pr->line_like == 1)
+                       printf(" $$NEWLINE");
+               else if (pr->line_like)
+                       printf(" $$OUT");
                 printf("\n");
         }
  
                 printf("\n");
         }
  
@@ -1611,7 +1679,6 @@ The LA sets which are (possibly) reported with each item:
  
  Then the go to sets:
  
  
  Then the go to sets:
  
-
         static void report_goto(struct grammar *g, struct symset gt)
         {
                 int i;
         static void report_goto(struct grammar *g, struct symset gt)
         {
                 int i;
@@ -1633,8 +1700,11 @@ Now we can report all the item sets complete with items, LA sets, and GO TO.
                 for (s = 0; s < g->states; s++) {
                         int j;
                         struct itemset *is = g->statetab[s];
                 for (s = 0; s < g->states; s++) {
                         int j;
                         struct itemset *is = g->statetab[s];
-                       printf("  Itemset %d:%s min prefix=%d\n",
+                       printf("  Itemset %d:%s min prefix=%d",
                                s, is->starts_line?" (startsline)":"", is->min_prefix);
                                s, is->starts_line?" (startsline)":"", is->min_prefix);
+                       if (is->precedence)
+                               printf(" %d%s", is->precedence, assoc_names[is->assoc]);
+                       printf("\n");
                         for (j = 0; j < is->items.cnt; j++) {
                                 report_item(g, is->items.syms[j]);
                                 if (is->items.data != NO_DATA)
                         for (j = 0; j < is->items.cnt; j++) {
                                 report_item(g, is->items.syms[j]);
                                 if (is->items.data != NO_DATA)
@@ -1670,7 +1740,7 @@ LR1 are also similar as they have FOLLOW or LA sets.
  LR0 conflicts are any state which have both a reducible item and
  a shiftable item, or two reducible items.
  
  LR0 conflicts are any state which have both a reducible item and
  a shiftable item, or two reducible items.
  
-LR05 conflicts only occurs if two possibly reductions exist,
+LR05 conflicts only occur if two possible reductions exist,
  as shifts always over-ride reductions.
  
  ###### conflict functions
  as shifts always over-ride reductions.
  
  ###### conflict functions
@@ -1725,6 +1795,11 @@ terminals to items where that terminal could be shifted and another
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
+As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
+terminal, we ignore it.  NEWLINES are handled specially with its own
+rules for when to shift and when to reduce.  Conflicts are expected,
+but handled internally.
+
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
                 int i;
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
                 int i;
@@ -1743,16 +1818,22 @@ is in look-ahead.  We report when we get conflicts between the two.
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
+                               struct symbol *s;
  
  
-                               if (bp < pr->body_size &&
-                                   pr->body[bp]->type == Terminal) {
-                                       /* shiftable */
-                                       int sym = pr->body[bp]->num;
-                                       if (symset_find(&shifts, sym) < 0)
-                                               symset_add(&shifts, sym, itm);
-                               }
+                               if (bp >= pr->body_size ||
+                                   pr->body[bp]->type != Terminal)
+                                       /* not shiftable */
+                                       continue;
+
+                               s = pr->body[bp];
+                               if (s->precedence && is->precedence)
+                                       /* Precedence resolves this, so no conflict */
+                                       continue;
+
+                               if (symset_find(&shifts, s->num) < 0)
+                                       symset_add(&shifts, s->num, itm);
                         }
                         }
-                       /* Now look for reduction and conflicts */
+                       /* Now look for reductions and conflicts */
                         for (j = 0; j < is->items.cnt; j++) {
                                 unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
                         for (j = 0; j < is->items.cnt; j++) {
                                 unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
@@ -1770,13 +1851,13 @@ is in look-ahead.  We report when we get conflicts between the two.
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
-                                       if (pos >= 0) {
+                                       if (pos >= 0 && la.syms[k] != TK_newline) {
                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
-                                               prtxt(g->symtab[la.syms[k]]->name);
+                                               cnt++;
+                                                       prtxt(g->symtab[la.syms[k]]->name);
                                                 printf(":\n");
                                                 report_item(g, shifts.data[pos]);
                                                 report_item(g, itm);
                                                 printf(":\n");
                                                 report_item(g, shifts.data[pos]);
                                                 report_item(g, itm);
-                                               cnt++;
                                         }
                                         pos = symset_find(&reduce, la.syms[k]);
                                         if (pos < 0) {
                                         }
                                         pos = symset_find(&reduce, la.syms[k]);
                                         if (pos < 0) {
@@ -1797,7 +1878,6 @@ is in look-ahead.  We report when we get conflicts between the two.
                 return cnt;
         }
  
                 return cnt;
         }
  
-
  ## Generating the parser
  
  The exported part of the parser is the `parse_XX` function, where the name
  ## Generating the parser
  
  The exported part of the parser is the `parse_XX` function, where the name
@@ -1814,13 +1894,14 @@ pieces of code provided in the grammar file, so they are generated first.
  
  ###### parser_generate
  
  
  ###### parser_generate
  
-       static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
+       static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
+                              struct code_node *pre_reduce)
         {
                 gen_known(f, g);
                 gen_non_term(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
         {
                 gen_known(f, g);
                 gen_non_term(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
-               gen_reduce(f, g, file);
+               gen_reduce(f, g, file, pre_reduce);
                 gen_free(f, g);
  
                 fprintf(f, "#line 0 \"gen_parser\"\n");
                 gen_free(f, g);
  
                 fprintf(f, "#line 0 \"gen_parser\"\n");
@@ -1841,7 +1922,9 @@ pieces of code provided in the grammar file, so they are generated first.
  ### Known words table
  
  The known words table is simply an array of terminal symbols.
  ### Known words table
  
  The known words table is simply an array of terminal symbols.
-The table of nonterminals used for tracing is a similar array.
+The table of nonterminals used for tracing is a similar array.  We
+include virtual symbols in the table of non_terminals to keep the
+numbers right.
  
  ###### functions
  
  
  ###### functions
  
@@ -1866,7 +1949,7 @@ The table of nonterminals used for tracing is a similar array.
                 for (i = TK_reserved;
                      i < g->num_syms;
                      i++)
                 for (i = TK_reserved;
                      i < g->num_syms;
                      i++)
-                       if (g->symtab[i]->type == Nonterminal)
+                       if (g->symtab[i]->type != Terminal)
                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                         g->symtab[i]->name.txt);
                 fprintf(f, "};\n\n");
                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                         g->symtab[i]->name.txt);
                 fprintf(f, "};\n\n");
@@ -1896,12 +1979,11 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short reduce_prod;
                 short reduce_size;
                 short reduce_sym;
                 short reduce_prod;
                 short reduce_size;
                 short reduce_sym;
-               short shift_sym;
-               short starts_line;
+               char starts_line;
+               char newline_only;
                 short min_prefix;
         };
  
                 short min_prefix;
         };
  
-
  ###### functions
  
         static void gen_goto(FILE *f, struct grammar *g)
  ###### functions
  
         static void gen_goto(FILE *f, struct grammar *g)
@@ -1930,23 +2012,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                 for (i = 0; i < g->states; i++) {
                         struct itemset *is = g->statetab[i];
                         int j, prod = -1, prod_len;
                 for (i = 0; i < g->states; i++) {
                         struct itemset *is = g->statetab[i];
                         int j, prod = -1, prod_len;
-                       int shift_sym = -1;
-                       int shift_len = 0, shift_remain = 0;
+
                         for (j = 0; j < is->items.cnt; j++) {
                                 int itm = is->items.syms[j];
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
  
                         for (j = 0; j < is->items.cnt; j++) {
                                 int itm = is->items.syms[j];
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
  
-                               if (bp < pr->body_size) {
-                                       if (shift_sym < 0 ||
-                                           (shift_len == bp && shift_remain > pr->body_size - bp)) {
-                                               shift_sym = pr->body[bp]->num;
-                                               shift_len = bp;
-                                               shift_remain = pr->body_size - bp;
-                                       }
+                               if (bp < pr->body_size)
                                         continue;
                                         continue;
-                               }
                                 /* This is what we reduce */
                                 if (prod < 0 || prod_len < pr->body_size) {
                                         prod = p;
                                 /* This is what we reduce */
                                 if (prod < 0 || prod_len < pr->body_size) {
                                         prod = p;
@@ -1955,14 +2029,16 @@ The go to table is stored in a simple array of `sym` and corresponding
                         }
  
                         if (prod >= 0)
                         }
  
                         if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
                                         i, is->go_to.cnt, i, prod,
                                         g->productions[prod]->body_size,
                                         g->productions[prod]->head->num,
                                         i, is->go_to.cnt, i, prod,
                                         g->productions[prod]->body_size,
                                         g->productions[prod]->head->num,
-                                       is->starts_line, is->min_prefix);
+                                       is->starts_line,
+                                       g->productions[prod]->line_like,
+                                       is->min_prefix);
                         else
                         else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
-                                       i, is->go_to.cnt, i, shift_sym,
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
+                                       i, is->go_to.cnt, i,
                                         is->starts_line, is->min_prefix);
                 }
                 fprintf(f, "};\n\n");
                                         is->starts_line, is->min_prefix);
                 }
                 fprintf(f, "};\n\n");
@@ -1989,7 +2065,7 @@ can access the larger structure using pointer manipulation.
  The code fragment requires translation when written out.  Any `$N` needs to
  be converted to a reference either to that buffer (if `$0`) or to the
  structure returned by a previous reduction.  These pointers need to be cast
  The code fragment requires translation when written out.  Any `$N` needs to
  be converted to a reference either to that buffer (if `$0`) or to the
  structure returned by a previous reduction.  These pointers need to be cast
-to the appropriate type for each access.  All this is handling in
+to the appropriate type for each access.  All this is handled in
  `gen_code`.
  
  `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
  `gen_code`.
  
  `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
@@ -2054,31 +2130,40 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                 fputs("\n", f);
                 for (i = 0; i < p->body_size; i++) {
                         if (p->body[i]->struct_name.txt &&
                 fputs("\n", f);
                 for (i = 0; i < p->body_size; i++) {
                         if (p->body[i]->struct_name.txt &&
-                           p->body[i]->isref &&
-                           used[i])
+                           used[i]) {
                                 // assume this has been copied out
                                 // assume this has been copied out
-                               fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+                               if (p->body[i]->isref)
+                                       fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+                               else
+                                       fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
+                       }
                 }
                 free(used);
         }
  
  ###### functions
  
                 }
                 free(used);
         }
  
  ###### functions
  
-       static void gen_reduce(FILE *f, struct grammar *g, char *file)
+       static void gen_reduce(FILE *f, struct grammar *g, char *file,
+                              struct code_node *code)
         {
                 int i;
         {
                 int i;
-               fprintf(f, "#line 0 \"gen_reduce\"\n");
+               fprintf(f, "#line 1 \"gen_reduce\"\n");
                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
                 fprintf(f, "{\n");
                 fprintf(f, "\tint ret_size = 0;\n");
                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
                 fprintf(f, "{\n");
                 fprintf(f, "\tint ret_size = 0;\n");
+               if (code)
+                       code_node_print(f, code, file);
  
  
+               fprintf(f, "#line 4 \"gen_reduce\"\n");
                 fprintf(f, "\tswitch(prod) {\n");
                 for (i = 0; i < g->production_count; i++) {
                         struct production *p = g->productions[i];
                         fprintf(f, "\tcase %d:\n", i);
  
                 fprintf(f, "\tswitch(prod) {\n");
                 for (i = 0; i < g->production_count; i++) {
                         struct production *p = g->productions[i];
                         fprintf(f, "\tcase %d:\n", i);
  
-                       if (p->code.txt)
+                       if (p->code.txt) {
+                               fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
                                 gen_code(p, f, g);
                                 gen_code(p, f, g);
+                       }
  
                         if (p->head->struct_name.txt)
                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
  
                         if (p->head->struct_name.txt)
                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
@@ -2103,7 +2188,7 @@ recovery where individual stack frames might need to be freed.
  For this, the grammar author is required to defined a `free_XX` function for
  each structure that is used by a non-terminal.  `do_free` will call whichever
  is appropriate given a symbol number, and will call `free` (as is
  For this, the grammar author is required to defined a `free_XX` function for
  each structure that is used by a non-terminal.  `do_free` will call whichever
  is appropriate given a symbol number, and will call `free` (as is
-appropriate for tokens on any terminal symbol.
+appropriate for tokens) on any terminal symbol.
  
  ###### functions
  
  
  ###### functions
  
@@ -2228,7 +2313,7 @@ grammar file).
  To be able to run `mdcode` and `scanner` on the grammar we need to memory
  map it.
  
  To be able to run `mdcode` and `scanner` on the grammar we need to memory
  map it.
  
-One we have extracted the code (with `mdcode`) we expect to find three
+Once we have extracted the code (with `mdcode`) we expect to find three
  sections: header, code, and grammar.  Anything else that is not
  excluded by the `--tag` option is an error.
  
  sections: header, code, and grammar.  Anything else that is not
  excluded by the `--tag` option is an error.
  
@@ -2266,6 +2351,7 @@ parser with neither. "grammar" must be provided.
         struct code_node *hdr = NULL;
         struct code_node *code = NULL;
         struct code_node *gram = NULL;
         struct code_node *hdr = NULL;
         struct code_node *code = NULL;
         struct code_node *gram = NULL;
+       struct code_node *pre_reduce = NULL;
         for (s = table; s; s = s->next) {
                 struct text sec = s->section;
                 if (tag && !strip_tag(&sec, tag))
         for (s = table; s; s = s->next) {
                 struct text sec = s->section;
                 if (tag && !strip_tag(&sec, tag))
@@ -2276,6 +2362,8 @@ parser with neither. "grammar" must be provided.
                         code = s->code;
                 else if (text_is(sec, "grammar"))
                         gram = s->code;
                         code = s->code;
                 else if (text_is(sec, "grammar"))
                         gram = s->code;
+               else if (text_is(sec, "reduce"))
+                       pre_reduce = s->code;
                 else {
                         fprintf(stderr, "Unknown content section: %.*s\n",
                                 s->section.len, s->section.txt);
                 else {
                         fprintf(stderr, "Unknown content section: %.*s\n",
                                 s->section.len, s->section.txt);
@@ -2322,7 +2410,7 @@ the report finds conflicts we will exit with an error status.
                                 rv |= 1;
         }
  
                                 rv |= 1;
         }
  
-If a headers section is defined, we write it out and include a declaration
+If a "headers" section is defined, we write it out and include a declaration
  for the `parse_XX` function so it can be used from separate code.
  
         if (rv == 0 && hdr && outfile) {
  for the `parse_XX` function so it can be used from separate code.
  
         if (rv == 0 && hdr && outfile) {
@@ -2347,7 +2435,7 @@ file with the code section (if any) and the parser tables and function.
                 if (f) {
                         if (code)
                                 code_node_print(f, code, infile);
                 if (f) {
                         if (code)
                                 code_node_print(f, code, infile);
-                       gen_parser(f, g, infile, name);
+                       gen_parser(f, g, infile, name, pre_reduce);
                         fclose(f);
                 } else {
                         fprintf(stderr, "Cannot create %s.c\n",
                         fclose(f);
                 } else {
                         fprintf(stderr, "Cannot create %s.c\n",
@@ -2437,20 +2525,23 @@ The `state` is the most important one and guides the parsing process.  The
  freeing function.  The symbol leads us to the right free function through
  `do_free`.
  
  freeing function.  The symbol leads us to the right free function through
  `do_free`.
  
-The `indents` count tracks the line indents in the symbol.  These are
-used to allow indent information to guide parsing and error recovery.
+The `indents` count tracks the line indents with-in the symbol or
+immediately follow it.  These are used to allow indent information to
+guide parsing and error recovery.
  
  `since_newline` tracks how many stack frames since the last
  start-of-line (whether indented or not).  So if `since_newline` is
  
  `since_newline` tracks how many stack frames since the last
  start-of-line (whether indented or not).  So if `since_newline` is
-zero, then this symbol is at the start of a line.
+zero, then this symbol is at the start of a line.  Similarly
+`since_indent` counts the number of states since an indent, it is zero
+precisely when `indents` is not zero.
  
  `newline_permitted` keeps track of whether newlines should be ignored
  
  `newline_permitted` keeps track of whether newlines should be ignored
-or not, and `starts_line` records if this state stated on a newline.
+or not.
  
  The stack is most properly seen as alternating states and symbols -
  states, like the 'DOT' in items, are between symbols.  Each frame in
  our stack holds a state and the symbol that was before it.  The
  
  The stack is most properly seen as alternating states and symbols -
  states, like the 'DOT' in items, are between symbols.  Each frame in
  our stack holds a state and the symbol that was before it.  The
-bottom of stack holds the start state, but no symbol, as nothing came
+bottom of stack holds the start state but no symbol, as nothing came
  before the beginning.
  
  ###### parser functions
  before the beginning.
  
  ###### parser functions
@@ -2474,12 +2565,15 @@ before the beginning.
  
  Two operations are needed on the stack - shift (which is like push) and pop.
  
  
  Two operations are needed on the stack - shift (which is like push) and pop.
  
-Shift applies not only to terminals but also to non-terminals.  When we
-reduce a production we will pop off entries corresponding to the body
-symbols, then push on an item for the head of the production.  This last is
-exactly the same process as shifting in a terminal so we use the same
-function for both.  In both cases we provide a stack frame which
-contains the symbol to shift and related indent information.
+Shift applies not only to terminals but also to non-terminals.  When
+we reduce a production we will pop off entries corresponding to the
+body symbols, then push on an item for the head of the production.
+This last is exactly the same process as shifting in a terminal so we
+use the same function for both.  In both cases we provide the symbol,
+the number of indents the symbol contains (which will be zero for a
+terminal symbol) and a flag indicating the the symbol was at (or was
+reduced from a symbol which was at) the start of a line.  The state is
+deduced from the current top-of-stack state and the new symbol.
  
  To simplify other code we arrange for `shift` to fail if there is no `goto`
  state for the symbol.  This is useful in basic parsing due to our design
  
  To simplify other code we arrange for `shift` to fail if there is no `goto`
  state for the symbol.  This is useful in basic parsing due to our design
@@ -2489,28 +2583,26 @@ function reports if it could.
  `shift` is also used to push state zero onto the stack, so if the
  stack is empty, it always chooses zero as the next state.
  
  `shift` is also used to push state zero onto the stack, so if the
  stack is empty, it always chooses zero as the next state.
  
-So `shift` finds the next state.  If that succeed it extends the allocations
-if needed and pushes all the information onto the stacks.
-
-Newlines are permitted after a starts_line state until an internal
-indent.  So we need to find the topmost state which `starts_line` and
-see if there are any indents other than immediately after it.
-
-So we walk down:
+So `shift` finds the next state.  If that succeeds it extends the
+allocations if needed and pushes all the information onto the stacks.
  
  
--  if state starts_line, then newlines_permitted.
--  if any non-initial indents, newlines not permitted
+Newlines are permitted after a `starts_line` state until an internal
+indent.  If the new frame has neither a `starts_line` state nor an
+indent, newlines are permitted if the previous stack frame permitted
+them.
  
  ###### parser functions
  
  
  ###### parser functions
  
-       static int shift(struct parser *p, struct frame *next,
+       static int shift(struct parser *p,
+                        short sym, short indents, short start_of_line,
                          void *asn,
                          const struct state states[])
         {
                 // Push an entry onto the stack
                          void *asn,
                          const struct state states[])
         {
                 // Push an entry onto the stack
+               struct frame next = {0};
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
-                                next->sym)
+                                sym)
                         : 0;
                 if (newstate < 0)
                         return 0;
                         : 0;
                 if (newstate < 0)
                         return 0;
@@ -2521,31 +2613,33 @@ So we walk down:
                         p->asn_stack = realloc(p->asn_stack, p->stack_size
                                            * sizeof(p->asn_stack[0]));
                 }
                         p->asn_stack = realloc(p->asn_stack, p->stack_size
                                            * sizeof(p->asn_stack[0]));
                 }
-               next->state = newstate;
+               next.sym = sym;
+               next.indents = indents;
+               next.state = newstate;
                 if (states[newstate].starts_line)
                 if (states[newstate].starts_line)
-                       next->newline_permitted = 1;
-               else if (next->indents)
-                       next->newline_permitted = 0;
+                       next.newline_permitted = 1;
+               else if (indents)
+                       next.newline_permitted = 0;
                 else if (p->tos)
                 else if (p->tos)
-                       next->newline_permitted =
+                       next.newline_permitted =
                                 p->stack[p->tos-1].newline_permitted;
                 else
                                 p->stack[p->tos-1].newline_permitted;
                 else
-                       next->newline_permitted = 0;
+                       next.newline_permitted = 0;
  
  
-               if (next->since_newline) {
+               if (!start_of_line) {
                         if (p->tos)
                         if (p->tos)
-                               next->since_newline = p->stack[p->tos-1].since_newline + 1;
+                               next.since_newline = p->stack[p->tos-1].since_newline + 1;
                         else
                         else
-                               next->since_newline = 1;
+                               next.since_newline = 1;
                 }
                 }
-               if (next->indents)
-                       next->since_indent = 0;
+               if (indents)
+                       next.since_indent = 0;
                 else if (p->tos)
                 else if (p->tos)
-                       next->since_indent = p->stack[p->tos-1].since_indent + 1;
+                       next.since_indent = p->stack[p->tos-1].since_indent + 1;
                 else
                 else
-                       next->since_indent = 1;
+                       next.since_indent = 1;
  
  
-               p->stack[p->tos] = *next;
+               p->stack[p->tos] = next;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
                 return 1;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
                 return 1;
@@ -2553,27 +2647,30 @@ So we walk down:
  
  `pop` primarily moves the top of stack (`tos`) back down the required
  amount and frees any `asn` entries that need to be freed.  It also
  
  `pop` primarily moves the top of stack (`tos`) back down the required
  amount and frees any `asn` entries that need to be freed.  It also
-collects a summary of the indents in the symbols that are being
-removed. It is called _after_ we reduce a production, just before we
-`shift` the nonterminal in.
-
-`pop` is only called if there are entries to remove, so `num` is never zero.
+collects a summary of the indents and line starts in the symbols that
+are being removed. It is called _after_ we reduce a production, just
+before we `shift` the nonterminal in.
  
  ###### parser functions
  
  
  ###### parser functions
  
-       static void pop(struct parser *p, int num, struct frame *next,
-                       void(*do_free)(short sym, void *asn))
+       static int pop(struct parser *p, int num,
+                      short *start_of_line,
+                      void(*do_free)(short sym, void *asn))
         {
                 int i;
         {
                 int i;
+               short indents = 0;
+               int sol = 0;
+
                 p->tos -= num;
                 p->tos -= num;
-               next->since_newline =
-                       p->stack[p->tos].since_newline;
-               next->indents = 0;
                 for (i = 0; i < num; i++) {
                 for (i = 0; i < num; i++) {
-                       next->indents += p->stack[p->tos+i].indents;
+                       sol |= !p->stack[p->tos+i].since_newline;
+                       indents += p->stack[p->tos+i].indents;
                         do_free(p->stack[p->tos+i].sym,
                                 p->asn_stack[p->tos+i]);
                 }
                         do_free(p->stack[p->tos+i].sym,
                                 p->asn_stack[p->tos+i]);
                 }
+               if (start_of_line)
+                       *start_of_line = sol;
+               return indents;
         }
  
  ### Memory allocation
         }
  
  ### Memory allocation
@@ -2607,9 +2704,9 @@ copying, hence `memdup` and `tokcopy`.
  
  ### The heart of the parser.
  
  
  ### The heart of the parser.
  
-Now we have the parser.  If we can shift, we do, though newlines and
-reducing indenting may block that.  If not and we can reduce we do.
-If the production we reduced was production zero, then we have
+Now we have the parser.  If we can shift we do, though newlines and
+reducing indenting may block that.  If not and we can reduce we do
+that.  If the production we reduced was production zero, then we have
  accepted the input and can finish.
  
  We return whatever `asn` was returned by reducing production zero.
  accepted the input and can finish.
  
  We return whatever `asn` was returned by reducing production zero.
@@ -2622,20 +2719,70 @@ When we find `TK_in` and `TK_out` tokens which report indents we need
  to handle them directly as the grammar cannot express what we want to
  do with them.
  
  to handle them directly as the grammar cannot express what we want to
  do with them.
  
-`TK_in` tokens are easy: we simply update the `next` stack frame to
-record how many indents there are and that the next token started with
-an indent.
-
-`TK_out` tokens must either be counted off against any pending indent,
-or must force reductions until there is a pending indent which isn't
-at the start of a production.
-
-`TK_newline` tokens are ignored precisely if there has been an indent
-since the last state which could have been at the start of a line.
+`TK_in` tokens are easy: we simply update indent count in the top stack frame to
+record how many indents there are following the previous token.
+
+`TK_out` tokens must be canceled against an indent count
+within the stack.  If we can reduce some symbols that are all since
+the most recent indent, then we do that first.  If the minimum prefix
+of the current state then extends back before the most recent indent,
+that indent can be cancelled.  If the minimum prefix is shorter then
+the indent had ended prematurely and we must start error handling, which
+is still a work-in-progress.
+
+`TK_newline` tokens are ignored unless the top stack frame records
+that they are permitted.  In that case they will not be considered for
+shifting if it is possible to reduce some symbols that are all since
+the most recent start of line.  This is how a newline forcibly
+terminates any line-like structure - we try to reduce down to at most
+one symbol for each line where newlines are allowed.
+A consequence of this is that a rule like
+
+###### Example: newlines - broken
+
+       Newlines ->
+               | NEWLINE Newlines
+       IfStatement -> Newlines if ....
+
+cannot work, as the NEWLINE will never be shifted as the empty string
+will be reduced first.  Optional sets of newlines need to be include
+in the thing that preceed:
+
+###### Example: newlines - works
+
+       If -> if
+               | NEWLINE If
+       IfStatement -> If ....
+
+Here the NEWLINE will be shifted because nothing can be reduced until
+the `if` is seen.
+
+When, during error handling, we discard token read in, we want to keep
+discarding until we see one that is recognised.  If we had a full set
+of LR(1) grammar states, this will mean looking in the look-ahead set,
+but we don't keep a full look-ahead set.  We only record the subset
+that leads to SHIFT.  We can, however, deduce the look-ahead set but
+looking at the SHIFT subsets for all states that we can get to by
+reducing zero or more times.  So we need a little function which
+checks if a given token is in any of these look-ahead sets.
  
  ###### parser includes
         #include "parser.h"
  
  ###### parser includes
         #include "parser.h"
+
  ###### parser_run
  ###### parser_run
+
+       static int in_lookahead(struct token *tk, const struct state *states, int state)
+       {
+               while (state >= 0) {
+                       if (search(&states[state], tk->num) >= 0)
+                               return 1;
+                       if (states[state].reduce_prod < 0)
+                               return 0;
+                       state = search(&states[state], states[state].reduce_sym);
+               }
+               return 0;
+       }
+
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
@@ -2644,29 +2791,31 @@ since the last state which could have been at the start of a line.
                          struct token_config *config)
         {
                 struct parser p = { 0 };
                          struct token_config *config)
         {
                 struct parser p = { 0 };
-               struct frame next = { 0 };
                 struct token *tk = NULL;
                 int accepted = 0;
                 void *ret = NULL;
  
                 struct token *tk = NULL;
                 int accepted = 0;
                 void *ret = NULL;
  
-               shift(&p, &next, NULL, states);
+               shift(&p, TK_eof, 0, 1, NULL, states);
                 while (!accepted) {
                         struct token *err_tk;
                         struct frame *tos = &p.stack[p.tos-1];
                         if (!tk)
                                 tk = tok_copy(token_next(tokens));
                 while (!accepted) {
                         struct token *err_tk;
                         struct frame *tos = &p.stack[p.tos-1];
                         if (!tk)
                                 tk = tok_copy(token_next(tokens));
-                       next.sym = tk->num;
-                       parser_trace(trace, &p, &next, tk, states, non_term, config->known_count);
-
-                       if (next.sym == TK_in) {
-                               next.indents = 1;
-                               next.since_newline = 0;
+                       parser_trace(trace, &p,
+                                    tk, states, non_term, config->known_count);
+
+                       if (tk->num == TK_in) {
+                               tos->indents += 1;
+                               tos->since_newline = 0;
+                               tos->since_indent = 0;
+                               if (!states[tos->state].starts_line)
+                                       tos->newline_permitted = 0;
                                 free(tk);
                                 tk = NULL;
                                 parser_trace_action(trace, "Record");
                                 continue;
                         }
                                 free(tk);
                                 tk = NULL;
                                 parser_trace_action(trace, "Record");
                                 continue;
                         }
-                       if (next.sym == TK_out) {
+                       if (tk->num == TK_out) {
                                 if (states[tos->state].reduce_size >= 0 &&
                                     states[tos->state].reduce_size <= tos->since_indent)
                                         goto force_reduce;
                                 if (states[tos->state].reduce_size >= 0 &&
                                     states[tos->state].reduce_size <= tos->since_indent)
                                         goto force_reduce;
@@ -2684,7 +2833,7 @@ since the last state which could have been at the start of a line.
                                                         in->newline_permitted = 0;
                                                 }
                                                 if (states[in->state].starts_line)
                                                         in->newline_permitted = 0;
                                                 }
                                                 if (states[in->state].starts_line)
-                                                       in->newline_permitted = 0;
+                                                       in->newline_permitted = 1;
                                                 while (in < tos) {
                                                         in += 1;
                                                         in->since_indent = in[-1].since_indent + 1;
                                                 while (in < tos) {
                                                         in += 1;
                                                         in->since_indent = in[-1].since_indent + 1;
@@ -2699,29 +2848,34 @@ since the last state which could have been at the start of a line.
                                         parser_trace_action(trace, "Cancel");
                                         continue;
                                 }
                                         parser_trace_action(trace, "Cancel");
                                         continue;
                                 }
-                               // fall through and force a REDUCE (as 'shift'
-                               // will fail).
+                               // fall through to error handling as both SHIFT and REDUCE
+                               // will fail.
                         }
                         }
-                       if (next.sym == TK_newline) {
+                       if (tk->num == TK_newline) {
                                 if (!tos->newline_permitted) {
                                         free(tk);
                                         tk = NULL;
                                         parser_trace_action(trace, "Discard");
                                         continue;
                                 }
                                 if (!tos->newline_permitted) {
                                         free(tk);
                                         tk = NULL;
                                         parser_trace_action(trace, "Discard");
                                         continue;
                                 }
-                               if (states[tos->state].reduce_size > 0 &&
-                                   states[tos->state].reduce_size < tos->since_newline)
+                               if (tos->since_newline > 1 &&
+                                   states[tos->state].reduce_size >= 0 &&
+                                   states[tos->state].reduce_size <= tos->since_newline)
                                         goto force_reduce;
                         }
                                         goto force_reduce;
                         }
-                       if (shift(&p, &next, tk, states)) {
-                               next.since_newline = !(tk->num == TK_newline);
-                               next.indents = 0;
+                       if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
                                 tk = NULL;
                                 parser_trace_action(trace, "Shift");
                                 continue;
                         }
                 force_reduce:
                                 tk = NULL;
                                 parser_trace_action(trace, "Shift");
                                 continue;
                         }
                 force_reduce:
-                       if (states[tos->state].reduce_prod >= 0) {
+                       if (states[tos->state].reduce_prod >= 0 &&
+                           states[tos->state].newline_only &&
+                           tk->num != TK_newline && tk->num != TK_eof && tk->num != TK_out) {
+                               /* Anything other than newline in an error as this
+                                * production must end at EOL
+                                */
+                       } else if (states[tos->state].reduce_prod >= 0) {
                                 void **body;
                                 void *res;
                                 const struct state *nextstate = &states[tos->state];
                                 void **body;
                                 void *res;
                                 const struct state *nextstate = &states[tos->state];
@@ -2729,23 +2883,19 @@ since the last state which could have been at the start of a line.
                                 int size = nextstate->reduce_size;
                                 int bufsize;
                                 static char buf[16*1024];
                                 int size = nextstate->reduce_size;
                                 int bufsize;
                                 static char buf[16*1024];
-                               struct frame frame;
-                               frame.sym = nextstate->reduce_sym;
+                               short indents, start_of_line;
  
                                 body = p.asn_stack + (p.tos - size);
  
                                 bufsize = do_reduce(prod, body, config, buf);
  
  
                                 body = p.asn_stack + (p.tos - size);
  
                                 bufsize = do_reduce(prod, body, config, buf);
  
-                               if (size)
-                                       pop(&p, size, &frame, do_free);
-                               else {
-                                       frame.indents = next.indents;
-                                       frame.since_newline = 1;
-                                       next.indents = 0;
-                               }
+                               indents = pop(&p, size, &start_of_line,
+                                             do_free);
                                 res = memdup(buf, bufsize);
                                 memset(buf, 0, bufsize);
                                 res = memdup(buf, bufsize);
                                 memset(buf, 0, bufsize);
-                               if (!shift(&p, &frame, res, states)) {
+                               if (!shift(&p, nextstate->reduce_sym,
+                                          indents, start_of_line,
+                                          res, states)) {
                                         if (prod != 0) abort();
                                         accepted = 1;
                                         ret = res;
                                         if (prod != 0) abort();
                                         accepted = 1;
                                         ret = res;
@@ -2753,18 +2903,6 @@ since the last state which could have been at the start of a line.
                                 parser_trace_action(trace, "Reduce");
                                 continue;
                         }
                                 parser_trace_action(trace, "Reduce");
                                 continue;
                         }
-                       if (tk->num == TK_out) {
-                               // Indent problem - synthesise tokens to get us
-                               // out of here.
-                               struct frame frame = { 0 };
-                               fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
-                               frame.sym = states[tos->state].shift_sym;
-                               frame.since_newline = 1;
-                               shift(&p, &frame, tok_copy(*tk), states);
-                               // FIXME need to report this error somehow
-                               parser_trace_action(trace, "Synthesize");
-                               continue;
-                       }
                         /* Error. We walk up the stack until we
                          * find a state which will accept TK_error.
                          * We then shift in TK_error and see what state
                         /* Error. We walk up the stack until we
                          * find a state which will accept TK_error.
                          * We then shift in TK_error and see what state
@@ -2773,38 +2911,37 @@ since the last state which could have been at the start of a line.
                          * we find one that is acceptable.
                          */
                         parser_trace_action(trace, "ERROR");
                          * we find one that is acceptable.
                          */
                         parser_trace_action(trace, "ERROR");
+                       short indents = 0, start_of_line;
  
                         err_tk = tok_copy(*tk);
  
                         err_tk = tok_copy(*tk);
-                       next.sym = TK_error;
-                       while (shift(&p, &next, err_tk, states) == 0
-                              && p.tos > 0)
+                       while (p.tos > 0 &&
+                              shift(&p, TK_error, 0, 0,
+                                    err_tk, states) == 0)
                                 // discard this state
                                 // discard this state
-                               pop(&p, 1, &next, do_free);
+                               indents += pop(&p, 1, &start_of_line, do_free);
                         if (p.tos == 0) {
                                 free(err_tk);
                                 // no state accepted TK_error
                                 break;
                         }
                         tos = &p.stack[p.tos-1];
                         if (p.tos == 0) {
                                 free(err_tk);
                                 // no state accepted TK_error
                                 break;
                         }
                         tos = &p.stack[p.tos-1];
-                       while (search(&states[tos->state], tk->num) < 0 &&
+                       while (!in_lookahead(tk, states, tos->state) &&
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
                                 if (tk->num == TK_in)
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
                                 if (tk->num == TK_in)
-                                       next.indents += 1;
+                                       indents += 1;
                                 if (tk->num == TK_out) {
                                 if (tk->num == TK_out) {
-                                       if (next.indents == 0)
+                                       if (indents == 0)
                                                 break;
                                                 break;
-                                       next.indents -= 1;
+                                       indents -= 1;
                                         // FIXME update since_indent here
                                 }
                         }
                                         // FIXME update since_indent here
                                 }
                         }
-                       if (p.tos == 0 && tk->num == TK_eof)
-                               break;
+                       tos->indents += indents;
                 }
                 free(tk);
                 }
                 free(tk);
-               if (p.tos)
-                       pop(&p, p.tos, &next, do_free);
+               pop(&p, p.tos, NULL, do_free);
                 free(p.asn_stack);
                 free(p.stack);
                 return ret;
                 free(p.asn_stack);
                 free(p.stack);
                 return ret;
@@ -2850,11 +2987,11 @@ end inside square brackets.
                 if (states[f->state].starts_line)
                         fprintf(trace, "s");
                 if (f->newline_permitted)
                 if (states[f->state].starts_line)
                         fprintf(trace, "s");
                 if (f->newline_permitted)
-                       fprintf(trace, "n%d", f->newline_permitted);
+                       fprintf(trace, "n%d", f->since_newline);
                 fprintf(trace, ") ");
         }
  
                 fprintf(trace, ") ");
         }
  
-       void parser_trace(FILE *trace, struct parser *p, struct frame *n,
+       void parser_trace(FILE *trace, struct parser *p,
                           struct token *tk, const struct state states[],
                           const char *non_term[], int knowns)
         {
                           struct token *tk, const struct state states[],
                           const char *non_term[], int knowns)
         {
@@ -2888,10 +3025,6 @@ end inside square brackets.
                         fputs(reserved_words[tk->num], trace);
                 else
                         text_dump(trace, tk->txt, 20);
                         fputs(reserved_words[tk->num], trace);
                 else
                         text_dump(trace, tk->txt, 20);
-               if (n->indents)
-                       fprintf(trace, ".%d", n->indents);
-               if (n->since_newline == 0)
-                       fputs("/", trace);
                 fputs("]", trace);
         }
  
                 fputs("]", trace);
         }
  
@@ -2908,7 +3041,7 @@ The obvious example for a parser is a calculator.
  As `scanner` provides number parsing function using `libgmp` is it not much
  work to perform arbitrary rational number calculations.
  
  As `scanner` provides number parsing function using `libgmp` is it not much
  work to perform arbitrary rational number calculations.
  
-This calculator takes one expression, or an equality test per line.  The
+This calculator takes one expression, or an equality test, per line.  The
  results are printed and if any equality test fails, the program exits with
  an error.
  
  results are printed and if any equality test fails, the program exits with
  an error.
  
@@ -2917,6 +3050,9 @@ an error.
                 ./parsergen --tag calc -o calc parsergen.mdc
         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
                 ./parsergen --tag calc -o calc parsergen.mdc
         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+       calctest : calc
+               ./calc parsergen.mdc
+       demos :: calctest
  
  # calc: header
  
  
  # calc: header
  
@@ -2937,6 +3073,7 @@ an error.
         #include <stdio.h>
         #include <malloc.h>
         #include <gmp.h>
         #include <stdio.h>
         #include <malloc.h>
         #include <gmp.h>
+       #include <string.h>
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
@@ -2950,12 +3087,19 @@ an error.
                 free(n);
         }
  
                 free(n);
         }
  
+       static int text_is(struct text t, char *s)
+       {
+               return (strlen(s) == t.len &&
+                       strncmp(s, t.txt, t.len) == 0);
+       }
+
         int main(int argc, char *argv[])
         {
                 int fd = open(argv[1], O_RDONLY);
                 int len = lseek(fd, 0, 2);
                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
         int main(int argc, char *argv[])
         {
                 int fd = open(argv[1], O_RDONLY);
                 int len = lseek(fd, 0, 2);
                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
-               struct section *s = code_extract(file, file+len, NULL);
+               struct section *table = code_extract(file, file+len, NULL);
+               struct section *s;
                 struct token_config config = {
                         .ignored = (1 << TK_line_comment)
                                  | (1 << TK_block_comment)
                 struct token_config config = {
                         .ignored = (1 << TK_line_comment)
                                  | (1 << TK_block_comment)
@@ -2965,18 +3109,23 @@ an error.
                         .word_start = "",
                         .word_cont = "",
                 };
                         .word_start = "",
                         .word_cont = "",
                 };
-               parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
-               while (s) {
-                       struct section *t = s->next;
-                       code_free(s->code);
-                       free(s);
-                       s = t;
+               for (s = table; s; s = s->next)
+                       if (text_is(s->section, "example: input"))
+                               parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+               while (table) {
+                       struct section *t = table->next;
+                       code_free(table->code);
+                       free(table);
+                       table = t;
                 }
                 exit(0);
         }
  
  # calc: grammar
  
                 }
                 exit(0);
         }
  
  # calc: grammar
  
+       $LEFT + -
+       $LEFT * /
+
         Session -> Session Line
                 | Line
  
         Session -> Session Line
                 | Line
  
@@ -2999,13 +3148,20 @@ an error.
                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
  
         $number
                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
  
         $number
-       Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
-               | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
-               | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+       Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
+               | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
+               | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
+               | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
+               | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
  
  
-       Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
-               | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
-               | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+# example: input
  
  
-       Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
-               | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+       355/113
+       3.1415926535 - 355/113
+       2 + 4 * 5
+       1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
+       10 * 9 / 2
+       1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+
+       error