parsergen: add more power to symbol references in generated code

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index d12a4000d3712f6a44eaf486a284793073e1abf8..742996e16222854cecedd235f9449987cc6c5e08 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -100,6 +100,7 @@ symbol.
         struct production {
                 unsigned short precedence;
                 enum assoc assoc;
+               char line_like;
                 ## production fields
         };
         struct grammar {
@@ -150,11 +151,17 @@ those which don't.  There are also "virtual" symbols used for precedence
  marking discussed later, and sometimes we won't know what type a symbol
  is yet.
  
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
  ###### forward declarations
         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
         char *symtypes = "UVTN";
  ###### symbol fields
         enum symtype type;
+###### grammar fields
+       int terminals_declared;
  
  Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
  table of known symbols and the resulting parser will report them as
@@ -240,9 +247,10 @@ symbol, but its type might be `Unknown`.
  
  ### Data types and precedence.
  
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line.  If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` the the line declares terminals without
  precedence, otherwise it specifies a data type.
  
  The data type name is simply stored and applied to the head of all
@@ -295,6 +303,7 @@ Subsequent lines introduce symbols with higher precedence.
                 struct token t = token_next(ts);
                 char *err;
                 enum assoc assoc;
+               int term = 0;
                 int found;
  
                 if (t.num != TK_ident) {
@@ -307,7 +316,10 @@ Subsequent lines introduce symbols with higher precedence.
                         assoc = Right;
                 else if (text_is(t.txt, "NON"))
                         assoc = Non;
-               else {
+               else if (text_is(t.txt, "TERM")) {
+                       term = 1;
+                       g->terminals_declared = 1;
+               } else {
                         g->current_type = t.txt;
                         g->type_isref = isref;
                         if (text_is(t.txt, "void"))
@@ -325,7 +337,7 @@ Subsequent lines introduce symbols with higher precedence.
                         goto abort;
                 }
  
-               // This is a precedence line, need some symbols.
+               // This is a precedence or TERM line, need some symbols.
                 found = 0;
                 g->prec_levels += 1;
                 t = token_next(ts);
@@ -339,6 +351,10 @@ Subsequent lines introduce symbols with higher precedence.
                                         err = "$$ must be followed by a word";
                                         goto abort;
                                 }
+                               if (term) {
+                                       err = "Virtual symbols not permitted on $TERM line";
+                                       goto abort;
+                               }
                         } else if (t.num != TK_ident &&
                                    t.num != TK_mark) {
                                 err = "Illegal token in precedence line";
@@ -346,17 +362,19 @@ Subsequent lines introduce symbols with higher precedence.
                         }
                         s = sym_find(g, t.txt);
                         if (s->type != Unknown) {
-                               err = "Symbols in precedence line must not already be known.";
+                               err = "Symbols in precedence/TERM line must not already be known.";
                                 goto abort;
                         }
                         s->type = type;
-                       s->precedence = g->prec_levels;
-                       s->assoc = assoc;
+                       if (!term) {
+                               s->precedence = g->prec_levels;
+                               s->assoc = assoc;
+                       }
                         found += 1;
                         t = token_next(ts);
                 }
                 if (found == 0)
-                       err = "No symbols given on precedence line";
+                       err = "No symbols given on precedence/TERM line";
                         goto abort;
                 return NULL;
         abort:
@@ -390,14 +408,30 @@ be in one `code_node` of the literate code.  The `}$` must be
  at the end of a line.
  
  Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc.  The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
+`$<N`,for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production.  `$0` is the head of the
+production, `$1` is the first symbol of the body, etc.  The type of `$N`
+for a terminal symbol is `struct token`.  For a non-terminal, it is
+whatever has been declared for that symbol.  The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed.  The effect of using '<' is that the
+variable is cleareed to all-zeros.
+
+Symbols that are left-recursive are a little special.  These are symbols
+that both the head of a production and the first body symbol of the same
+production.  They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure.  In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended.  When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended.  So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
  
  While building productions we will need to add to an array which needs to
  grow dynamically.
@@ -459,6 +493,7 @@ Now we have all the bits we need to parse a full production.
  
  ###### symbol fields
         int first_production;
+       int left_recursive;
  
  ###### functions
         static char *parse_production(struct grammar *g,
@@ -475,8 +510,10 @@ Now we have all the bits we need to parse a full production.
                 tk = token_next(state);
                 while (tk.num == TK_ident || tk.num == TK_mark) {
                         struct symbol *bs = sym_find(g, tk.txt);
-                       if (bs->type == Unknown)
-                               bs->type = Terminal;
+                       if (bs->type == Unknown) {
+                               if (!g->terminals_declared)
+                                       bs->type = Terminal;
+                       }
                         if (bs->type == Virtual) {
                                 err = "Virtual symbol not permitted in production";
                                 goto abort;
@@ -496,12 +533,17 @@ Now we have all the bits we need to parse a full production.
                                 goto abort;
                         }
                         vs = sym_find(g, tk.txt);
-                       if (vs->type != Virtual) {
-                               err = "symbol after $$ must be virtual";
+                       if (vs->num == TK_newline)
+                               p.line_like = 1;
+                       else if (vs->num == TK_out)
+                               p.line_like = 2;
+                       else if (vs->precedence == 0) {
+                               err = "symbol after $$ must have precedence";
                                 goto abort;
+                       } else {
+                               p.precedence = vs->precedence;
+                               p.assoc = vs->assoc;
                         }
-                       p.precedence = vs->precedence;
-                       p.assoc = vs->assoc;
                         tk = token_next(state);
                 }
                 if (tk.num == TK_open) {
@@ -513,6 +555,11 @@ Now we have all the bits we need to parse a full production.
                         }
                         tk = token_next(state);
                 }
+               if (p.body_size >= 2 &&
+                   p.body[0] == p.head &&
+                   p.body[1]->num != TK_newline)
+                       p.head->left_recursive = 1;
+
                 if (tk.num != TK_newline && tk.num != TK_eof) {
                         err = "stray tokens at end of line";
                         goto abort;
@@ -630,6 +677,11 @@ to produce errors that the parser is better positioned to handle.
                         } else if (tk.num == TK_mark
                                    && text_is(tk.txt, "$*")) {
                                 err = dollar_line(state, g, 1);
+                       } else if (tk.num == TK_mark
+                                  && text_is(tk.txt, "//")) {
+                               while (tk.num != TK_newline &&
+                                      tk.num != TK_eof)
+                                       tk = token_next(state);
                         } else {
                                 err = "Unrecognised token at start of line.";
                         }
@@ -637,6 +689,21 @@ to produce errors that the parser is better positioned to handle.
                                 goto abort;
                 }
                 token_close(state);
+               if (g->terminals_declared) {
+                       struct symbol *s;
+                       int errs = 0;
+                       for (s = g->syms; s; s = s->next) {
+                               if (s->type != Unknown)
+                                       continue;
+                               errs += 1;
+                               fprintf(stderr, "Token %.*s not declared\n",
+                                       s->name.len, s->name.txt);
+                       }
+                       if (errs) {
+                               free(g);
+                               g = NULL;
+                       }
+               }
                 return g;
         abort:
                 fprintf(stderr, "Error at line %d: %s\n",
@@ -1305,6 +1372,8 @@ into the go to set, so the item is ineffective.
                 struct symbol *s;
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
+               struct symset LAnl = INIT_SYMSET;
+               unsigned short snnl = 0;
  
                 if (is->min_prefix == 0 ||
                     (bs > 0 && bs < is->min_prefix))
@@ -1326,11 +1395,11 @@ into the go to set, so the item is ineffective.
                         continue;
                 if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
-                       if (s->line_like)
-                               is->starts_line = 1;
                 }
                 if (s->type != Nonterminal)
                         continue;
+               if (s->line_like)
+                       is->starts_line = 1;
                 again = 1;
                 if (type >= LALR) {
                         // Need the LA set.
@@ -1342,6 +1411,10 @@ into the go to set, so the item is ineffective.
                         }
                         sn = save_set(g, LA);
                         LA = set_find(g, sn);
+                       if (symset_find(&LA, TK_newline))
+                               symset_add(&LAnl, TK_newline, 0);
+                       snnl = save_set(g, LAnl);
+                       LAnl = set_find(g, snnl);
                 }
  
                 /* Add productions for this symbol */
@@ -1352,19 +1425,25 @@ into the go to set, so the item is ineffective.
                         int itm = item_num(p2, 0);
                         int pos = symset_find(&is->items, itm);
                         if (pos < 0) {
-                               symset_add(&is->items, itm, sn);
+                               if (g->productions[p2]->line_like)
+                                       symset_add(&is->items, itm, snnl);
+                               else
+                                       symset_add(&is->items, itm, sn);
                                 /* Will have re-ordered, so start
                                  * from beginning again */
                                 i = -1;
                         } else if (type >= LALR) {
                                 struct symset ss = set_find(g, is->items.data[pos]);
                                 struct symset tmp = INIT_SYMSET;
+                               struct symset *la = &LA;
  
+                               if (g->productions[p2]->line_like)
+                                       la = &LAnl;
                                 symset_union(&tmp, &ss);
-                               if (symset_union(&tmp, &LA)) {
+                               if (symset_union(&tmp, la)) {
                                         is->items.data[pos] = save_set(g, tmp);
                                         i = -1;
-                               }else
+                               } else
                                         symset_free(tmp);
                         }
                 }
@@ -1501,6 +1580,11 @@ changeover point in `first_nonterm`.
                                 snum++;
                         }
                 g->first_nonterm = snum;
+               for (s = g->syms; s; s = s->next)
+                       if (s->num < 0 && s->type != Virtual) {
+                               s->num = snum;
+                               snum++;
+                       }
                 for (s = g->syms; s; s = s->next)
                         if (s->num < 0) {
                                 s->num = snum;
@@ -1539,7 +1623,7 @@ all the tables that have been generated, plus a description of any conflicts.
                 if (g->follow)
                         report_follow(g);
                 report_itemsets(g);
-               return report_conflicts(g, type);
+               return report_conflicts(g, type) + report_left_recursive(g);
         }
  
  Firstly we have the complete list of symbols, together with the
@@ -1635,6 +1719,10 @@ it up a bit.  First the items, with production number and associativity.
                         printf(" [%d%s]", s->precedence,
                                assoc_names[s->assoc]);
                 }
+               if (pr->line_like == 1)
+                       printf(" $$NEWLINE");
+               else if (pr->line_like)
+                       printf(" $$OUT");
                 printf("\n");
         }
  
@@ -1773,14 +1861,10 @@ terminals to items where that terminal could be shifted and another
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
-As a special case, if we find a SHIFT/REDUCE conflict, where a
-terminal that could be shifted is in the lookahead set of some
-reducable item, then set check if the reducable item also have
-`TK_newline` in its lookahead set.  If it does, then a newline will
-force the reduction, but anything else can reasonably be shifted, so
-that isn't really a conflict.  Such apparent conflicts do not get
-counted, and are reported as non-critical.  This will not affect a
-"traditional" grammar that does not include newlines as token.
+As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
+terminal, we ignore it.  NEWLINES are handled specially with its own
+rules for when to shift and when to reduce.  Conflicts are expected,
+but handled internally.
  
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
@@ -1800,14 +1884,20 @@ counted, and are reported as non-critical.  This will not affect a
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
+                               struct symbol *s;
  
-                               if (bp < pr->body_size &&
-                                   pr->body[bp]->type == Terminal) {
-                                       /* shiftable */
-                                       int sym = pr->body[bp]->num;
-                                       if (symset_find(&shifts, sym) < 0)
-                                               symset_add(&shifts, sym, itm);
-                               }
+                               if (bp >= pr->body_size ||
+                                   pr->body[bp]->type != Terminal)
+                                       /* not shiftable */
+                                       continue;
+
+                               s = pr->body[bp];
+                               if (s->precedence && is->precedence)
+                                       /* Precedence resolves this, so no conflict */
+                                       continue;
+
+                               if (symset_find(&shifts, s->num) < 0)
+                                       symset_add(&shifts, s->num, itm);
                         }
                         /* Now look for reductions and conflicts */
                         for (j = 0; j < is->items.cnt; j++) {
@@ -1827,13 +1917,10 @@ counted, and are reported as non-critical.  This will not affect a
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
-                                       if (pos >= 0) {
-                                               if (symset_find(&la, TK_newline) < 0) {
-                                                       printf("  State %d has SHIFT/REDUCE conflict on ", i);
-                                                       cnt++;
-                                               } else
-                                                       printf("  State %d has non-critical SHIFT/REDUCE conflict on ", i);
-                                               prtxt(g->symtab[la.syms[k]]->name);
+                                       if (pos >= 0 && la.syms[k] != TK_newline) {
+                                               printf("  State %d has SHIFT/REDUCE conflict on ", i);
+                                               cnt++;
+                                                       prtxt(g->symtab[la.syms[k]]->name);
                                                 printf(":\n");
                                                 report_item(g, shifts.data[pos]);
                                                 report_item(g, itm);
@@ -1857,6 +1944,43 @@ counted, and are reported as non-critical.  This will not affect a
                 return cnt;
         }
  
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production.  So we need to
+report these when asked.
+
+###### functions
+
+       static int report_left_recursive(struct grammar *g)
+       {
+               int p;
+               int bad_left_recursive = 0;
+
+               for (p = 0; p < g->production_count; p++) {
+                       struct production *pr = g->productions[p];
+                       int sn;
+
+                       for (sn = 0; sn < pr->body_size - 1; sn++) {
+                               struct symbol *s = pr->body[sn];
+
+                               if (s->left_recursive == 1 &&
+                                   s != pr->head) {
+                                       if (!bad_left_recursive) {
+                                               bad_left_recursive = 1;
+                                               printf("Misplaced left recursive symbols.\n");
+                                       }
+                                       printf("  ");
+                                       prtxt(s->name);
+                                       printf(" in production [%d]\n", p);
+                                       s->left_recursive = 2;
+                               }
+                       }
+               }
+               return bad_left_recursive;
+       }
+
  ## Generating the parser
  
  The exported part of the parser is the `parse_XX` function, where the name
@@ -1890,7 +2014,6 @@ pieces of code provided in the grammar file, so they are generated first.
                 fprintf(f, "\tstruct token_state *tokens;\n");
                 fprintf(f, "\tconfig->words_marks = known;\n");
                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-               fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                 fprintf(f, "\ttokens = token_open(code, config);\n");
                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                 fprintf(f, "\ttoken_close(tokens);\n");
@@ -1928,7 +2051,7 @@ numbers right.
                 for (i = TK_reserved;
                      i < g->num_syms;
                      i++)
-                       if (g->symtab[i]->type != Terminal)
+                       if (g->symtab[i]->type == Nonterminal)
                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                         g->symtab[i]->name.txt);
                 fprintf(f, "};\n\n");
@@ -1958,7 +2081,8 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short reduce_prod;
                 short reduce_size;
                 short reduce_sym;
-               short starts_line;
+               char starts_line;
+               char newline_only;
                 short min_prefix;
         };
  
@@ -2007,13 +2131,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                         }
  
                         if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
                                         i, is->go_to.cnt, i, prod,
                                         g->productions[prod]->body_size,
                                         g->productions[prod]->head->num,
-                                       is->starts_line, is->min_prefix);
+                                       is->starts_line,
+                                       g->productions[prod]->line_like,
+                                       is->min_prefix);
                         else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
                                         i, is->go_to.cnt, i,
                                         is->starts_line, is->min_prefix);
                 }
@@ -2044,13 +2170,105 @@ structure returned by a previous reduction.  These pointers need to be cast
  to the appropriate type for each access.  All this is handled in
  `gen_code`.
  
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed.  This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'.  This is particularly useful for references (or pointers), but
+can be used with structures too.  The `<` implies that the value it
+being moved out, so the object will not be automatically freed.  It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number.  A number by itself (other than zero) selects a
+symbol from the body of the production.  A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order.  If a number follows the letters, then a later occurrence of
+that symbol is chosen.  So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`.  If there is no unique shortest system, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
  
  ###### functions
  
+       static int textchr(struct text t, char c, int s)
+       {
+               int i;
+               for (i = s; i < t.len; i++)
+                       if (t.txt[i] == c)
+                               return i;
+               return -1;
+       }
+
+       static int subseq_match(char *seq, int slen, struct text name)
+       {
+               int st = 0;
+               while (slen > 0) {
+                       st = textchr(name, *seq, st);
+                       if (st < 0)
+                               return 0;
+                       slen -= 1;
+                       seq += 1;
+                       st += 1;
+               }
+               return 1;
+       }
+
+       static int choose_sym(char **namep, int len, struct production *p)
+       {
+               char *name = *namep;
+               char *nam = name;
+               int namlen;
+               int n = 0;
+               int i, s, slen;
+               char c;
+
+               c = *name;
+               while (len > 0 &&
+                      ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
+                       name += 1;
+                       len -= 1;
+                       c = *name;
+               }
+               namlen = name-nam;
+               while (len > 0 && (c >= '0' && c <= '9')) {
+                       name += 1;
+                       len -= 1;
+                       n = n * 10 + (c - '0');
+                       c = *name;
+               }
+               if (namlen == 0) {
+                       if (name == *namep)
+                               return -1;
+                       *namep = name;
+                       return n;
+               }
+               slen = 0; s = -1;
+               for (i = 0; i < p->body_size; i++) {
+                       if (!subseq_match(nam, namlen, p->body[i]->name))
+                               continue;
+                       if (slen == 0 || p->body[i]->name.len < slen)
+                               s = i;
+                       if (s >= 0 && p->body[i] != p->body[s] &&
+                           p->body[i]->name.len == p->body[s]->name.len)
+                               /* not unique, so s cannot be used */
+                               s = -1;
+               }
+               if (s < 0)
+                       return -1;
+               if (n == 0);
+                       n = 1;
+               for (i = 0; i < p->body_size; i++)
+                       if (p->body[i] == p->body[s]) {
+                               n -= 1;
+                               if (n == 0)
+                                       break;
+                       }
+               if (n > 1)
+                       return -1;
+               *namep = name;
+               return i + 1;
+       }
+
         static void gen_code(struct production *p, FILE *f, struct grammar *g)
         {
                 char *c;
@@ -2072,24 +2290,19 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                 use = 1;
                                 c++;
                         }
-                       if (*c < '0' || *c > '9') {
+                       n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+                       if (n < 0) {
+                               fputc('$', f);
                                 if (use)
                                         fputc('<', f);
                                 fputc(*c, f);
                                 continue;
                         }
-                       n = *c - '0';
-                       while (c[1] >= '0' && c[1] <= '9') {
-                               c += 1;
-                               n = n * 10 + *c - '0';
-                       }
                         if (n == 0)
                                 fprintf(f, "(*(struct %.*s*%s)ret)",
                                         p->head->struct_name.len,
                                         p->head->struct_name.txt,
                                         p->head->isref ? "*":"");
-                       else if (n > p->body_size)
-                               fprintf(f, "$%d", n);
                         else if (p->body[n-1]->type == Terminal)
                                 fprintf(f, "(*(struct token *)body[%d])",
                                         n-1);
@@ -2102,14 +2315,18 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                         p->body[n-1]->isref ? "*":"", n-1);
                                 used[n-1] = use;
                         }
+                       c -= 1;
                 }
                 fputs("\n", f);
                 for (i = 0; i < p->body_size; i++) {
                         if (p->body[i]->struct_name.txt &&
-                           p->body[i]->isref &&
-                           used[i])
+                           used[i]) {
                                 // assume this has been copied out
-                               fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+                               if (p->body[i]->isref)
+                                       fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+                               else
+                                       fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
+                       }
                 }
                 free(used);
         }
@@ -2685,8 +2902,12 @@ accepted the input and can finish.
  We return whatever `asn` was returned by reducing production zero.
  
  If we can neither shift nor reduce we have an error to handle.  We pop
-single entries off the stack until we can shift the `TK_error` symbol, then
-drop input tokens until we find one we can shift into the new error state.
+single entries off the stack until we can shift the `TK_error` symbol,
+then drop input tokens until we find one we can shift into the new error
+state.  We need to ensure that something is dropped or shifted after an
+error, or we could get into an infinite loop, only shifting in
+`TK_error`, then reducing.  So we track if there has been a shift since
+the last error, and if not the next error always discards one token.
  
  When we find `TK_in` and `TK_out` tokens which report indents we need
  to handle them directly as the grammar cannot express what we want to
@@ -2730,11 +2951,11 @@ in the thing that preceed:
  Here the NEWLINE will be shifted because nothing can be reduced until
  the `if` is seen.
  
-When, during error handling, we discard token read in, we want to keep
+When during error handling we discard tokens read in, we want to keep
  discarding until we see one that is recognised.  If we had a full set
-of LR(1) grammar states, this will mean looking in the look-ahead set,
+of LR(1) grammar states, this would mean looking in the look-ahead set,
  but we don't keep a full look-ahead set.  We only record the subset
-that leads to SHIFT.  We can, however, deduce the look-ahead set but
+that leads to SHIFT.  We can, however, deduce the look-ahead set by
  looking at the SHIFT subsets for all states that we can get to by
  reducing zero or more times.  So we need a little function which
  checks if a given token is in any of these look-ahead sets.
@@ -2766,6 +2987,7 @@ checks if a given token is in any of these look-ahead sets.
                 struct parser p = { 0 };
                 struct token *tk = NULL;
                 int accepted = 0;
+               int shift_since_err = 1;
                 void *ret = NULL;
  
                 shift(&p, TK_eof, 0, 1, NULL, states);
@@ -2837,12 +3059,23 @@ checks if a given token is in any of these look-ahead sets.
                                         goto force_reduce;
                         }
                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
+                               shift_since_err = 1;
                                 tk = NULL;
                                 parser_trace_action(trace, "Shift");
                                 continue;
                         }
                 force_reduce:
-                       if (states[tos->state].reduce_prod >= 0) {
+                       if (states[tos->state].reduce_prod >= 0 &&
+                           states[tos->state].newline_only &&
+                           !(tk->num == TK_newline ||
+                             tk->num == TK_eof ||
+                             tk->num == TK_out ||
+                             (tos->indents == 0 && tos->since_newline == 0))) {
+                               /* Anything other than newline or out or eof
+                                * in an error unless we are already at start
+                                * of line, as this production must end at EOL.
+                                */
+                       } else if (states[tos->state].reduce_prod >= 0) {
                                 void **body;
                                 void *res;
                                 const struct state *nextstate = &states[tos->state];
@@ -2891,11 +3124,22 @@ checks if a given token is in any of these look-ahead sets.
                                 // no state accepted TK_error
                                 break;
                         }
+                       if (!shift_since_err) {
+                               /* must discard at least one token to avoid
+                                * infinite loop.
+                                */
+                               if (tk->num == TK_eof)
+                                       break;
+                               free(tk);
+                               tk = tok_copy(token_next(tokens));
+                       }
+                       shift_since_err = 0;
                         tos = &p.stack[p.tos-1];
                         while (!in_lookahead(tk, states, tos->state) &&
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
+                               shift_since_err = 1;
                                 if (tk->num == TK_in)
                                         indents += 1;
                                 if (tk->num == TK_out) {
@@ -2992,7 +3236,7 @@ end inside square brackets.
                         fputs(reserved_words[tk->num], trace);
                 else
                         text_dump(trace, tk->txt, 20);
-               fputs("]", trace);
+               fprintf(trace, ":%d:%d]", tk->line, tk->col);
         }
  
         void parser_trace_action(FILE *trace, char *action)
@@ -3023,7 +3267,7 @@ an error.
  
  # calc: header
  
-       #include "number.h"
+       #include "parse_number.h"
         // what do we use for a demo-grammar?  A calculator of course.
         struct number {
                 mpq_t val;
@@ -3043,7 +3287,6 @@ an error.
         #include <string.h>
         #include "mdcode.h"
         #include "scanner.h"
-       #include "number.h"
         #include "parser.h"
  
         #include "calc.h"
@@ -3069,7 +3312,6 @@ an error.
                 struct section *s;
                 struct token_config config = {
                         .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment)
                                  | (1 << TK_in)
                                  | (1 << TK_out),
                         .number_chars = ".,_+-",
@@ -3091,7 +3333,7 @@ an error.
  # calc: grammar
  
         $LEFT + -
-       $LEFT * /
+       $LEFT * / //
  
         Session -> Session Line
                 | Line
@@ -3119,6 +3361,16 @@ an error.
                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | Expression // Expression ${ {
+                       mpz_t z0, z1, z2;
+                       mpq_init($0.val);
+                       mpz_init(z0); mpz_init(z1); mpz_init(z2);
+                       mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+                       mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+                       mpz_tdiv_q(z0, z1, z2);
+                       mpq_set_z($0.val, z0);
+                       mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+               } }$
                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
  
@@ -3131,4 +3383,6 @@ an error.
         10 * 9 / 2
         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
  
+       355//113
+
         error