]> ocean-lang.org Git - ocean/blobdiff - csrc/parsergen.mdc
parsergen.mdc: remove tracking of left-recursive symbols.
[ocean] / csrc / parsergen.mdc
index d30275e4e4d383a22905acd1e415c72d89fc56de..2da1d65bb7b2f962af024ef6b7d7f20a9e2e1f3e 100644 (file)
@@ -100,6 +100,7 @@ symbol.
        struct production {
                unsigned short precedence;
                enum assoc assoc;
        struct production {
                unsigned short precedence;
                enum assoc assoc;
+               char line_like;
                ## production fields
        };
        struct grammar {
                ## production fields
        };
        struct grammar {
@@ -150,11 +151,17 @@ those which don't.  There are also "virtual" symbols used for precedence
 marking discussed later, and sometimes we won't know what type a symbol
 is yet.
 
 marking discussed later, and sometimes we won't know what type a symbol
 is yet.
 
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
 ###### forward declarations
        enum symtype { Unknown, Virtual, Terminal, Nonterminal };
        char *symtypes = "UVTN";
 ###### symbol fields
        enum symtype type;
 ###### forward declarations
        enum symtype { Unknown, Virtual, Terminal, Nonterminal };
        char *symtypes = "UVTN";
 ###### symbol fields
        enum symtype type;
+###### grammar fields
+       int terminals_declared;
 
 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 table of known symbols and the resulting parser will report them as
 
 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 table of known symbols and the resulting parser will report them as
@@ -162,17 +169,18 @@ table of known symbols and the resulting parser will report them as
 different token types that `scanner` can report.
 
 ###### declarations
 different token types that `scanner` can report.
 
 ###### declarations
-       static char *reserved_words[] = {
-               [TK_error]        = "ERROR",
-               [TK_number]       = "NUMBER",
-               [TK_ident]        = "IDENTIFIER",
-               [TK_mark]         = "MARK",
-               [TK_string]       = "STRING",
-               [TK_multi_string] = "MULTI_STRING",
-               [TK_in]           = "IN",
-               [TK_out]          = "OUT",
-               [TK_newline]      = "NEWLINE",
-               [TK_eof]          = "$eof",
+
+       static struct { int num; char *name; } reserved_words[] = {
+               { TK_error,        "ERROR" },
+               { TK_number,       "NUMBER" },
+               { TK_ident,        "IDENTIFIER" },
+               { TK_mark,         "MARK" },
+               { TK_string,       "STRING" },
+               { TK_multi_string, "MULTI_STRING" },
+               { TK_in,           "IN" },
+               { TK_out,          "OUT" },
+               { TK_newline,      "NEWLINE" },
+               { TK_eof,          "$eof" },
        };
 ###### symbol fields
        short num;
        };
 ###### symbol fields
        short num;
@@ -228,21 +236,20 @@ symbol, but its type might be `Unknown`.
                for (i = 0; i < entries; i++) {
                        struct text t;
                        struct symbol *s;
                for (i = 0; i < entries; i++) {
                        struct text t;
                        struct symbol *s;
-                       t.txt = reserved_words[i];
-                       if (!t.txt)
-                               continue;
+                       t.txt = reserved_words[i].name;
                        t.len = strlen(t.txt);
                        s = sym_find(g, t);
                        s->type = Terminal;
                        t.len = strlen(t.txt);
                        s = sym_find(g, t);
                        s->type = Terminal;
-                       s->num = i;
+                       s->num = reserved_words[i].num;
                }
        }
 
 ### Data types and precedence.
 
                }
        }
 
 ### Data types and precedence.
 
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line.  If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` the the line declares terminals without
 precedence, otherwise it specifies a data type.
 
 The data type name is simply stored and applied to the head of all
 precedence, otherwise it specifies a data type.
 
 The data type name is simply stored and applied to the head of all
@@ -295,6 +302,7 @@ Subsequent lines introduce symbols with higher precedence.
                struct token t = token_next(ts);
                char *err;
                enum assoc assoc;
                struct token t = token_next(ts);
                char *err;
                enum assoc assoc;
+               int term = 0;
                int found;
 
                if (t.num != TK_ident) {
                int found;
 
                if (t.num != TK_ident) {
@@ -307,7 +315,10 @@ Subsequent lines introduce symbols with higher precedence.
                        assoc = Right;
                else if (text_is(t.txt, "NON"))
                        assoc = Non;
                        assoc = Right;
                else if (text_is(t.txt, "NON"))
                        assoc = Non;
-               else {
+               else if (text_is(t.txt, "TERM")) {
+                       term = 1;
+                       g->terminals_declared = 1;
+               } else {
                        g->current_type = t.txt;
                        g->type_isref = isref;
                        if (text_is(t.txt, "void"))
                        g->current_type = t.txt;
                        g->type_isref = isref;
                        if (text_is(t.txt, "void"))
@@ -325,7 +336,7 @@ Subsequent lines introduce symbols with higher precedence.
                        goto abort;
                }
 
                        goto abort;
                }
 
-               // This is a precedence line, need some symbols.
+               // This is a precedence or TERM line, need some symbols.
                found = 0;
                g->prec_levels += 1;
                t = token_next(ts);
                found = 0;
                g->prec_levels += 1;
                t = token_next(ts);
@@ -339,6 +350,10 @@ Subsequent lines introduce symbols with higher precedence.
                                        err = "$$ must be followed by a word";
                                        goto abort;
                                }
                                        err = "$$ must be followed by a word";
                                        goto abort;
                                }
+                               if (term) {
+                                       err = "Virtual symbols not permitted on $TERM line";
+                                       goto abort;
+                               }
                        } else if (t.num != TK_ident &&
                                   t.num != TK_mark) {
                                err = "Illegal token in precedence line";
                        } else if (t.num != TK_ident &&
                                   t.num != TK_mark) {
                                err = "Illegal token in precedence line";
@@ -346,17 +361,19 @@ Subsequent lines introduce symbols with higher precedence.
                        }
                        s = sym_find(g, t.txt);
                        if (s->type != Unknown) {
                        }
                        s = sym_find(g, t.txt);
                        if (s->type != Unknown) {
-                               err = "Symbols in precedence line must not already be known.";
+                               err = "Symbols in precedence/TERM line must not already be known.";
                                goto abort;
                        }
                        s->type = type;
                                goto abort;
                        }
                        s->type = type;
-                       s->precedence = g->prec_levels;
-                       s->assoc = assoc;
+                       if (!term) {
+                               s->precedence = g->prec_levels;
+                               s->assoc = assoc;
+                       }
                        found += 1;
                        t = token_next(ts);
                }
                if (found == 0)
                        found += 1;
                        t = token_next(ts);
                }
                if (found == 0)
-                       err = "No symbols given on precedence line";
+                       err = "No symbols given on precedence/TERM line";
                        goto abort;
                return NULL;
        abort:
                        goto abort;
                return NULL;
        abort:
@@ -390,14 +407,15 @@ be in one `code_node` of the literate code.  The `}$` must be
 at the end of a line.
 
 Text in the code fragment will undergo substitutions where `$N` or
 at the end of a line.
 
 Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc.  The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
+`$<N`,for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production.  `$0` is the head of the
+production, `$1` is the first symbol of the body, etc.  The type of `$N`
+for a terminal symbol is `struct token`.  For a non-terminal, it is
+whatever has been declared for that symbol.  The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed.  The effect of using '<' is that the
+variable is cleareed to all-zeros.
 
 While building productions we will need to add to an array which needs to
 grow dynamically.
 
 While building productions we will need to add to an array which needs to
 grow dynamically.
@@ -475,8 +493,10 @@ Now we have all the bits we need to parse a full production.
                tk = token_next(state);
                while (tk.num == TK_ident || tk.num == TK_mark) {
                        struct symbol *bs = sym_find(g, tk.txt);
                tk = token_next(state);
                while (tk.num == TK_ident || tk.num == TK_mark) {
                        struct symbol *bs = sym_find(g, tk.txt);
-                       if (bs->type == Unknown)
-                               bs->type = Terminal;
+                       if (bs->type == Unknown) {
+                               if (!g->terminals_declared)
+                                       bs->type = Terminal;
+                       }
                        if (bs->type == Virtual) {
                                err = "Virtual symbol not permitted in production";
                                goto abort;
                        if (bs->type == Virtual) {
                                err = "Virtual symbol not permitted in production";
                                goto abort;
@@ -496,12 +516,17 @@ Now we have all the bits we need to parse a full production.
                                goto abort;
                        }
                        vs = sym_find(g, tk.txt);
                                goto abort;
                        }
                        vs = sym_find(g, tk.txt);
-                       if (vs->type != Virtual) {
-                               err = "symbol after $$ must be virtual";
+                       if (vs->num == TK_newline)
+                               p.line_like = 1;
+                       else if (vs->num == TK_out)
+                               p.line_like = 2;
+                       else if (vs->precedence == 0) {
+                               err = "symbol after $$ must have precedence";
                                goto abort;
                                goto abort;
+                       } else {
+                               p.precedence = vs->precedence;
+                               p.assoc = vs->assoc;
                        }
                        }
-                       p.precedence = vs->precedence;
-                       p.assoc = vs->assoc;
                        tk = token_next(state);
                }
                if (tk.num == TK_open) {
                        tk = token_next(state);
                }
                if (tk.num == TK_open) {
@@ -513,6 +538,7 @@ Now we have all the bits we need to parse a full production.
                        }
                        tk = token_next(state);
                }
                        }
                        tk = token_next(state);
                }
+
                if (tk.num != TK_newline && tk.num != TK_eof) {
                        err = "stray tokens at end of line";
                        goto abort;
                if (tk.num != TK_newline && tk.num != TK_eof) {
                        err = "stray tokens at end of line";
                        goto abort;
@@ -630,6 +656,11 @@ to produce errors that the parser is better positioned to handle.
                        } else if (tk.num == TK_mark
                                   && text_is(tk.txt, "$*")) {
                                err = dollar_line(state, g, 1);
                        } else if (tk.num == TK_mark
                                   && text_is(tk.txt, "$*")) {
                                err = dollar_line(state, g, 1);
+                       } else if (tk.num == TK_mark
+                                  && text_is(tk.txt, "//")) {
+                               while (tk.num != TK_newline &&
+                                      tk.num != TK_eof)
+                                       tk = token_next(state);
                        } else {
                                err = "Unrecognised token at start of line.";
                        }
                        } else {
                                err = "Unrecognised token at start of line.";
                        }
@@ -637,6 +668,21 @@ to produce errors that the parser is better positioned to handle.
                                goto abort;
                }
                token_close(state);
                                goto abort;
                }
                token_close(state);
+               if (g->terminals_declared) {
+                       struct symbol *s;
+                       int errs = 0;
+                       for (s = g->syms; s; s = s->next) {
+                               if (s->type != Unknown)
+                                       continue;
+                               errs += 1;
+                               fprintf(stderr, "Token %.*s not declared\n",
+                                       s->name.len, s->name.txt);
+                       }
+                       if (errs) {
+                               free(g);
+                               g = NULL;
+                       }
+               }
                return g;
        abort:
                fprintf(stderr, "Error at line %d: %s\n",
                return g;
        abort:
                fprintf(stderr, "Error at line %d: %s\n",
@@ -1101,15 +1147,15 @@ the end of the list in the symset, and then only compare items before
 the first "0".
 
 ###### declarations
 the first "0".
 
 ###### declarations
-       static inline unsigned short item_num(int production, int index)
+       static inline unsigned short item_num(int production, int dot)
        {
        {
-               return production | ((31-index) << 11);
+               return production | ((31-dot) << 11);
        }
        static inline int item_prod(unsigned short item)
        {
                return item & 0x7ff;
        }
        }
        static inline int item_prod(unsigned short item)
        {
                return item & 0x7ff;
        }
-       static inline int item_index(unsigned short item)
+       static inline int item_dot(unsigned short item)
        {
                return (31-(item >> 11)) & 0x1f;
        }
        {
                return (31-(item >> 11)) & 0x1f;
        }
@@ -1127,8 +1173,8 @@ can just compare the symset and the data values together.
 
                for (i = 0;
                     i < a.cnt && i < b.cnt &&
 
                for (i = 0;
                     i < a.cnt && i < b.cnt &&
-                    item_index(a.syms[i]) > 0 &&
-                    item_index(b.syms[i]) > 0;
+                    item_dot(a.syms[i]) > 0 &&
+                    item_dot(b.syms[i]) > 0;
                     i++) {
                        int diff = a.syms[i] - b.syms[i];
                        if (diff)
                     i++) {
                        int diff = a.syms[i] - b.syms[i];
                        if (diff)
@@ -1139,11 +1185,11 @@ can just compare the symset and the data values together.
                                        return diff;
                        }
                }
                                        return diff;
                        }
                }
-               if (i == a.cnt || item_index(a.syms[i]) == 0)
+               if (i == a.cnt || item_dot(a.syms[i]) == 0)
                        av = -1;
                else
                        av = a.syms[i];
                        av = -1;
                else
                        av = a.syms[i];
-               if (i == b.cnt || item_index(b.syms[i]) == 0)
+               if (i == b.cnt || item_dot(b.syms[i]) == 0)
                        bv = -1;
                else
                        bv = b.syms[i];
                        bv = -1;
                else
                        bv = b.syms[i];
@@ -1299,12 +1345,14 @@ into the go to set, so the item is ineffective.
 ###### complete itemset
        for (i = 0; i < is->items.cnt; i++) {
                int p = item_prod(is->items.syms[i]);
 ###### complete itemset
        for (i = 0; i < is->items.cnt; i++) {
                int p = item_prod(is->items.syms[i]);
-               int bs = item_index(is->items.syms[i]);
+               int bs = item_dot(is->items.syms[i]);
                struct production *pr = g->productions[p];
                int p2;
                struct symbol *s;
                struct symset LA = INIT_SYMSET;
                unsigned short sn = 0;
                struct production *pr = g->productions[p];
                int p2;
                struct symbol *s;
                struct symset LA = INIT_SYMSET;
                unsigned short sn = 0;
+               struct symset LAnl = INIT_SYMSET;
+               unsigned short snnl = 0;
 
                if (is->min_prefix == 0 ||
                    (bs > 0 && bs < is->min_prefix))
 
                if (is->min_prefix == 0 ||
                    (bs > 0 && bs < is->min_prefix))
@@ -1326,11 +1374,11 @@ into the go to set, so the item is ineffective.
                        continue;
                if (symset_find(&done, s->num) < 0) {
                        symset_add(&done, s->num, 0);
                        continue;
                if (symset_find(&done, s->num) < 0) {
                        symset_add(&done, s->num, 0);
-                       if (s->line_like)
-                               is->starts_line = 1;
                }
                if (s->type != Nonterminal)
                        continue;
                }
                if (s->type != Nonterminal)
                        continue;
+               if (s->line_like)
+                       is->starts_line = 1;
                again = 1;
                if (type >= LALR) {
                        // Need the LA set.
                again = 1;
                if (type >= LALR) {
                        // Need the LA set.
@@ -1342,6 +1390,10 @@ into the go to set, so the item is ineffective.
                        }
                        sn = save_set(g, LA);
                        LA = set_find(g, sn);
                        }
                        sn = save_set(g, LA);
                        LA = set_find(g, sn);
+                       if (symset_find(&LA, TK_newline))
+                               symset_add(&LAnl, TK_newline, 0);
+                       snnl = save_set(g, LAnl);
+                       LAnl = set_find(g, snnl);
                }
 
                /* Add productions for this symbol */
                }
 
                /* Add productions for this symbol */
@@ -1352,19 +1404,25 @@ into the go to set, so the item is ineffective.
                        int itm = item_num(p2, 0);
                        int pos = symset_find(&is->items, itm);
                        if (pos < 0) {
                        int itm = item_num(p2, 0);
                        int pos = symset_find(&is->items, itm);
                        if (pos < 0) {
-                               symset_add(&is->items, itm, sn);
+                               if (g->productions[p2]->line_like)
+                                       symset_add(&is->items, itm, snnl);
+                               else
+                                       symset_add(&is->items, itm, sn);
                                /* Will have re-ordered, so start
                                 * from beginning again */
                                i = -1;
                        } else if (type >= LALR) {
                                struct symset ss = set_find(g, is->items.data[pos]);
                                struct symset tmp = INIT_SYMSET;
                                /* Will have re-ordered, so start
                                 * from beginning again */
                                i = -1;
                        } else if (type >= LALR) {
                                struct symset ss = set_find(g, is->items.data[pos]);
                                struct symset tmp = INIT_SYMSET;
+                               struct symset *la = &LA;
 
 
+                               if (g->productions[p2]->line_like)
+                                       la = &LAnl;
                                symset_union(&tmp, &ss);
                                symset_union(&tmp, &ss);
-                               if (symset_union(&tmp, &LA)) {
+                               if (symset_union(&tmp, la)) {
                                        is->items.data[pos] = save_set(g, tmp);
                                        i = -1;
                                        is->items.data[pos] = save_set(g, tmp);
                                        i = -1;
-                               }else
+                               } else
                                        symset_free(tmp);
                        }
                }
                                        symset_free(tmp);
                        }
                }
@@ -1392,7 +1450,7 @@ with a pre-existing itemset).
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
-                       int bp = item_index(itm);
+                       int bp = item_dot(itm);
                        struct production *pr = g->productions[p];
                        unsigned short la = 0;
                        int pos;
                        struct production *pr = g->productions[p];
                        unsigned short la = 0;
                        int pos;
@@ -1501,6 +1559,11 @@ changeover point in `first_nonterm`.
                                snum++;
                        }
                g->first_nonterm = snum;
                                snum++;
                        }
                g->first_nonterm = snum;
+               for (s = g->syms; s; s = s->next)
+                       if (s->num < 0 && s->type != Virtual) {
+                               s->num = snum;
+                               snum++;
+                       }
                for (s = g->syms; s; s = s->next)
                        if (s->num < 0) {
                                s->num = snum;
                for (s = g->syms; s; s = s->next)
                        if (s->num < 0) {
                                s->num = snum;
@@ -1612,7 +1675,7 @@ it up a bit.  First the items, with production number and associativity.
        static void report_item(struct grammar *g, int itm)
        {
                int p = item_prod(itm);
        static void report_item(struct grammar *g, int itm)
        {
                int p = item_prod(itm);
-               int dot = item_index(itm);
+               int dot = item_dot(itm);
                struct production *pr = g->productions[p];
                int i;
 
                struct production *pr = g->productions[p];
                int i;
 
@@ -1635,6 +1698,10 @@ it up a bit.  First the items, with production number and associativity.
                        printf(" [%d%s]", s->precedence,
                               assoc_names[s->assoc]);
                }
                        printf(" [%d%s]", s->precedence,
                               assoc_names[s->assoc]);
                }
+               if (pr->line_like == 1)
+                       printf(" $$NEWLINE");
+               else if (pr->line_like)
+                       printf(" $$OUT");
                printf("\n");
        }
 
                printf("\n");
        }
 
@@ -1737,7 +1804,7 @@ as shifts always over-ride reductions.
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp == pr->body_size) {
                                struct production *pr = g->productions[p];
 
                                if (bp == pr->body_size) {
@@ -1773,14 +1840,10 @@ terminals to items where that terminal could be shifted and another
 which maps terminals to items that could be reduced when the terminal
 is in look-ahead.  We report when we get conflicts between the two.
 
 which maps terminals to items that could be reduced when the terminal
 is in look-ahead.  We report when we get conflicts between the two.
 
-As a special case, if we find a SHIFT/REDUCE conflict, where a
-terminal that could be shifted is in the lookahead set of some
-reducable item, then set check if the reducable item also have
-`TK_newline` in its lookahead set.  If it does, then a newline will
-force the reduction, but anything else can reasonably be shifted, so
-that isn't really a conflict.  Such apparent conflicts do not get
-counted, and are reported as non-critical.  This will not affect a
-"traditional" grammar that does not include newlines as token.
+As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
+terminal, we ignore it.  NEWLINES are handled specially with its own
+rules for when to shift and when to reduce.  Conflicts are expected,
+but handled internally.
 
        static int conflicts_slr(struct grammar *g, enum grammar_type type)
        {
 
        static int conflicts_slr(struct grammar *g, enum grammar_type type)
        {
@@ -1798,7 +1861,7 @@ counted, and are reported as non-critical.  This will not affect a
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
                                struct symbol *s;
 
                                struct production *pr = g->productions[p];
                                struct symbol *s;
 
@@ -1819,7 +1882,7 @@ counted, and are reported as non-critical.  This will not affect a
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -1833,13 +1896,10 @@ counted, and are reported as non-critical.  This will not affect a
                                int k;
                                for (k = 0; k < la.cnt; k++) {
                                        int pos = symset_find(&shifts, la.syms[k]);
                                int k;
                                for (k = 0; k < la.cnt; k++) {
                                        int pos = symset_find(&shifts, la.syms[k]);
-                                       if (pos >= 0) {
-                                               if (symset_find(&la, TK_newline) < 0) {
-                                                       printf("  State %d has SHIFT/REDUCE conflict on ", i);
-                                                       cnt++;
-                                               } else
-                                                       printf("  State %d has non-critical SHIFT/REDUCE conflict on ", i);
-                                               prtxt(g->symtab[la.syms[k]]->name);
+                                       if (pos >= 0 && la.syms[k] != TK_newline) {
+                                               printf("  State %d has SHIFT/REDUCE conflict on ", i);
+                                               cnt++;
+                                                       prtxt(g->symtab[la.syms[k]]->name);
                                                printf(":\n");
                                                report_item(g, shifts.data[pos]);
                                                report_item(g, itm);
                                                printf(":\n");
                                                report_item(g, shifts.data[pos]);
                                                report_item(g, itm);
@@ -1863,6 +1923,7 @@ counted, and are reported as non-critical.  This will not affect a
                return cnt;
        }
 
                return cnt;
        }
 
+
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
@@ -1896,7 +1957,6 @@ pieces of code provided in the grammar file, so they are generated first.
                fprintf(f, "\tstruct token_state *tokens;\n");
                fprintf(f, "\tconfig->words_marks = known;\n");
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                fprintf(f, "\tstruct token_state *tokens;\n");
                fprintf(f, "\tconfig->words_marks = known;\n");
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-               fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
                fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
                fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
@@ -1934,7 +1994,7 @@ numbers right.
                for (i = TK_reserved;
                     i < g->num_syms;
                     i++)
                for (i = TK_reserved;
                     i < g->num_syms;
                     i++)
-                       if (g->symtab[i]->type != Terminal)
+                       if (g->symtab[i]->type == Nonterminal)
                                fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                        g->symtab[i]->name.txt);
                fprintf(f, "};\n\n");
                                fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                        g->symtab[i]->name.txt);
                fprintf(f, "};\n\n");
@@ -1964,7 +2024,8 @@ The go to table is stored in a simple array of `sym` and corresponding
                short reduce_prod;
                short reduce_size;
                short reduce_sym;
                short reduce_prod;
                short reduce_size;
                short reduce_sym;
-               short starts_line;
+               char starts_line;
+               char newline_only;
                short min_prefix;
        };
 
                short min_prefix;
        };
 
@@ -2000,7 +2061,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -2013,13 +2074,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                        }
 
                        if (prod >= 0)
                        }
 
                        if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
                                        i, is->go_to.cnt, i, prod,
                                        g->productions[prod]->body_size,
                                        g->productions[prod]->head->num,
                                        i, is->go_to.cnt, i, prod,
                                        g->productions[prod]->body_size,
                                        g->productions[prod]->head->num,
-                                       is->starts_line, is->min_prefix);
+                                       is->starts_line,
+                                       g->productions[prod]->line_like,
+                                       is->min_prefix);
                        else
                        else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
                                        i, is->go_to.cnt, i,
                                        is->starts_line, is->min_prefix);
                }
                                        i, is->go_to.cnt, i,
                                        is->starts_line, is->min_prefix);
                }
@@ -2050,13 +2113,105 @@ structure returned by a previous reduction.  These pointers need to be cast
 to the appropriate type for each access.  All this is handled in
 `gen_code`.
 
 to the appropriate type for each access.  All this is handled in
 `gen_code`.
 
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed.  This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'.  This is particularly useful for references (or pointers), but
+can be used with structures too.  The `<` implies that the value it
+being moved out, so the object will not be automatically freed.  It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number.  A number by itself (other than zero) selects a
+symbol from the body of the production.  A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order.  If a number follows the letters, then a later occurrence of
+that symbol is chosen.  So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`.  If there is no unique shortest system, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
 
 ###### functions
 
 
 ###### functions
 
+       static int textchr(struct text t, char c, int s)
+       {
+               int i;
+               for (i = s; i < t.len; i++)
+                       if (t.txt[i] == c)
+                               return i;
+               return -1;
+       }
+
+       static int subseq_match(char *seq, int slen, struct text name)
+       {
+               int st = 0;
+               while (slen > 0) {
+                       st = textchr(name, *seq, st);
+                       if (st < 0)
+                               return 0;
+                       slen -= 1;
+                       seq += 1;
+                       st += 1;
+               }
+               return 1;
+       }
+
+       static int choose_sym(char **namep, int len, struct production *p)
+       {
+               char *name = *namep;
+               char *nam = name;
+               int namlen;
+               int n = 0;
+               int i, s, slen;
+               char c;
+
+               c = *name;
+               while (len > 0 &&
+                      ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
+                       name += 1;
+                       len -= 1;
+                       c = *name;
+               }
+               namlen = name-nam;
+               while (len > 0 && (c >= '0' && c <= '9')) {
+                       name += 1;
+                       len -= 1;
+                       n = n * 10 + (c - '0');
+                       c = *name;
+               }
+               if (namlen == 0) {
+                       if (name == *namep)
+                               return -1;
+                       *namep = name;
+                       return n;
+               }
+               slen = 0; s = -1;
+               for (i = 0; i < p->body_size; i++) {
+                       if (!subseq_match(nam, namlen, p->body[i]->name))
+                               continue;
+                       if (slen == 0 || p->body[i]->name.len < slen)
+                               s = i;
+                       if (s >= 0 && p->body[i] != p->body[s] &&
+                           p->body[i]->name.len == p->body[s]->name.len)
+                               /* not unique, so s cannot be used */
+                               s = -1;
+               }
+               if (s < 0)
+                       return -1;
+               if (n == 0);
+                       n = 1;
+               for (i = 0; i < p->body_size; i++)
+                       if (p->body[i] == p->body[s]) {
+                               n -= 1;
+                               if (n == 0)
+                                       break;
+                       }
+               if (n > 1)
+                       return -1;
+               *namep = name;
+               return i + 1;
+       }
+
        static void gen_code(struct production *p, FILE *f, struct grammar *g)
        {
                char *c;
        static void gen_code(struct production *p, FILE *f, struct grammar *g)
        {
                char *c;
@@ -2078,24 +2233,19 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                use = 1;
                                c++;
                        }
                                use = 1;
                                c++;
                        }
-                       if (*c < '0' || *c > '9') {
+                       n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+                       if (n < 0) {
+                               fputc('$', f);
                                if (use)
                                        fputc('<', f);
                                fputc(*c, f);
                                continue;
                        }
                                if (use)
                                        fputc('<', f);
                                fputc(*c, f);
                                continue;
                        }
-                       n = *c - '0';
-                       while (c[1] >= '0' && c[1] <= '9') {
-                               c += 1;
-                               n = n * 10 + *c - '0';
-                       }
                        if (n == 0)
                                fprintf(f, "(*(struct %.*s*%s)ret)",
                                        p->head->struct_name.len,
                                        p->head->struct_name.txt,
                                        p->head->isref ? "*":"");
                        if (n == 0)
                                fprintf(f, "(*(struct %.*s*%s)ret)",
                                        p->head->struct_name.len,
                                        p->head->struct_name.txt,
                                        p->head->isref ? "*":"");
-                       else if (n > p->body_size)
-                               fprintf(f, "$%d", n);
                        else if (p->body[n-1]->type == Terminal)
                                fprintf(f, "(*(struct token *)body[%d])",
                                        n-1);
                        else if (p->body[n-1]->type == Terminal)
                                fprintf(f, "(*(struct token *)body[%d])",
                                        n-1);
@@ -2108,6 +2258,7 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                        p->body[n-1]->isref ? "*":"", n-1);
                                used[n-1] = use;
                        }
                                        p->body[n-1]->isref ? "*":"", n-1);
                                used[n-1] = use;
                        }
+                       c -= 1;
                }
                fputs("\n", f);
                for (i = 0; i < p->body_size; i++) {
                }
                fputs("\n", f);
                for (i = 0; i < p->body_size; i++) {
@@ -2694,8 +2845,12 @@ accepted the input and can finish.
 We return whatever `asn` was returned by reducing production zero.
 
 If we can neither shift nor reduce we have an error to handle.  We pop
 We return whatever `asn` was returned by reducing production zero.
 
 If we can neither shift nor reduce we have an error to handle.  We pop
-single entries off the stack until we can shift the `TK_error` symbol, then
-drop input tokens until we find one we can shift into the new error state.
+single entries off the stack until we can shift the `TK_error` symbol,
+then drop input tokens until we find one we can shift into the new error
+state.  We need to ensure that something is dropped or shifted after an
+error, or we could get into an infinite loop, only shifting in
+`TK_error`, then reducing.  So we track if there has been a shift since
+the last error, and if not the next error always discards one token.
 
 When we find `TK_in` and `TK_out` tokens which report indents we need
 to handle them directly as the grammar cannot express what we want to
 
 When we find `TK_in` and `TK_out` tokens which report indents we need
 to handle them directly as the grammar cannot express what we want to
@@ -2739,11 +2894,11 @@ in the thing that preceed:
 Here the NEWLINE will be shifted because nothing can be reduced until
 the `if` is seen.
 
 Here the NEWLINE will be shifted because nothing can be reduced until
 the `if` is seen.
 
-When, during error handling, we discard token read in, we want to keep
+When during error handling we discard tokens read in, we want to keep
 discarding until we see one that is recognised.  If we had a full set
 discarding until we see one that is recognised.  If we had a full set
-of LR(1) grammar states, this will mean looking in the look-ahead set,
+of LR(1) grammar states, this would mean looking in the look-ahead set,
 but we don't keep a full look-ahead set.  We only record the subset
 but we don't keep a full look-ahead set.  We only record the subset
-that leads to SHIFT.  We can, however, deduce the look-ahead set but
+that leads to SHIFT.  We can, however, deduce the look-ahead set by
 looking at the SHIFT subsets for all states that we can get to by
 reducing zero or more times.  So we need a little function which
 checks if a given token is in any of these look-ahead sets.
 looking at the SHIFT subsets for all states that we can get to by
 reducing zero or more times.  So we need a little function which
 checks if a given token is in any of these look-ahead sets.
@@ -2775,10 +2930,11 @@ checks if a given token is in any of these look-ahead sets.
                struct parser p = { 0 };
                struct token *tk = NULL;
                int accepted = 0;
                struct parser p = { 0 };
                struct token *tk = NULL;
                int accepted = 0;
+               int shift_since_err = 1;
                void *ret = NULL;
 
                shift(&p, TK_eof, 0, 1, NULL, states);
                void *ret = NULL;
 
                shift(&p, TK_eof, 0, 1, NULL, states);
-               while (!accepted) {
+               while (!accepted && p.tos > 0) {
                        struct token *err_tk;
                        struct frame *tos = &p.stack[p.tos-1];
                        if (!tk)
                        struct token *err_tk;
                        struct frame *tos = &p.stack[p.tos-1];
                        if (!tk)
@@ -2846,12 +3002,23 @@ checks if a given token is in any of these look-ahead sets.
                                        goto force_reduce;
                        }
                        if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
                                        goto force_reduce;
                        }
                        if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
+                               shift_since_err = 1;
                                tk = NULL;
                                parser_trace_action(trace, "Shift");
                                continue;
                        }
                force_reduce:
                                tk = NULL;
                                parser_trace_action(trace, "Shift");
                                continue;
                        }
                force_reduce:
-                       if (states[tos->state].reduce_prod >= 0) {
+                       if (states[tos->state].reduce_prod >= 0 &&
+                           states[tos->state].newline_only &&
+                           !(tk->num == TK_newline ||
+                             tk->num == TK_eof ||
+                             tk->num == TK_out ||
+                             (tos->indents == 0 && tos->since_newline == 0))) {
+                               /* Anything other than newline or out or eof
+                                * in an error unless we are already at start
+                                * of line, as this production must end at EOL.
+                                */
+                       } else if (states[tos->state].reduce_prod >= 0) {
                                void **body;
                                void *res;
                                const struct state *nextstate = &states[tos->state];
                                void **body;
                                void *res;
                                const struct state *nextstate = &states[tos->state];
@@ -2900,11 +3067,22 @@ checks if a given token is in any of these look-ahead sets.
                                // no state accepted TK_error
                                break;
                        }
                                // no state accepted TK_error
                                break;
                        }
+                       if (!shift_since_err) {
+                               /* must discard at least one token to avoid
+                                * infinite loop.
+                                */
+                               if (tk->num == TK_eof)
+                                       break;
+                               free(tk);
+                               tk = tok_copy(token_next(tokens));
+                       }
+                       shift_since_err = 0;
                        tos = &p.stack[p.tos-1];
                        while (!in_lookahead(tk, states, tos->state) &&
                               tk->num != TK_eof) {
                                free(tk);
                                tk = tok_copy(token_next(tokens));
                        tos = &p.stack[p.tos-1];
                        while (!in_lookahead(tk, states, tos->state) &&
                               tk->num != TK_eof) {
                                free(tk);
                                tk = tok_copy(token_next(tokens));
+                               shift_since_err = 1;
                                if (tk->num == TK_in)
                                        indents += 1;
                                if (tk->num == TK_out) {
                                if (tk->num == TK_in)
                                        indents += 1;
                                if (tk->num == TK_out) {
@@ -3001,7 +3179,7 @@ end inside square brackets.
                        fputs(reserved_words[tk->num], trace);
                else
                        text_dump(trace, tk->txt, 20);
                        fputs(reserved_words[tk->num], trace);
                else
                        text_dump(trace, tk->txt, 20);
-               fputs("]", trace);
+               fprintf(trace, ":%d:%d]", tk->line, tk->col);
        }
 
        void parser_trace_action(FILE *trace, char *action)
        }
 
        void parser_trace_action(FILE *trace, char *action)
@@ -3032,7 +3210,7 @@ an error.
 
 # calc: header
 
 
 # calc: header
 
-       #include "number.h"
+       #include "parse_number.h"
        // what do we use for a demo-grammar?  A calculator of course.
        struct number {
                mpq_t val;
        // what do we use for a demo-grammar?  A calculator of course.
        struct number {
                mpq_t val;
@@ -3052,7 +3230,6 @@ an error.
        #include <string.h>
        #include "mdcode.h"
        #include "scanner.h"
        #include <string.h>
        #include "mdcode.h"
        #include "scanner.h"
-       #include "number.h"
        #include "parser.h"
 
        #include "calc.h"
        #include "parser.h"
 
        #include "calc.h"
@@ -3078,7 +3255,6 @@ an error.
                struct section *s;
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
                struct section *s;
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment)
                                 | (1 << TK_in)
                                 | (1 << TK_out),
                        .number_chars = ".,_+-",
                                 | (1 << TK_in)
                                 | (1 << TK_out),
                        .number_chars = ".,_+-",
@@ -3100,7 +3276,7 @@ an error.
 # calc: grammar
 
        $LEFT + -
 # calc: grammar
 
        $LEFT + -
-       $LEFT * /
+       $LEFT * / //
 
        Session -> Session Line
                | Line
 
        Session -> Session Line
                | Line
@@ -3128,6 +3304,16 @@ an error.
                | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
                | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | Expression // Expression ${ {
+                       mpz_t z0, z1, z2;
+                       mpq_init($0.val);
+                       mpz_init(z0); mpz_init(z1); mpz_init(z2);
+                       mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+                       mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+                       mpz_tdiv_q(z0, z1, z2);
+                       mpq_set_z($0.val, z0);
+                       mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+               } }$
                | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
                | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3140,4 +3326,6 @@ an error.
        10 * 9 / 2
        1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
        10 * 9 / 2
        1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+       355//113
+
        error
        error