parsergen: add handling for TK_IN and TK_OUT
csrc/parsergen.mdc
index ddf03ab27d640ab38f8a07c342f9c5ee9dc66324..d2d70d531d163a67d1de05a1682d71573b37b404 100644
@@ -40,7 +40,8 @@ There are several distinct sections.
        #include <stdlib.h>
        #include <stdio.h>
        ## parser includes
-       ## parser
+       ## parser functions
+       ## parser_run
 ###### File: calc.cgm
        ## demo grammar
 ###### File: parsergen.mk
@@ -97,7 +98,7 @@ information about each symbol.
        };
 
 The strings reported by `mdcode` and `scanner` are `struct text` which have
-length rather than being null terminated.  To help with printing an
+length rather than being null terminated.  To help with printing and
 comparing we define `text_is` and `prtxt`, which should possibly go in
 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 which might contain control characters.
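
As a minimal sketch, assuming `struct text` carries a `txt` pointer and a
`len` count as `mdcode` provides, such a comparison helper can look like
this (the real `text_is` may differ in detail):

        #include <stdio.h>
        #include <string.h>

        struct text { char *txt; int len; };    /* assumed layout, as provided by mdcode */

        /* Compare a length-delimited string with a NUL-terminated one. */
        static int text_is_sketch(struct text t, char *s)
        {
                return (int)strlen(s) == t.len &&
                       strncmp(s, t.txt, t.len) == 0;
        }

        int main(void)
        {
                struct text t = { "grammar: parser", 7 };       /* only the first 7 chars */
                printf("%d %d\n", text_is_sketch(t, "grammar"),
                       text_is_sketch(t, "grammar:"));          /* prints "1 0" */
                return 0;
        }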
@@ -142,8 +143,8 @@ different token types that `scanner` can report.
                [TK_mark]         = "MARK",
                [TK_string]       = "STRING",
                [TK_multi_string] = "MULTI_STRING",
-               [TK_indent]       = "INDENT",
-               [TK_undent]       = "UNDENT",
+               [TK_in]           = "IN",
+               [TK_out]          = "OUT",
                [TK_newline]      = "NEWLINE",
                [TK_eof]          = "$eof",
        };
@@ -234,6 +235,12 @@ The data type name is simply stored and applied to the head of all
 subsequent productions.  It must be the name of a structure, so `$expr`
 maps to `struct expr`.
 
+Any productions given before the first data type will have no data type
+and can carry no information.  In order to allow other non-terminals to
+have no type, the data type `$void` can be given.  This does *not* mean
+that `struct void` will be used, but rather that no type will be
+associated with future non-terminals.
+
 The precedence line must contain a list of symbols - typically
 terminal symbols, but not necessarily.  It can only contain symbols
 that have not been seen yet, so precedence declaration must precede
@@ -281,6 +288,8 @@ production inherits from the last symbol which has a precedence.
                        assoc = Non;
                else {
                        g->current_type = t.txt;
+                       if (text_is(t.txt, "void"))
+                               g->current_type.txt = NULL;
                        t = token_next(ts);
                        if (t.num != TK_newline) {
                                err = "Extra tokens after type name";
@@ -343,7 +352,7 @@ precedence of the production is that for the virtual symbol.  If none
 is given, the precedence is inherited from the last symbol in the
 production which has a precedence specified.
 
-After the optional precedence may come  the `${` mark.  This indicates
+After the optional precedence may come the `${` mark.  This indicates
 the start of a code fragment.  If present, this must be on the same
 line as the start of the production.
 
@@ -357,7 +366,7 @@ some numeric `N` will be replaced with a variable holding the parse
 information for the particular symbol in the production.  `$0` is the
 head of the production, `$1` is the first symbol of the body, etc.
 The type of `$N` for a terminal symbol is `struct token`.  For
-non-terminal, it is whatever has been declared for that symbol.
+non-terminal, it is whatever has been declared for that symbol.
 
 While building productions we will need to add to an array which needs to
 grow dynamically.
@@ -489,15 +498,16 @@ With the ability to parse production and dollar-lines, we have nearly all
 that we need to parse a grammar from a `code_node`.
 
 The head of the first production will effectively be the `start` symbol of
-the grammar.  However it wont _actually_ be so.  Processing the grammar is
+the grammar.  However it won't _actually_ be so.  Processing the grammar is
 greatly simplified if the real start symbol only has a single production,
-and expect `$eof` as the final terminal.  So when we find the first explicit
-production we insert an extra production as production zero which looks like
+and expects `$eof` as the final terminal.  So when we find the first
+explicit production we insert an extra production as production zero which
+looks like
 
 ###### Example: production 0
        $start -> START $eof
 
-where `START` is the first non-terminal give.
+where `START` is the first non-terminal given.
 
 ###### create production zero
        struct production *p = calloc(1,sizeof(*p));
@@ -507,7 +517,7 @@ where `START` is the first non-terminal give.
        p->head->type = Nonterminal;
        array_add(&p->body, &p->body_size, head);
        array_add(&p->body, &p->body_size, sym_find(g, eof));
-       g->start  = p->head->num;
+       g->start = p->head->num;
        p->head->first_production = g->production_count;
        array_add(&g->productions, &g->production_count, p);
 
@@ -528,10 +538,10 @@ Now we are ready to read in the grammar.
                        .ignored = (1 << TK_line_comment)
                                 | (1 << TK_block_comment)
                                 | (0 << TK_number)
-                                | (0 << TK_string)
+                                | (1 << TK_string)
                                 | (1 << TK_multi_string)
-                                | (1 << TK_indent)
-                                | (1 << TK_undent),
+                                | (1 << TK_in)
+                                | (1 << TK_out),
                };
 
                struct token_state *state = token_open(code, &conf);
@@ -576,7 +586,7 @@ Now we are ready to read in the grammar.
                                else
                                        err = "First production must have a head";
                        } else if (tk.num == TK_mark
-                                  &&  text_is(tk.txt, "$")) {
+                                  && text_is(tk.txt, "$")) {
                                err = dollar_line(state, g);
                        } else {
                                err = "Unrecognised token at start of line.";
@@ -636,7 +646,7 @@ allocated space as it can be derived directly from the current `cnt` using
 `((cnt - 1) | 7) + 1`.
 
 ###### functions
-       static void symset_add(struct symset *s, int key, int val)
+       static void symset_add(struct symset *s, unsigned short key, unsigned short val)
        {
                int i;
                int current = ((s->cnt-1) | 7) + 1;
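
For illustration, this size rule always rounds `cnt` up to the next
multiple of eight, so the arrays grow in steps of eight entries:

        #include <stdio.h>

        int main(void)
        {
                int cnt;
                /* prints 8 for cnt 1..8, 16 for 9..16, 24 for 17 */
                for (cnt = 1; cnt <= 17; cnt++)
                        printf("cnt=%2d allocated=%d\n", cnt, ((cnt - 1) | 7) + 1);
                return 0;
        }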
@@ -663,7 +673,7 @@ Finding a symbol (or item) in a `symset` uses a simple binary search.
 We return the index where the value was found (so data can be accessed),
 or `-1` to indicate failure.
 
-       static int symset_find(struct symset *ss, int key)
+       static int symset_find(struct symset *ss, unsigned short key)
        {
                int lo = 0;
                int hi = ss->cnt;
@@ -695,7 +705,7 @@ can be optimised later.
                int added = 0;
                for (i = 0; i < b->cnt; i++)
                        if (symset_find(a, b->syms[i]) < 0) {
-                               int data = 0;
+                               unsigned short data = 0;
                                if (b->data != NO_DATA)
                                        data = b->data[i];
                                symset_add(a, b->syms[i], data);
@@ -975,7 +985,7 @@ building the itemsets and states for the LR grammar.  They are:
 1. LR(0) or SLR(1), where no look-ahead is considered.
 2. LALR(1) where we build look-ahead sets with each item and merge
    the LA sets when we find two paths to the same "kernel" set of items.
-3. LR(1) where different look-ahead for any item in the code means
+3. LR(1) where different look-ahead for any item in the set means
    a different state must be created.
 
 ###### forward declarations
@@ -997,13 +1007,14 @@ as we want to do the lookup after generating the "kernel" of an
 itemset, so we need to ignore the offset=zero items which are added during
 completion.
 
-To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
-the list in the symset, and then only compare items before the first "0".
+To facilitate this, we modify the "DOT" number so that "0" sorts to
+the end of the list in the symset, and then only compare items before
+the first "0".
 
 ###### declarations
        static inline unsigned short item_num(int production, int index)
        {
-               return production | (((index-1)&0x1f) << 11);
+               return production | ((31-index) << 11);
        }
        static inline int item_prod(unsigned short item)
        {
@@ -1011,7 +1022,7 @@ the list in the symset, and then only compare items before the first "0".
        }
        static inline int item_index(unsigned short item)
        {
-               return ((item >> 11)+1) & 0x1f;
+               return (31-(item >> 11)) & 0x1f;
        }
 
 For LR(1) analysis we need to compare not just the itemset in a state
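
The encoding above can be checked with a small standalone program:
packing `31-index` into the high bits gives offset-zero items the
largest keys, so they sort after all the kernel items in the symset:

        #include <stdio.h>

        static inline unsigned short item_num(int production, int index)
        {
                return production | ((31-index) << 11);
        }
        static inline int item_index(unsigned short item)
        {
                return (31-(item >> 11)) & 0x1f;
        }

        int main(void)
        {
                /* production 3 at offsets 2, 1 and 0 */
                unsigned short a = item_num(3, 2), b = item_num(3, 1), c = item_num(3, 0);
                printf("%u < %u < %u\n", a, b, c);      /* the offset-0 item is largest */
                printf("indices: %d %d %d\n",
                       item_index(a), item_index(b), item_index(c));    /* 2 1 0 */
                return 0;
        }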
@@ -1168,7 +1179,7 @@ though.
                int p2;
                struct symbol *s;
                struct symset LA = INIT_SYMSET;
-               int sn = 0;
+               unsigned short sn = 0;
 
                if (bs == pr->body_size)
                        continue;
@@ -1226,7 +1237,7 @@ with a pre-existing itemset).
        // if they don't exist.
        for (i = 0; i < done.cnt; i++) {
                int j;
-               int state;
+               unsigned short state;
                struct symset newitemset = INIT_SYMSET;
                if (type >= LALR)
                        newitemset = INIT_DATASET;
@@ -1236,7 +1247,7 @@ with a pre-existing itemset).
                        int p = item_prod(itm);
                        int bp = item_index(itm);
                        struct production *pr = g->productions[p];
-                       int la = 0;
+                       unsigned short la = 0;
                        int pos;
 
                        if (bp == pr->body_size)
@@ -1277,7 +1288,7 @@ with `TK_eof` as the LA set.
                struct symset first = INIT_SYMSET;
                struct itemset *is;
                int again;
-               int la = 0;
+               unsigned short la = 0;
                if (type >= LALR) {
                        // LA set just has eof
                        struct symset eof = INIT_SYMSET;
@@ -1285,7 +1296,7 @@ with `TK_eof` as the LA set.
                        la = save_set(g, eof);
                        first = INIT_DATASET;
                }
-               // production 0, offset 0  (with no data)
+               // production 0, offset 0 (with no data)
                symset_add(&first, item_num(0, 0), la);
                add_itemset(g, first, type);
                for (again = 0, is = g->items;
@@ -1296,6 +1307,7 @@ with `TK_eof` as the LA set.
                        if (is->completed)
                                continue;
                        is->completed = 1;
+                       again = 1;
                        ## complete itemset
                        ## derive itemsets
                        symset_free(done);
@@ -1607,7 +1619,7 @@ between the two.
                                continue;
                        /* First collect the shifts */
                        for (j = 0; j < is->items.cnt; j++) {
-                               int itm = is->items.syms[j];
+                               unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
                                int bp = item_index(itm);
                                struct production *pr = g->productions[p];
@@ -1622,7 +1634,7 @@ between the two.
                        }
                        /* Now look for reduction and conflicts */
                        for (j = 0; j < is->items.cnt; j++) {
-                               int itm = is->items.syms[j];
+                               unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
                                int bp = item_index(itm);
                                struct production *pr = g->productions[p];
@@ -1677,7 +1689,7 @@ known words added and then is used with the `code_node` to initialize the
 scanner.
 
 `parse_XX` then calls the library function `parser_run` to actually complete
-the parse,  This needs the `states` table and function to call the various
+the parse.  This needs the `states` table and function to call the various
 pieces of code provided in the grammar file, so they are generated first.
 
 ###### parser_generate
@@ -1685,6 +1697,7 @@ pieces of code provided in the grammar file, so they are generated first.
        static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
        {
                gen_known(f, g);
+               gen_non_term(f, g);
                gen_goto(f, g);
                gen_states(f, g);
                gen_reduce(f, g, file);
@@ -1699,7 +1712,7 @@ pieces of code provided in the grammar file, so they are generated first.
                fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                fprintf(f, "\ttokens = token_open(code, config);\n");
-               fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
+               fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
                fprintf(f, "\ttoken_close(tokens);\n");
                fprintf(f, "\treturn rv;\n");
                fprintf(f, "}\n\n");
@@ -1708,6 +1721,7 @@ pieces of code provided in the grammar file, so they are generated first.
 ### Known words table
 
 The known words table is simply an array of terminal symbols.
+The table of nonterminals used for tracing is a similar array.
 
 ###### functions
 
@@ -1724,10 +1738,24 @@ The know words is simply an array of terminal symbols.
                fprintf(f, "};\n\n");
        }
 
+       static void gen_non_term(FILE *f, struct grammar *g)
+       {
+               int i;
+               fprintf(f, "#line 0 \"gen_non_term\"\n");
+               fprintf(f, "static const char *non_term[] = {\n");
+               for (i = TK_reserved;
+                    i < g->num_syms;
+                    i++)
+                       if (g->symtab[i]->type == Nonterminal)
+                               fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
+                                       g->symtab[i]->name.txt);
+               fprintf(f, "};\n\n");
+       }
+
 ### States and the goto tables.
 
-For each state we record the goto table and the reducible production if
-there is one.
+For each state we record the goto table, the reducible production if
+there is one, or a symbol to shift for error recovery.
 Some of the details of the reducible production are stored in the
 `do_reduce` function to come later.  Here we store the production number,
 the body size (useful for stack management) and the resulting symbol (useful
@@ -1748,6 +1776,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                short reduce_prod;
                short reduce_size;
                short reduce_sym;
+               short shift_sym;
        };
 
 
@@ -1778,24 +1807,39 @@ The go to table is stored in a simple array of `sym` and corresponding
                fprintf(f, "static const struct state states[] = {\n");
                for (i = 0; i < g->states; i++) {
                        struct itemset *is = g->statetab[i];
-                       int j, prod = -1;
+                       int j, prod = -1, prod_len;
+                       int shift_sym = -1;
+                       int shift_len = 0, shift_remain = 0;
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                                int bp = item_index(itm);
                                struct production *pr = g->productions[p];
 
-                               if (bp < pr->body_size)
+                               if (bp < pr->body_size) {
+                                       if (shift_sym < 0 ||
+                                           (shift_len == bp && shift_remain > pr->body_size - bp)) {
+                                               shift_sym = pr->body[bp]->num;
+                                               shift_len = bp;
+                                               shift_remain = pr->body_size - bp;
+                                       }
                                        continue;
+                               }
                                /* This is what we reduce */
-                               prod = p;
-                               break;
+                               if (prod < 0 || prod_len < pr->body_size) {
+                                       prod = p;
+                                       prod_len = pr->body_size;
+                               }
                        }
 
-                       fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
-                               i, is->go_to.cnt, i, prod,
-                               prod < 0 ? -1 : g->productions[prod]->body_size,
-                               prod < 0 ? -1 : g->productions[prod]->head->num);
+                       if (prod >= 0)
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0 },\n",
+                                       i, is->go_to.cnt, i, prod,
+                                       g->productions[prod]->body_size,
+                                       g->productions[prod]->head->num);
+                       else
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d },\n",
+                                       i, is->go_to.cnt, i, shift_sym);
                }
                fprintf(f, "};\n\n");
        }
@@ -1867,10 +1911,9 @@ to the appropriate type for each access.  All this is handling in
 
        static void gen_reduce(FILE *f, struct grammar *g, char *file)
        {
-               int i, j;
+               int i;
                fprintf(f, "#line 0 \"gen_reduce\"\n");
-               fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
-               fprintf(f, "                      void *ret, FILE *trace)\n");
+               fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
                fprintf(f, "{\n");
                fprintf(f, "\tint ret_size = 0;\n");
 
@@ -1882,21 +1925,6 @@ to the appropriate type for each access.  All this is handling in
                        if (p->code.txt)
                                gen_code(p, f, g);
 
-                       fprintf(f, "\t\tif (trace) {\n");
-                       fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
-                               p->head->name.len, p->head->name.txt);
-                       for (j = 0; j < p->body_size; j++) {
-                               if (p->body[j]->type == Terminal) {
-                                       fputs("\t\t\tfputs(\" \", trace);\n", f);
-                                       fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
-                               } else {
-                                       fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
-                                               p->body[j]->name.len,
-                                               p->body[j]->name.txt);
-                               }
-                       }
-                       fprintf(f, "\t\t}\n");
-
                        if (p->head->struct_name.txt)
                                fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
                                        p->head->struct_name.len,
@@ -2195,7 +2223,7 @@ The parser generator has nicely provided us with goto tables sorted by
 symbol number.  We need a binary search function to find a symbol in the
 table.
 
-###### parser
+###### parser functions
 
        static int search(const struct state *l, int sym)
        {
@@ -2219,7 +2247,7 @@ table.
 
 ### The state stack.
 
-The core data structure for the parser is the stack.  This track all the
+The core data structure for the parser is the stack.  This tracks all the
 symbols that have been recognised or partially recognised.
 
 The stack usually won't grow very large - maybe a few tens of entries.  So
@@ -2232,21 +2260,30 @@ We keep the stack as two separate allocations.  One, `asn_stack` stores the
 production, and by keeping a separate `asn` stack, we can just pass a
 pointer into this stack.
 
-The other allocation store all other stack fields of which there are two.
+The other allocation stores all other stack fields of which there are four.
 The `state` is the most important one and guides the parsing process.  The
 `sym` is nearly unnecessary.  However when we want to free entries from the
 `asn_stack`, it helps to know what type they are so we can call the right
 freeing function.  The symbol leads us to the right free function through
 `do_free`.
 
-###### parser
+The `indents` count and the `starts_indented` flag track the line
+indents in the symbol.  These are used to allow indent information to
+guide parsing and error recovery.
+
+As well as the stack of frames we have a `next` frame which is
+assembled from the incoming token and other information prior to
+pushing it onto the stack.
+
+###### parser functions
 
        struct parser {
-               int state;
                struct frame {
                        short state;
                        short sym;
-               } *stack;
+                       short starts_indented;
+                       short indents;
+               } *stack, next;
                void **asn_stack;
                int stack_size;
                int tos;
@@ -2254,9 +2291,9 @@ freeing function.  The symbol leads us to the right free function through
 
 #### Shift and pop
 
-The operations are needed on the stack - shift (which is like push) and pop.
+Two operations are needed on the stack - shift (which is like push) and pop.
 
-Shift applies no only to terminals but also to non-terminals.  When we
+Shift applies not only to terminals but also to non-terminals.  When we
 reduce a production we will pop off entries corresponding to the body
 symbols, then push on an item for the head of the production.  This last is
 exactly the same process as shifting in a terminal so we use the same
@@ -2270,14 +2307,14 @@ function reports if it could.
 So `shift` finds the next state.  If that succeeds it extends the allocations
 if needed and pushes all the information onto the stacks.
 
-###### parser
+###### parser functions
 
        static int shift(struct parser *p,
-                        int sym, void *asn,
+                        void *asn,
                         const struct state states[])
        {
                // Push an entry onto the stack
-               int newstate = search(&states[p->state], sym);
+               int newstate = search(&states[p->next.state], p->next.sym);
                if (newstate < 0)
                        return 0;
                if (p->tos >= p->stack_size) {
@@ -2287,43 +2324,49 @@ if needed and pushed all the information onto the stacks.
                        p->asn_stack = realloc(p->asn_stack, p->stack_size
                                           * sizeof(p->asn_stack[0]));
                }
-               p->stack[p->tos].state = p->state;
-               p->stack[p->tos].sym = sym;
+               p->stack[p->tos] = p->next;
                p->asn_stack[p->tos] = asn;
                p->tos++;
-               p->state = newstate;
+               p->next.state = newstate;
+               p->next.indents = 0;
+               p->next.starts_indented = 0;
                return 1;
        }
 
 `pop` simply moves the top of stack (`tos`) back down the required amount
-and frees and `asn` entries that need to be freed.  It is called _after_ we
+and frees any `asn` entries that need to be freed.  It is called _after_ we
 reduce a production, just before we `shift` the nonterminal in.
 
-###### parser
+###### parser functions
 
        static void pop(struct parser *p, int num,
                        void(*do_free)(short sym, void *asn))
        {
                int i;
                p->tos -= num;
-               for (i = 0; i < num; i++)
+               for (i = 0; i < num; i++) {
+                       p->next.indents += p->stack[p->tos+i].indents;
                        do_free(p->stack[p->tos+i].sym,
                                p->asn_stack[p->tos+i]);
+               }
 
-               p->state = p->stack[p->tos].state;
+               if (num) {
+                       p->next.state = p->stack[p->tos].state;
+                       p->next.starts_indented = p->stack[p->tos].starts_indented;
+               }
        }
 
 ### Memory allocation
 
 The `scanner` returns tokens in a local variable - we want them in allocated
-memory so they can live in the `asn_stack`.  Similarly the `asn` produce by
+memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
 a reduce is in a large buffer.  Both of these require some allocation and
 copying, hence `memdup` and `tokcopy`.
 
 ###### parser includes
        #include <memory.h>
 
-###### parser
+###### parser functions
 
        void *memdup(void *m, int len)
        {
@@ -2348,75 +2391,127 @@ Now we have the parser.  If we can shift, we do.  If not and we can reduce
 we do.  If the production we reduced was production zero, then we have
 accepted the input and can finish.
 
+We return whatever `asn` was returned by reducing production zero.
+
 If we can neither shift nor reduce we have an error to handle.  We pop
-single entries off the stack until we can shift the `TK_error` symbol, the
+single entries off the stack until we can shift the `TK_error` symbol, then
 drop input tokens until we find one we can shift into the new error state.
 
-We return whatever `asn` was returned by reducing production zero.
+When we find `TK_in` and `TK_out` tokens which report indents we need
+to handle them directly as the grammar cannot express what we want to
+do with them.
+
+`TK_in` tokens are easy: we simply update the `next` stack frame to
+record how many indents there are and that the next token started with
+an indent.
+
+`TK_out` tokens must either be counted off against any pending indent,
+or must force reductions until there is a pending indent which isn't
+at the start of a production.
 
 ###### parser includes
        #include "parser.h"
-###### parser
+###### parser_run
        void *parser_run(struct token_state *tokens,
                         const struct state states[],
-                        int (*do_reduce)(int, int, void**, void*, FILE*),
+                        int (*do_reduce)(int, void**, void*),
                         void (*do_free)(short, void*),
-                        FILE *trace)
+                        FILE *trace, const char *non_term[], int knowns)
        {
                struct parser p = { 0 };
-               struct token *tk;
+               struct token *tk = NULL;
                int accepted = 0;
                void *ret;
 
-               tk = tok_copy(token_next(tokens));
                while (!accepted) {
-                       if (shift(&p, tk->num, tk, states)) {
-                               if (trace) {
-                                       fputs("Shift ", trace);
-                                       text_dump(trace, tk->txt, 20);
-                                       fputs("\n", trace);
-                               }
+                       struct token *err_tk;
+                       if (!tk)
                                tk = tok_copy(token_next(tokens));
+                       p.next.sym = tk->num;
+                       if (trace)
+                               parser_trace(trace, &p, tk, states, non_term, knowns);
+
+                       if (p.next.sym == TK_in) {
+                               p.next.starts_indented = 1;
+                               p.next.indents = 1;
+                               free(tk);
+                               tk = NULL;
+                               continue;
+                       }
+                       if (p.next.sym == TK_out) {
+                               if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
+                                   (p.stack[p.tos-1].indents == 1 &&
+                                    states[p.next.state].reduce_size > 1)) {
+                                       p.stack[p.tos-1].indents -= 1;
+                                       free(tk);
+                                       tk = NULL;
+                                       continue;
+                               }
+                               // fall through and force a REDUCE (as 'shift'
+                               // will fail).
+                       }
+                       if (shift(&p, tk, states)) {
+                               tk = NULL;
                                continue;
                        }
-                       if (states[p.state].reduce_prod >= 0) {
+                       if (states[p.next.state].reduce_prod >= 0) {
                                void **body;
-                               int prod = states[p.state].reduce_prod;
-                               int size = states[p.state].reduce_size;
-                               int sym = states[p.state].reduce_sym;
+                               int prod = states[p.next.state].reduce_prod;
+                               int size = states[p.next.state].reduce_size;
                                int bufsize;
                                static char buf[16*1024];
+                               p.next.sym = states[p.next.state].reduce_sym;
 
                                body = p.asn_stack +
-                                       (p.tos - states[p.state].reduce_size);
+                                       (p.tos - states[p.next.state].reduce_size);
 
-                               bufsize = do_reduce(prod, p.tos, body,
-                                                   buf, trace);
-                               if (trace)
-                                       fputs("\n", trace);
+                               bufsize = do_reduce(prod, body, buf);
 
                                pop(&p, size, do_free);
-                               shift(&p, sym, memdup(buf, bufsize), states);
+                               shift(&p, memdup(buf, bufsize), states);
                                if (prod == 0)
                                        accepted = 1;
                                continue;
                        }
-                       /* Error. we walk up the stack until we
+                       if (tk->num == TK_out) {
+                               // Indent problem - synthesise tokens to get us
+                               // out of here.
+                               fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
+                               p.next.sym = states[p.next.state].shift_sym;
+                               shift(&p, tok_copy(*tk), states);
+                               // FIXME need to report this error somehow
+                               continue;
+                       }
+                       /* Error. We walk up the stack until we
                         * find a state which will accept TK_error.
                         * We then shift in TK_error and see what state
                         * that takes us to.
                         * Then we discard input tokens until
                         * we find one that is acceptable.
                         */
-                       while (shift(&p, TK_error, tk, states) < 0
+
+                       err_tk = tok_copy(*tk);
+                       p.next.sym = TK_error;
+                       while (shift(&p, err_tk, states) == 0
                               && p.tos > 0)
                                // discard this state
                                pop(&p, 1, do_free);
-                       tk = tok_copy(*tk);
-                       while (search(&states[p.state], tk->num) < 0 &&
+                       if (p.tos == 0) {
+                               free(err_tk);
+                               // no state accepted TK_error
+                               break;
+                       }
+                       while (search(&states[p.next.state], tk->num) < 0 &&
                               tk->num != TK_eof) {
                                free(tk);
                                tk = tok_copy(token_next(tokens));
+                               if (tk->num == TK_in)
+                                       p.next.indents += 1;
+                               if (tk->num == TK_out) {
+                                       if (p.next.indents == 0)
+                                               break;
+                                       p.next.indents -= 1;
+                               }
                        }
                        if (p.tos == 0 && tk->num == TK_eof)
                                break;
@@ -2434,10 +2529,63 @@ We return whatever `asn` was returned by reducing production zero.
 ###### exported functions
        void *parser_run(struct token_state *tokens,
                         const struct state states[],
-                        int (*do_reduce)(int, int, void**, void*, FILE*),
+                        int (*do_reduce)(int, void**, void*),
                         void (*do_free)(short, void*),
-                        FILE *trace);
+                        FILE *trace, const char *non_term[], int knowns);
+
+### Tracing
 
+Being able to visualize the parser in action can be invaluable when
+debugging the parser code, or trying to understand how the parse of a
+particular grammar progresses.  The stack contains all the important
+state, so just printing out the stack every time around the parse loop
+can make it possible to see exactly what is happening.
+
+This doesn't explicitly show each SHIFT and REDUCE action.  However they
+are easily deduced from the change between consecutive lines, and the
+details of each state can be found by cross referencing the states list
+in the stack with the "report" that parsergen can generate.
+
+For terminal symbols, we just dump the token.  For non-terminals we
+print the name of the symbol.  The look-ahead token is reported at the
+end inside square brackets.
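+So a trace line reads as alternating state numbers (in parentheses) and
+the symbols shifted to reach them; with made-up state numbers it might
+look roughly like `(0) Expression (5) + (9) 12 (11) [NEWLINE]`.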
+
+###### parser functions
+
+       static char *reserved_words[] = {
+               [TK_error]        = "ERROR",
+               [TK_in]           = "IN",
+               [TK_out]          = "OUT",
+               [TK_newline]      = "NEWLINE",
+               [TK_eof]          = "$eof",
+       };
+       void parser_trace(FILE *trace, struct parser *p,
+                         struct token *tk, const struct state states[],
+                         const char *non_term[], int knowns)
+       {
+               int i;
+               for (i = 0; i < p->tos; i++) {
+                       int sym = p->stack[i].sym;
+                       fprintf(trace, "(%d) ", p->stack[i].state);
+                       if (sym < TK_reserved &&
+                           reserved_words[sym] != NULL)
+                               fputs(reserved_words[sym], trace);
+                       else if (sym < TK_reserved + knowns) {
+                               struct token *t = p->asn_stack[i];
+                               text_dump(trace, t->txt, 20);
+                       } else
+                               fputs(non_term[sym - TK_reserved - knowns],
+                                     trace);
+                       fputs(" ", trace);
+               }
+               fprintf(trace, "(%d) [", p->next.state);
+               if (tk->num < TK_reserved &&
+                   reserved_words[tk->num] != NULL)
+                       fputs(reserved_words[tk->num], trace);
+               else
+                       text_dump(trace, tk->txt, 20);
+               fputs("]\n", trace);
+       }
 
 # A Worked Example
 
@@ -2506,8 +2654,8 @@ something like this.
                struct token_config config = {
                        .ignored = (1 << TK_line_comment)
                                 | (1 << TK_block_comment)
-                                | (1 << TK_indent)
-                                | (1 << TK_undent),
+                                | (1 << TK_in)
+                                | (1 << TK_out),
                        .number_chars = ".,_+-",
                        .word_start = "",
                        .word_cont = "",
@@ -2523,27 +2671,27 @@ something like this.
        Session -> Session Line
                | Line
 
-       Line -> Expression NEWLINE ${ gmp_printf( "Answer = %Qd\n", $1.val);
+       Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
                                        { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
-                                       gmp_printf( "  or as a decimal: %Fg\n", fl);
+                                       gmp_printf("  or as a decimal: %Fg\n", fl);
                                        mpf_clear(fl);
                                        }
                                     }$
                | Expression = Expression NEWLINE ${
                        if (mpq_equal($1.val, $3.val))
-                               gmp_printf( "Both equal %Qd\n", $1.val);
+                               gmp_printf("Both equal %Qd\n", $1.val);
                        else {
-                               gmp_printf( "NOT EQUAL: %Qd\n      != : %Qd\n",
+                               gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
                                        $1.val, $3.val);
                                exit(1);
                        }
                }$
-               |  NEWLINE ${ printf( "Blank line\n"); }$
-               | ERROR NEWLINE ${ printf( "Skipped a bad line\n"); }$
+               | NEWLINE ${ printf("Blank line\n"); }$
+               | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
 
        $number
-       Expression -> Expression +  Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
-               | Expression -  Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
+       Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
+               | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
 
        Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
@@ -2551,4 +2699,4 @@ something like this.
                | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
 
        Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
-               | ( Expression  ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+               | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$