]> ocean-lang.org Git - ocean/blobdiff - csrc/parsergen.mdc
parsergen: add handling for TK_IN and TK_OUT
[ocean] / csrc / parsergen.mdc
index 7b6664d9cb250125a1f508e9c99402e9e49542cd..d2d70d531d163a67d1de05a1682d71573b37b404 100644 (file)
@@ -1754,8 +1754,8 @@ The table of nonterminals used for tracing is a similar array.
 
 ### States and the goto tables.
 
 
 ### States and the goto tables.
 
-For each state we record the goto table and the reducible production if
-there is one.
+For each state we record the goto table, the reducible production if
+there is one, or a symbol to shift for error recovery.
 Some of the details of the reducible production are stored in the
 `do_reduce` function to come later.  Here we store the production number,
 the body size (useful for stack management) and the resulting symbol (useful
 Some of the details of the reducible production are stored in the
 `do_reduce` function to come later.  Here we store the production number,
 the body size (useful for stack management) and the resulting symbol (useful
@@ -1776,6 +1776,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                short reduce_prod;
                short reduce_size;
                short reduce_sym;
                short reduce_prod;
                short reduce_size;
                short reduce_sym;
+               short shift_sym;
        };
 
 
        };
 
 
@@ -1806,24 +1807,39 @@ The go to table is stored in a simple array of `sym` and corresponding
                fprintf(f, "static const struct state states[] = {\n");
                for (i = 0; i < g->states; i++) {
                        struct itemset *is = g->statetab[i];
                fprintf(f, "static const struct state states[] = {\n");
                for (i = 0; i < g->states; i++) {
                        struct itemset *is = g->statetab[i];
-                       int j, prod = -1;
+                       int j, prod = -1, prod_len;
+                       int shift_sym = -1;
+                       int shift_len = 0, shift_remain = 0;
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                                int bp = item_index(itm);
                                struct production *pr = g->productions[p];
 
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                                int bp = item_index(itm);
                                struct production *pr = g->productions[p];
 
-                               if (bp < pr->body_size)
+                               if (bp < pr->body_size) {
+                                       if (shift_sym < 0 ||
+                                           (shift_len == bp && shift_remain > pr->body_size - bp)) {
+                                               shift_sym = pr->body[bp]->num;
+                                               shift_len = bp;
+                                               shift_remain = pr->body_size - bp;
+                                       }
                                        continue;
                                        continue;
+                               }
                                /* This is what we reduce */
                                /* This is what we reduce */
-                               prod = p;
-                               break;
+                               if (prod < 0 || prod_len < pr->body_size) {
+                                       prod = p;
+                                       prod_len = pr->body_size;
+                               }
                        }
 
                        }
 
-                       fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
-                               i, is->go_to.cnt, i, prod,
-                               prod < 0 ? -1 : g->productions[prod]->body_size,
-                               prod < 0 ? -1 : g->productions[prod]->head->num);
+                       if (prod >= 0)
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0 },\n",
+                                       i, is->go_to.cnt, i, prod,
+                                       g->productions[prod]->body_size,
+                                       g->productions[prod]->head->num);
+                       else
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d },\n",
+                                       i, is->go_to.cnt, i, shift_sym);
                }
                fprintf(f, "};\n\n");
        }
                }
                fprintf(f, "};\n\n");
        }
@@ -2244,21 +2260,30 @@ We keep the stack as two separate allocations.  One, `asn_stack` stores the
 production, and by keeping a separate `asn` stack, we can just pass a
 pointer into this stack.
 
 production, and by keeping a separate `asn` stack, we can just pass a
 pointer into this stack.
 
-The other allocation stores all other stack fields of which there are two.
+The other allocation stores all other stack fields of which there are four.
 The `state` is the most important one and guides the parsing process.  The
 `sym` is nearly unnecessary.  However when we want to free entries from the
 `asn_stack`, it helps to know what type they are so we can call the right
 freeing function.  The symbol leads us to the right free function through
 `do_free`.
 
 The `state` is the most important one and guides the parsing process.  The
 `sym` is nearly unnecessary.  However when we want to free entries from the
 `asn_stack`, it helps to know what type they are so we can call the right
 freeing function.  The symbol leads us to the right free function through
 `do_free`.
 
+The `indents` count and the `starts_indented` flag track the line
+indents in the symbol.  These are used to allow indent information to
+guide parsing and error recovery.
+
+As well as the stack of frames we have a `next` frame which is
+assembled from the incoming token and other information prior to
+pushing it onto the stack.
+
 ###### parser functions
 
        struct parser {
 ###### parser functions
 
        struct parser {
-               int state;
                struct frame {
                        short state;
                        short sym;
                struct frame {
                        short state;
                        short sym;
-               } *stack;
+                       short starts_indented;
+                       short indents;
+               } *stack, next;
                void **asn_stack;
                int stack_size;
                int tos;
                void **asn_stack;
                int stack_size;
                int tos;
@@ -2285,11 +2310,11 @@ if needed and pushed all the information onto the stacks.
 ###### parser functions
 
        static int shift(struct parser *p,
 ###### parser functions
 
        static int shift(struct parser *p,
-                        int sym, void *asn,
+                        void *asn,
                         const struct state states[])
        {
                // Push an entry onto the stack
                         const struct state states[])
        {
                // Push an entry onto the stack
-               int newstate = search(&states[p->state], sym);
+               int newstate = search(&states[p->next.state], p->next.sym);
                if (newstate < 0)
                        return 0;
                if (p->tos >= p->stack_size) {
                if (newstate < 0)
                        return 0;
                if (p->tos >= p->stack_size) {
@@ -2299,11 +2324,12 @@ if needed and pushed all the information onto the stacks.
                        p->asn_stack = realloc(p->asn_stack, p->stack_size
                                           * sizeof(p->asn_stack[0]));
                }
                        p->asn_stack = realloc(p->asn_stack, p->stack_size
                                           * sizeof(p->asn_stack[0]));
                }
-               p->stack[p->tos].state = p->state;
-               p->stack[p->tos].sym = sym;
+               p->stack[p->tos] = p->next;
                p->asn_stack[p->tos] = asn;
                p->tos++;
                p->asn_stack[p->tos] = asn;
                p->tos++;
-               p->state = newstate;
+               p->next.state = newstate;
+               p->next.indents = 0;
+               p->next.starts_indented = 0;
                return 1;
        }
 
                return 1;
        }
 
@@ -2318,11 +2344,16 @@ reduce a production, just before we `shift` the nonterminal in.
        {
                int i;
                p->tos -= num;
        {
                int i;
                p->tos -= num;
-               for (i = 0; i < num; i++)
+               for (i = 0; i < num; i++) {
+                       p->next.indents += p->stack[p->tos+i].indents;
                        do_free(p->stack[p->tos+i].sym,
                                p->asn_stack[p->tos+i]);
                        do_free(p->stack[p->tos+i].sym,
                                p->asn_stack[p->tos+i]);
+               }
 
 
-               p->state = p->stack[p->tos].state;
+               if (num) {
+                       p->next.state = p->stack[p->tos].state;
+                       p->next.starts_indented = p->stack[p->tos].starts_indented;
+               }
        }
 
 ### Memory allocation
        }
 
 ### Memory allocation
@@ -2366,6 +2397,17 @@ If we can neither shift nor reduce we have an error to handle.  We pop
 single entries off the stack until we can shift the `TK_error` symbol, then
 drop input tokens until we find one we can shift into the new error state.
 
 single entries off the stack until we can shift the `TK_error` symbol, then
 drop input tokens until we find one we can shift into the new error state.
 
+When we find `TK_in` and `TK_out` tokens which report indents we need
+to handle them directly as the grammar cannot express what we want to
+do with them.
+
+`TK_in` tokens are easy: we simply update the `next` stack frame to
+record how many indents there are and that the next token started with
+an indent.
+
+`TK_out` tokens must either be counted off against any pending indent,
+or must force reductions until there is a pending indent which isn't
+at the start of a production.
 
 ###### parser includes
        #include "parser.h"
 
 ###### parser includes
        #include "parser.h"
@@ -2377,38 +2419,69 @@ drop input tokens until we find one we can shift into the new error state.
                         FILE *trace, const char *non_term[], int knowns)
        {
                struct parser p = { 0 };
                         FILE *trace, const char *non_term[], int knowns)
        {
                struct parser p = { 0 };
-               struct token *tk;
+               struct token *tk = NULL;
                int accepted = 0;
                void *ret;
 
                int accepted = 0;
                void *ret;
 
-               tk = tok_copy(token_next(tokens));
                while (!accepted) {
                while (!accepted) {
+                       struct token *err_tk;
+                       if (!tk)
+                               tk = tok_copy(token_next(tokens));
+                       p.next.sym = tk->num;
                        if (trace)
                                parser_trace(trace, &p, tk, states, non_term, knowns);
 
                        if (trace)
                                parser_trace(trace, &p, tk, states, non_term, knowns);
 
-                       if (shift(&p, tk->num, tk, states)) {
-                               tk = tok_copy(token_next(tokens));
+                       if (p.next.sym == TK_in) {
+                               p.next.starts_indented = 1;
+                               p.next.indents = 1;
+                               free(tk);
+                               tk = NULL;
                                continue;
                        }
                                continue;
                        }
-                       if (states[p.state].reduce_prod >= 0) {
+                       if (p.next.sym == TK_out) {
+                               if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
+                                   (p.stack[p.tos-1].indents == 1 &&
+                                    states[p.next.state].reduce_size > 1)) {
+                                       p.stack[p.tos-1].indents -= 1;
+                                       free(tk);
+                                       tk = NULL;
+                                       continue;
+                               }
+                               // fall through and force a REDUCE (as 'shift'
+                               // will fail).
+                       }
+                       if (shift(&p, tk, states)) {
+                               tk = NULL;
+                               continue;
+                       }
+                       if (states[p.next.state].reduce_prod >= 0) {
                                void **body;
                                void **body;
-                               int prod = states[p.state].reduce_prod;
-                               int size = states[p.state].reduce_size;
-                               int sym = states[p.state].reduce_sym;
+                               int prod = states[p.next.state].reduce_prod;
+                               int size = states[p.next.state].reduce_size;
                                int bufsize;
                                static char buf[16*1024];
                                int bufsize;
                                static char buf[16*1024];
+                               p.next.sym = states[p.next.state].reduce_sym;
 
                                body = p.asn_stack +
 
                                body = p.asn_stack +
-                                       (p.tos - states[p.state].reduce_size);
+                                       (p.tos - states[p.next.state].reduce_size);
 
                                bufsize = do_reduce(prod, body, buf);
 
                                pop(&p, size, do_free);
 
                                bufsize = do_reduce(prod, body, buf);
 
                                pop(&p, size, do_free);
-                               shift(&p, sym, memdup(buf, bufsize), states);
+                               shift(&p, memdup(buf, bufsize), states);
                                if (prod == 0)
                                        accepted = 1;
                                continue;
                        }
                                if (prod == 0)
                                        accepted = 1;
                                continue;
                        }
+                       if (tk->num == TK_out) {
+                               // Indent problem - synthesise tokens to get us
+                               // out of here.
+                               fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
+                               p.next.sym = states[p.next.state].shift_sym;
+                               shift(&p, tok_copy(*tk), states);
+                               // FIXME need to report this error somehow
+                               continue;
+                       }
                        /* Error. We walk up the stack until we
                         * find a state which will accept TK_error.
                         * We then shift in TK_error and see what state
                        /* Error. We walk up the stack until we
                         * find a state which will accept TK_error.
                         * We then shift in TK_error and see what state
@@ -2416,15 +2489,29 @@ drop input tokens until we find one we can shift into the new error state.
                         * Then we discard input tokens until
                         * we find one that is acceptable.
                         */
                         * Then we discard input tokens until
                         * we find one that is acceptable.
                         */
-                       while (shift(&p, TK_error, tk, states) == 0
+
+                       err_tk = tok_copy(*tk);
+                       p.next.sym = TK_error;
+                       while (shift(&p, err_tk, states) == 0
                               && p.tos > 0)
                                // discard this state
                                pop(&p, 1, do_free);
                               && p.tos > 0)
                                // discard this state
                                pop(&p, 1, do_free);
-                       tk = tok_copy(*tk);
-                       while (search(&states[p.state], tk->num) < 0 &&
+                       if (p.tos == 0) {
+                               free(err_tk);
+                               // no state accepted TK_error
+                               break;
+                       }
+                       while (search(&states[p.next.state], tk->num) < 0 &&
                               tk->num != TK_eof) {
                                free(tk);
                                tk = tok_copy(token_next(tokens));
                               tk->num != TK_eof) {
                                free(tk);
                                tk = tok_copy(token_next(tokens));
+                               if (tk->num == TK_in)
+                                       p.next.indents += 1;
+                               if (tk->num == TK_out) {
+                                       if (p.next.indents == 0)
+                                               break;
+                                       p.next.indents -= 1;
+                               }
                        }
                        if (p.tos == 0 && tk->num == TK_eof)
                                break;
                        }
                        if (p.tos == 0 && tk->num == TK_eof)
                                break;
@@ -2491,7 +2578,7 @@ end inside square brackets.
                                      trace);
                        fputs(" ", trace);
                }
                                      trace);
                        fputs(" ", trace);
                }
-               fprintf(trace, "(%d) [", p->state);
+               fprintf(trace, "(%d) [", p->next.state);
                if (tk->num < TK_reserved &&
                    reserved_words[tk->num] != NULL)
                        fputs(reserved_words[tk->num], trace);
                if (tk->num < TK_reserved &&
                    reserved_words[tk->num] != NULL)
                        fputs(reserved_words[tk->num], trace);