]> ocean-lang.org Git - ocean/commitdiff
parsergen: fix up stack management
authorNeilBrown <neilb@suse.de>
Sun, 22 Jun 2014 05:12:41 +0000 (15:12 +1000)
committerNeilBrown <neilb@suse.de>
Sun, 22 Jun 2014 05:15:48 +0000 (15:15 +1000)
The stack has alternating states and symbols.  I had groups a state
with the following symbol as the first thing pushed is a state and the
next is a symbol.

It works much better to group the other ways.  First we push just state zero.
Then we push some symbol and the state which 'goto' leads to.

In particularly this keeps the 'shift' that happens after "reduce"
quite separate from the 'shift' that happens when the look-ahead is
shifted in.  Previous the post-reduce shift was stealing the indent
information that should have stayed in the look-ahead buffer.

Signed-off-by: NeilBrown <neilb@suse.de>
csrc/parsergen.mdc

index b94bd3bbc8ff823180d0030a07191c82f8f72462..19703bce82000b2ce642a29e9e6ad5c15bd91d3a 100644 (file)
@@ -2418,20 +2418,23 @@ guide parsing and error recovery.
 `newline_permitted` keeps track of whether newlines should be ignored
 or not, and `starts_line` records if this state stated on a newline.
 
-As well as the stack of frames we have a `next` frame which is
-assembled from the incoming token and other information prior to
-pushing it onto the stack.
+The stack is more properly seen as alternating states and symbols -
+states, like the 'DOT' in items, are between symbols.  Each frame in
+our stack holds a state and the symbol that was before it.  The
+bottom of stack holds the start state, but no symbol, as nothing came
+before the beginning.
 
 ###### parser functions
 
        struct parser {
                struct frame {
                        short state;
+                       short newline_permitted;
+
                        short sym;
                        short starts_indented;
                        short indents;
-                       short newline_permitted;
-               } *stack, next;
+               } *stack;
                void **asn_stack;
                int stack_size;
                int tos;
@@ -2445,24 +2448,31 @@ Shift applies not only to terminals but also to non-terminals.  When we
 reduce a production we will pop off entries corresponding to the body
 symbols, then push on an item for the head of the production.  This last is
 exactly the same process as shifting in a terminal so we use the same
-function for both.
+function for both.  In both cases we provide a stack frame which
+contains the symbol to shift and related indent information.
 
 To simplify other code we arrange for `shift` to fail if there is no `goto`
 state for the symbol.  This is useful in basic parsing due to our design
 that we shift when we can, and reduce when we cannot.  So the `shift`
 function reports if it could.
 
+`shift` is also used to push state zero onto the stack, so if the
+stack is empty, it always chooses zero as the next state.
+
 So `shift` finds the next state.  If that succeed it extends the allocations
 if needed and pushes all the information onto the stacks.
 
 ###### parser functions
 
-       static int shift(struct parser *p,
+       static int shift(struct parser *p, struct frame *next,
                         void *asn,
                         const struct state states[])
        {
                // Push an entry onto the stack
-               int newstate = search(&states[p->next.state], p->next.sym);
+               int newstate = p->tos
+                       ? search(&states[p->stack[p->tos-1].state],
+                                next->sym)
+                       : 0;
                if (newstate < 0)
                        return 0;
                if (p->tos >= p->stack_size) {
@@ -2472,42 +2482,43 @@ if needed and pushes all the information onto the stacks.
                        p->asn_stack = realloc(p->asn_stack, p->stack_size
                                           * sizeof(p->asn_stack[0]));
                }
-               p->stack[p->tos] = p->next;
+               next->state = newstate;
+               next->newline_permitted = 0;
+               if (p->tos)
+                       next->newline_permitted =
+                               p->stack[p->tos-1].newline_permitted;
+               if (next->indents)
+                       next->newline_permitted = 0;
+               if (states[newstate].starts_line)
+                       next->newline_permitted = 1;
+               p->stack[p->tos] = *next;
                p->asn_stack[p->tos] = asn;
                p->tos++;
-               p->next.state = newstate;
-               p->next.indents = 0;
-               p->next.starts_indented = 0;
-               // if new state doesn't start a line, we inherit newline_permitted status
-               if (states[newstate].starts_line)
-                       p->next.newline_permitted = 1;
                return 1;
        }
 
-`pop` simply moves the top of stack (`tos`) back down the required amount
-and frees any `asn` entries that need to be freed.  It is called _after_ we
-reduce a production, just before we `shift` the nonterminal in.
+`pop` primarily moves the top of stack (`tos`) back down the required
+amount and frees any `asn` entries that need to be freed.  It also
+collects a summary of the indents in the symbols that are being
+removed. It is called _after_ we reduce a production, just before we
+`shift` the nonterminal in.
+
+`pop` is only called if there are entries to remove, so `num` is never zero.
 
 ###### parser functions
 
-       static void pop(struct parser *p, int num,
+       static void pop(struct parser *p, int num, struct frame *next,
                        void(*do_free)(short sym, void *asn))
        {
                int i;
                p->tos -= num;
+               next->starts_indented = p->stack[p->tos].starts_indented;
+               next->indents = 0;
                for (i = 0; i < num; i++) {
-                       p->next.indents += p->stack[p->tos+i].indents;
+                       next->indents += p->stack[p->tos+i].indents;
                        do_free(p->stack[p->tos+i].sym,
                                p->asn_stack[p->tos+i]);
                }
-
-               if (num) {
-                       p->next.state = p->stack[p->tos].state;
-                       p->next.starts_indented = p->stack[p->tos].starts_indented;
-                       p->next.newline_permitted = p->stack[p->tos].newline_permitted;
-                       if (p->next.indents > p->next.starts_indented)
-                               p->next.newline_permitted = 0;
-               }
        }
 
 ### Memory allocation
@@ -2541,8 +2552,9 @@ copying, hence `memdup` and `tokcopy`.
 
 ### The heart of the parser.
 
-Now we have the parser.  If we can shift, we do.  If not and we can reduce
-we do.  If the production we reduced was production zero, then we have
+Now we have the parser.  If we can shift, we do, though newlines and
+reducing indenting may block that.  If not and we can reduce we do.
+If the production we reduced was production zero, then we have
 accepted the input and can finish.
 
 We return whatever `asn` was returned by reducing production zero.
@@ -2577,37 +2589,39 @@ since the last state which could have been at the start of a line.
                         struct token_config *config)
        {
                struct parser p = { 0 };
+               struct frame next = { 0 };
                struct token *tk = NULL;
                int accepted = 0;
                void *ret = NULL;
 
-               p.next.newline_permitted = states[0].starts_line;
+               shift(&p, &next, NULL, states);
                while (!accepted) {
                        struct token *err_tk;
+                       struct frame *tos = &p.stack[p.tos-1];
                        if (!tk)
                                tk = tok_copy(token_next(tokens));
-                       p.next.sym = tk->num;
+                       next.sym = tk->num;
                        if (trace)
-                               parser_trace(trace, &p, tk, states, non_term, config->known_count);
+                               parser_trace(trace, &p, &next, tk, states, non_term, config->known_count);
 
-                       if (p.next.sym == TK_in) {
-                               p.next.starts_indented = 1;
-                               p.next.indents = 1;
+                       if (next.sym == TK_in) {
+                               next.starts_indented = 1;
+                               next.indents = 1;
                                free(tk);
                                tk = NULL;
                                continue;
                        }
-                       if (p.next.sym == TK_out) {
-                               if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
-                                   (p.stack[p.tos-1].indents == 1 &&
-                                    states[p.next.state].reduce_size != 1)) {
-                                       p.stack[p.tos-1].indents -= 1;
-                                       if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
+                       if (next.sym == TK_out) {
+                               if (tos->indents > tos->starts_indented ||
+                                   (tos->indents == 1 &&
+                                    states[tos->state].reduce_size != 1)) {
+                                       tos->indents -= 1;
+                                       if (tos->indents <= tos->starts_indented) {
                                                // no internal indent any more, reassess 'newline_permitted'
-                                               if (states[p.stack[p.tos-1].state].starts_line)
-                                                       p.stack[p.tos-1].newline_permitted = 1;
+                                               if (states[tos->state].starts_line)
+                                                       tos->newline_permitted = 1;
                                                else if (p.tos > 1)
-                                                       p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
+                                                       tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
                                        }
                                        free(tk);
                                        tk = NULL;
@@ -2616,35 +2630,45 @@ since the last state which could have been at the start of a line.
                                // fall through and force a REDUCE (as 'shift'
                                // will fail).
                        }
-                       if (p.next.sym == TK_newline) {
-                               if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
+                       if (next.sym == TK_newline) {
+                               if (! tos->newline_permitted) {
                                        free(tk);
                                        tk = NULL;
                                        continue;
                                }
                        }
-                       if (shift(&p, tk, states)) {
+                       if (shift(&p, &next, tk, states)) {
                                tk = NULL;
+                               next.starts_indented = 0;
+                               next.indents = 0;
                                continue;
                        }
-                       if (states[p.next.state].reduce_prod >= 0) {
+                       if (states[tos->state].reduce_prod >= 0) {
                                void **body;
                                void *res;
-                               int prod = states[p.next.state].reduce_prod;
-                               int size = states[p.next.state].reduce_size;
+                               const struct state *nextstate = &states[tos->state];
+                               int prod = nextstate->reduce_prod;
+                               int size = nextstate->reduce_size;
                                int bufsize;
                                static char buf[16*1024];
-                               p.next.sym = states[p.next.state].reduce_sym;
+                               struct frame frame;
+                               frame.sym = nextstate->reduce_sym;
 
-                               body = p.asn_stack +
-                                       (p.tos - states[p.next.state].reduce_size);
+                               body = p.asn_stack + (p.tos - size);
 
                                bufsize = do_reduce(prod, body, config, buf);
 
-                               pop(&p, size, do_free);
+                               if (size)
+                                       pop(&p, size, &frame, do_free);
+                               else {
+                                       frame.indents = next.indents;
+                                       frame.starts_indented = frame.indents;
+                                       next.indents = 0;
+                                       next.starts_indented = 0;
+                               }
                                res = memdup(buf, bufsize);
                                memset(buf, 0, bufsize);
-                               if (!shift(&p, res, states)) {
+                               if (!shift(&p, &frame, res, states)) {
                                        if (prod != 0) abort();
                                        accepted = 1;
                                        ret = res;
@@ -2654,9 +2678,10 @@ since the last state which could have been at the start of a line.
                        if (tk->num == TK_out) {
                                // Indent problem - synthesise tokens to get us
                                // out of here.
-                               fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
-                               p.next.sym = states[p.next.state].shift_sym;
-                               shift(&p, tok_copy(*tk), states);
+                               struct frame frame = { 0 };
+                               fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
+                               frame.sym = states[tos->state].shift_sym;
+                               shift(&p, &frame, tok_copy(*tk), states);
                                // FIXME need to report this error somehow
                                continue;
                        }
@@ -2669,33 +2694,35 @@ since the last state which could have been at the start of a line.
                         */
 
                        err_tk = tok_copy(*tk);
-                       p.next.sym = TK_error;
-                       while (shift(&p, err_tk, states) == 0
+                       next.sym = TK_error;
+                       while (shift(&p, &next, err_tk, states) == 0
                               && p.tos > 0)
                                // discard this state
-                               pop(&p, 1, do_free);
+                               pop(&p, 1, &next, do_free);
                        if (p.tos == 0) {
                                free(err_tk);
                                // no state accepted TK_error
                                break;
                        }
-                       while (search(&states[p.next.state], tk->num) < 0 &&
+                       tos = &p.stack[p.tos-1];
+                       while (search(&states[tos->state], tk->num) < 0 &&
                               tk->num != TK_eof) {
                                free(tk);
                                tk = tok_copy(token_next(tokens));
                                if (tk->num == TK_in)
-                                       p.next.indents += 1;
+                                       next.indents += 1;
                                if (tk->num == TK_out) {
-                                       if (p.next.indents == 0)
+                                       if (next.indents == 0)
                                                break;
-                                       p.next.indents -= 1;
+                                       next.indents -= 1;
                                }
                        }
                        if (p.tos == 0 && tk->num == TK_eof)
                                break;
                }
                free(tk);
-               pop(&p, p.tos, do_free);
+               if (p.tos)
+                       pop(&p, p.tos, &next, do_free);
                free(p.asn_stack);
                free(p.stack);
                return ret;
@@ -2738,9 +2765,6 @@ end inside square brackets.
        static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
        {
                fprintf(trace, "(%d", f->state);
-               if (f->indents)
-                       fprintf(trace, "%c%d", f->starts_indented?':':'.',
-                               f->indents);
                if (states[f->state].starts_line)
                        fprintf(trace, "s");
                if (f->newline_permitted)
@@ -2748,32 +2772,40 @@ end inside square brackets.
                fprintf(trace, ") ");
        }
 
-       void parser_trace(FILE *trace, struct parser *p,
+       void parser_trace(FILE *trace, struct parser *p, struct frame *n,
                          struct token *tk, const struct state states[],
                          const char *non_term[], int knowns)
        {
                int i;
                for (i = 0; i < p->tos; i++) {
-                       int sym = p->stack[i].sym;
-                       parser_trace_state(trace, &p->stack[i], states);
-                       if (sym < TK_reserved &&
-                           reserved_words[sym] != NULL)
-                               fputs(reserved_words[sym], trace);
-                       else if (sym < TK_reserved + knowns) {
-                               struct token *t = p->asn_stack[i];
-                               text_dump(trace, t->txt, 20);
-                       } else
-                               fputs(non_term[sym - TK_reserved - knowns],
-                                     trace);
-                       fputs(" ", trace);
+                       struct frame *f = &p->stack[i];
+                       if (i) {
+                               int sym = f->sym;
+                               if (sym < TK_reserved &&
+                                   reserved_words[sym] != NULL)
+                                       fputs(reserved_words[sym], trace);
+                               else if (sym < TK_reserved + knowns) {
+                                       struct token *t = p->asn_stack[i];
+                                       text_dump(trace, t->txt, 20);
+                               } else
+                                       fputs(non_term[sym - TK_reserved - knowns],
+                                             trace);
+                               if (f->indents)
+                                       fprintf(trace, "%c%d", f->starts_indented?':':'.',
+                                               f->indents);
+                               fputs(" ", trace);
+                       }
+                       parser_trace_state(trace, f, states);
                }
-               parser_trace_state(trace, &p->next, states);
-               fprintf(trace, " [");
+               fprintf(trace, "[");
                if (tk->num < TK_reserved &&
                    reserved_words[tk->num] != NULL)
                        fputs(reserved_words[tk->num], trace);
                else
                        text_dump(trace, tk->txt, 20);
+               if (n->indents)
+                       fprintf(trace, "%c%d", n->starts_indented?':':'.',
+                               n->indents);
                fputs("]\n", trace);
        }