]> ocean-lang.org Git - ocean/commitdiff
parsergen: various updates.
authorNeilBrown <neilb@suse.de>
Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)
committerNeilBrown <neilb@suse.de>
Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)
- add starts_line flag for symbols
- add starts_newline flag for stack frames

and related changes

Signed-off-by: NeilBrown <neil@brown.name>
csrc/parsergen.mdc

index 93065928492d0d164433fbce946e47e814f8c2d5..2aa296d9508d3ffe44e2d035f0c1898f68cb6db5 100644 (file)
@@ -869,7 +869,7 @@ changes happen.
                }
        }
 
-### Setting `can_eol`
+### Setting `can_eol` and `starts_line`
 
 In order to be able to ignore newline tokens when not relevant, but
 still include them in the parse when needed, we will need to know
@@ -891,6 +891,7 @@ compute it in a repetitive manner similar to `set_nullable`.
 
 ###### symbol fields
        int can_eol;
+       int starts_line;
 
 ###### functions
        static void set_can_eol(struct grammar *g)
@@ -920,6 +921,19 @@ compute it in a repetitive manner similar to `set_nullable`.
                }
        }
 
+       static void set_starts_line(struct grammar *g)
+       {
+               int p;
+               for (p = 0; p < g->production_count; p++) {
+                       struct production *pr = g->productions[p];
+                       int s;
+
+                       for (s = 0; s < pr->body_size - 1; s++)
+                               if (pr->body[s]->can_eol)
+                                       pr->body[s+1]->starts_line = 1;
+               }
+       }
+
 ### Building the `first` sets
 
 When calculating what can follow a particular non-terminal, we will need to
@@ -1198,7 +1212,7 @@ recalculated and their LA sets updated.
 them to a data structure, of freeing them.
 
        static int add_itemset(struct grammar *g, struct symset ss,
-                              enum grammar_type type, int starts_line)
+                              enum grammar_type type)
        {
                struct itemset **where, *is;
                int i;
@@ -1210,7 +1224,6 @@ them to a data structure, of freeing them.
                        is->items = ss;
                        is->next = *where;
                        is->go_to = INIT_DATASET;
-                       is->starts_line = starts_line;
                        *where = is;
                        return is->state;
                }
@@ -1252,6 +1265,8 @@ be supplemented by the LA set for the item which produce the new item.
 
 We also collect a set of all symbols which follow "DOT" (in `done`) as this
 is used in the next stage.
+If any of these symbols are flagged as starting a line, then this
+state must be a `starts_line` state so now is a good time to record that.
 
 NOTE: precedence handling should happen here - I haven't written this yet
 though.
@@ -1269,8 +1284,11 @@ though.
                if (bs == pr->body_size)
                        continue;
                s = pr->body[bs];
-               if (symset_find(&done, s->num) < 0)
+               if (symset_find(&done, s->num) < 0) {
                        symset_add(&done, s->num, 0);
+                       if (s->starts_line)
+                               is->starts_line = 1;
+               }
                if (s->type != Nonterminal)
                        continue;
                again = 1;
@@ -1324,15 +1342,11 @@ with a pre-existing itemset).
        for (i = 0; i < done.cnt; i++) {
                int j;
                unsigned short state;
-               int starts_line = 0;
                struct symbol *sym = g->symtab[done.syms[i]];
                struct symset newitemset = INIT_SYMSET;
                if (type >= LALR)
                        newitemset = INIT_DATASET;
 
-               if (sym->can_eol ||
-                   (sym->nullable && is->starts_line))
-                       starts_line = 1;
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
@@ -1365,7 +1379,7 @@ with a pre-existing itemset).
                                }
                        }
                }
-               state = add_itemset(g, newitemset, type, starts_line);
+               state = add_itemset(g, newitemset, type);
                if (symset_find(&is->go_to, done.syms[i]) < 0)
                        symset_add(&is->go_to, done.syms[i], state);
        }
@@ -1389,7 +1403,7 @@ with `TK_eof` as the LA set.
                }
                // production 0, offset 0 (with no data)
                symset_add(&first, item_num(0, 0), la);
-               add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
+               add_itemset(g, first, type);
                for (again = 0, is = g->items;
                     is;
                     is = is->next ?: again ? (again = 0, g->items) : NULL) {
@@ -1450,6 +1464,7 @@ changeover point in `first_nonterm`.
 
                set_nullable(g);
                set_can_eol(g);
+               set_starts_line(g);
                if (type >= SLR)
                        build_first(g);
 
@@ -1481,7 +1496,8 @@ all the tables that have been generated, plus a description of any conflicts.
 
 Firstly we have the complete list of symbols, together with the
 "FIRST" set if that was generated.  We add a mark to each symbol to
-show if it can end in a newline (`>`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it implies the start of a
+line (`<`), or if it is nullable (`.`).
 
 ###### functions
 
@@ -1498,9 +1514,10 @@ show if it can end in a newline (`>`), or if it is nullable (`.`).
                        if (!s)
                                continue;
 
-                       printf(" %c%c%3d%c: ",
+                       printf(" %c%c%c%3d%c: ",
                               s->nullable ? '.':' ',
                               s->can_eol ? '>':' ',
+                              s->starts_line ? '<':' ',
                               s->num, symtypes[s->type]);
                        prtxt(s->name);
                        if (s->precedence)
@@ -2415,10 +2432,14 @@ The `indents` count and the `starts_indented` flag track the line
 indents in the symbol.  These are used to allow indent information to
 guide parsing and error recovery.
 
+`since_newline` tracks how many stack frames since the last
+start-of-line (whether indented or not).  So if `since_newline` is
+zero, then this symbol is at the start of a line.
+
 `newline_permitted` keeps track of whether newlines should be ignored
 or not, and `starts_line` records if this state stated on a newline.
 
-The stack is more properly seen as alternating states and symbols -
+The stack is most properly seen as alternating states and symbols -
 states, like the 'DOT' in items, are between symbols.  Each frame in
 our stack holds a state and the symbol that was before it.  The
 bottom of stack holds the start state, but no symbol, as nothing came
@@ -2434,6 +2455,7 @@ before the beginning.
                        short sym;
                        short starts_indented;
                        short indents;
+                       short starts_newline;
                } *stack;
                void **asn_stack;
                int stack_size;
@@ -2462,6 +2484,15 @@ stack is empty, it always chooses zero as the next state.
 So `shift` finds the next state.  If that succeed it extends the allocations
 if needed and pushes all the information onto the stacks.
 
+Newlines are permitted after a starts_line state until an internal
+indent.  So we need to find the topmost state which `starts_line` and
+see if there are any indents other than immediately after it.
+
+So we walk down:
+
+-  if state starts_line, then newlines_permitted.
+-  if any non-initial indents, newlines not permitted
+
 ###### parser functions
 
        static int shift(struct parser *p, struct frame *next,
@@ -2486,8 +2517,10 @@ if needed and pushes all the information onto the stacks.
                next->newline_permitted = 0;
                if (p->tos)
                        next->newline_permitted =
-                               p->stack[p->tos-1].newline_permitted;
-               if (next->indents)
+                               (p->stack[p->tos-1].newline_permitted?:-1)+1;
+               if (next->indents > next->starts_indented)
+                       next->newline_permitted = 0;
+               if (next->indents && next->newline_permitted > 2)
                        next->newline_permitted = 0;
                if (states[newstate].starts_line)
                        next->newline_permitted = 1;
@@ -2512,7 +2545,10 @@ removed. It is called _after_ we reduce a production, just before we
        {
                int i;
                p->tos -= num;
-               next->starts_indented = p->stack[p->tos].starts_indented;
+               next->starts_indented =
+                       p->stack[p->tos].starts_indented;
+               next->starts_newline =
+                       p->stack[p->tos].starts_newline;
                next->indents = 0;
                for (i = 0; i < num; i++) {
                        next->indents += p->stack[p->tos+i].indents;
@@ -2594,6 +2630,7 @@ since the last state which could have been at the start of a line.
                int accepted = 0;
                void *ret = NULL;
 
+               next.starts_newline = 1;
                shift(&p, &next, NULL, states);
                while (!accepted) {
                        struct token *err_tk;
@@ -2606,6 +2643,7 @@ since the last state which could have been at the start of a line.
                        if (next.sym == TK_in) {
                                next.starts_indented = 1;
                                next.indents = 1;
+                               next.starts_newline = 1;
                                free(tk);
                                tk = NULL;
                                parser_trace_action(trace, "Record");
@@ -2621,7 +2659,7 @@ since the last state which could have been at the start of a line.
                                                if (states[tos->state].starts_line)
                                                        tos->newline_permitted = 1;
                                                else if (p.tos > 1)
-                                                       tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
+                                                       tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1;
                                        }
                                        free(tk);
                                        tk = NULL;
@@ -2640,9 +2678,11 @@ since the last state which could have been at the start of a line.
                                }
                        }
                        if (shift(&p, &next, tk, states)) {
-                               tk = NULL;
+                               next.starts_newline =
+                                       tk->num == TK_newline;
                                next.starts_indented = 0;
                                next.indents = 0;
+                               tk = NULL;
                                parser_trace_action(trace, "Shift");
                                continue;
                        }
@@ -2666,6 +2706,7 @@ since the last state which could have been at the start of a line.
                                else {
                                        frame.indents = next.indents;
                                        frame.starts_indented = frame.indents;
+                                       frame.starts_newline = 0;
                                        next.indents = 0;
                                        next.starts_indented = 0;
                                }
@@ -2774,7 +2815,7 @@ end inside square brackets.
                if (states[f->state].starts_line)
                        fprintf(trace, "s");
                if (f->newline_permitted)
-                       fprintf(trace, "n");
+                       fprintf(trace, "n%d", f->newline_permitted);
                fprintf(trace, ") ");
        }
 
@@ -2801,6 +2842,8 @@ end inside square brackets.
                                if (f->indents)
                                        fprintf(trace, "%c%d", f->starts_indented?':':'.',
                                                f->indents);
+                               if (f->starts_newline)
+                                       fputs("/", trace);
                                fputs(" ", trace);
                        }
                        parser_trace_state(trace, f, states);
@@ -2814,6 +2857,8 @@ end inside square brackets.
                if (n->indents)
                        fprintf(trace, "%c%d", n->starts_indented?':':'.',
                                n->indents);
+               if (n->starts_newline)
+                       fputs("/", trace);
                fputs("]", trace);
        }