parsergen: various updates.

author NeilBrown <neilb@suse.de>

Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)

committer NeilBrown <neilb@suse.de>

Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)
author NeilBrown <neilb@suse.de>
Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)
committer NeilBrown <neilb@suse.de>
Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 93065928492d0d164433fbce946e47e814f8c2d5..2aa296d9508d3ffe44e2d035f0c1898f68cb6db5 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -869,7 +869,7 @@ changes happen.
                 }
         }
  
-### Setting `can_eol`
+### Setting `can_eol` and `starts_line`
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
@@ -891,6 +891,7 @@ compute it in a repetitive manner similar to `set_nullable`.
  
  ###### symbol fields
         int can_eol;
+       int starts_line;
  
  ###### functions
         static void set_can_eol(struct grammar *g)
@@ -920,6 +921,19 @@ compute it in a repetitive manner similar to `set_nullable`.
                 }
         }
  
+       static void set_starts_line(struct grammar *g)
+       {
+               int p;
+               for (p = 0; p < g->production_count; p++) {
+                       struct production *pr = g->productions[p];
+                       int s;
+
+                       for (s = 0; s < pr->body_size - 1; s++)
+                               if (pr->body[s]->can_eol)
+                                       pr->body[s+1]->starts_line = 1;
+               }
+       }
+
  ### Building the `first` sets
  
  When calculating what can follow a particular non-terminal, we will need to
@@ -1198,7 +1212,7 @@ recalculated and their LA sets updated.
  them to a data structure, of freeing them.
  
         static int add_itemset(struct grammar *g, struct symset ss,
-                              enum grammar_type type, int starts_line)
+                              enum grammar_type type)
         {
                 struct itemset **where, *is;
                 int i;
@@ -1210,7 +1224,6 @@ them to a data structure, of freeing them.
                         is->items = ss;
                         is->next = *where;
                         is->go_to = INIT_DATASET;
-                       is->starts_line = starts_line;
                         *where = is;
                         return is->state;
                 }
@@ -1252,6 +1265,8 @@ be supplemented by the LA set for the item which produce the new item.
  
  We also collect a set of all symbols which follow "DOT" (in `done`) as this
  is used in the next stage.
+If any of these symbols are flagged as starting a line, then this
+state must be a `starts_line` state so now is a good time to record that.
  
  NOTE: precedence handling should happen here - I haven't written this yet
  though.
@@ -1269,8 +1284,11 @@ though.
                 if (bs == pr->body_size)
                         continue;
                 s = pr->body[bs];
-               if (symset_find(&done, s->num) < 0)
+               if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
+                       if (s->starts_line)
+                               is->starts_line = 1;
+               }
                 if (s->type != Nonterminal)
                         continue;
                 again = 1;
@@ -1324,15 +1342,11 @@ with a pre-existing itemset).
         for (i = 0; i < done.cnt; i++) {
                 int j;
                 unsigned short state;
-               int starts_line = 0;
                 struct symbol *sym = g->symtab[done.syms[i]];
                 struct symset newitemset = INIT_SYMSET;
                 if (type >= LALR)
                         newitemset = INIT_DATASET;
  
-               if (sym->can_eol ||
-                   (sym->nullable && is->starts_line))
-                       starts_line = 1;
                 for (j = 0; j < is->items.cnt; j++) {
                         int itm = is->items.syms[j];
                         int p = item_prod(itm);
@@ -1365,7 +1379,7 @@ with a pre-existing itemset).
                                 }
                         }
                 }
-               state = add_itemset(g, newitemset, type, starts_line);
+               state = add_itemset(g, newitemset, type);
                 if (symset_find(&is->go_to, done.syms[i]) < 0)
                         symset_add(&is->go_to, done.syms[i], state);
         }
@@ -1389,7 +1403,7 @@ with `TK_eof` as the LA set.
                 }
                 // production 0, offset 0 (with no data)
                 symset_add(&first, item_num(0, 0), la);
-               add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
+               add_itemset(g, first, type);
                 for (again = 0, is = g->items;
                      is;
                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
@@ -1450,6 +1464,7 @@ changeover point in `first_nonterm`.
  
                 set_nullable(g);
                 set_can_eol(g);
+               set_starts_line(g);
                 if (type >= SLR)
                         build_first(g);
  
@@ -1481,7 +1496,8 @@ all the tables that have been generated, plus a description of any conflicts.
  
  Firstly we have the complete list of symbols, together with the
  "FIRST" set if that was generated.  We add a mark to each symbol to
-show if it can end in a newline (`>`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it implies the start of a
+line (`<`), or if it is nullable (`.`).
  
  ###### functions
  
@@ -1498,9 +1514,10 @@ show if it can end in a newline (`>`), or if it is nullable (`.`).
                         if (!s)
                                 continue;
  
-                       printf(" %c%c%3d%c: ",
+                       printf(" %c%c%c%3d%c: ",
                                s->nullable ? '.':' ',
                                s->can_eol ? '>':' ',
+                              s->starts_line ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
                         if (s->precedence)
@@ -2415,10 +2432,14 @@ The `indents` count and the `starts_indented` flag track the line
  indents in the symbol.  These are used to allow indent information to
  guide parsing and error recovery.
  
+`since_newline` tracks how many stack frames since the last
+start-of-line (whether indented or not).  So if `since_newline` is
+zero, then this symbol is at the start of a line.
+
  `newline_permitted` keeps track of whether newlines should be ignored
  or not, and `starts_line` records if this state stated on a newline.
  
-The stack is more properly seen as alternating states and symbols -
+The stack is most properly seen as alternating states and symbols -
  states, like the 'DOT' in items, are between symbols.  Each frame in
  our stack holds a state and the symbol that was before it.  The
  bottom of stack holds the start state, but no symbol, as nothing came
@@ -2434,6 +2455,7 @@ before the beginning.
                         short sym;
                         short starts_indented;
                         short indents;
+                       short starts_newline;
                 } *stack;
                 void **asn_stack;
                 int stack_size;
@@ -2462,6 +2484,15 @@ stack is empty, it always chooses zero as the next state.
  So `shift` finds the next state.  If that succeed it extends the allocations
  if needed and pushes all the information onto the stacks.
  
+Newlines are permitted after a starts_line state until an internal
+indent.  So we need to find the topmost state which `starts_line` and
+see if there are any indents other than immediately after it.
+
+So we walk down:
+
+-  if state starts_line, then newlines_permitted.
+-  if any non-initial indents, newlines not permitted
+
  ###### parser functions
  
         static int shift(struct parser *p, struct frame *next,
@@ -2486,8 +2517,10 @@ if needed and pushes all the information onto the stacks.
                 next->newline_permitted = 0;
                 if (p->tos)
                         next->newline_permitted =
-                               p->stack[p->tos-1].newline_permitted;
-               if (next->indents)
+                               (p->stack[p->tos-1].newline_permitted?:-1)+1;
+               if (next->indents > next->starts_indented)
+                       next->newline_permitted = 0;
+               if (next->indents && next->newline_permitted > 2)
                         next->newline_permitted = 0;
                 if (states[newstate].starts_line)
                         next->newline_permitted = 1;
@@ -2512,7 +2545,10 @@ removed. It is called _after_ we reduce a production, just before we
         {
                 int i;
                 p->tos -= num;
-               next->starts_indented = p->stack[p->tos].starts_indented;
+               next->starts_indented =
+                       p->stack[p->tos].starts_indented;
+               next->starts_newline =
+                       p->stack[p->tos].starts_newline;
                 next->indents = 0;
                 for (i = 0; i < num; i++) {
                         next->indents += p->stack[p->tos+i].indents;
@@ -2594,6 +2630,7 @@ since the last state which could have been at the start of a line.
                 int accepted = 0;
                 void *ret = NULL;
  
+               next.starts_newline = 1;
                 shift(&p, &next, NULL, states);
                 while (!accepted) {
                         struct token *err_tk;
@@ -2606,6 +2643,7 @@ since the last state which could have been at the start of a line.
                         if (next.sym == TK_in) {
                                 next.starts_indented = 1;
                                 next.indents = 1;
+                               next.starts_newline = 1;
                                 free(tk);
                                 tk = NULL;
                                 parser_trace_action(trace, "Record");
@@ -2621,7 +2659,7 @@ since the last state which could have been at the start of a line.
                                                 if (states[tos->state].starts_line)
                                                         tos->newline_permitted = 1;
                                                 else if (p.tos > 1)
-                                                       tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
+                                                       tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1;
                                         }
                                         free(tk);
                                         tk = NULL;
@@ -2640,9 +2678,11 @@ since the last state which could have been at the start of a line.
                                 }
                         }
                         if (shift(&p, &next, tk, states)) {
-                               tk = NULL;
+                               next.starts_newline =
+                                       tk->num == TK_newline;
                                 next.starts_indented = 0;
                                 next.indents = 0;
+                               tk = NULL;
                                 parser_trace_action(trace, "Shift");
                                 continue;
                         }
@@ -2666,6 +2706,7 @@ since the last state which could have been at the start of a line.
                                 else {
                                         frame.indents = next.indents;
                                         frame.starts_indented = frame.indents;
+                                       frame.starts_newline = 0;
                                         next.indents = 0;
                                         next.starts_indented = 0;
                                 }
@@ -2774,7 +2815,7 @@ end inside square brackets.
                 if (states[f->state].starts_line)
                         fprintf(trace, "s");
                 if (f->newline_permitted)
-                       fprintf(trace, "n");
+                       fprintf(trace, "n%d", f->newline_permitted);
                 fprintf(trace, ") ");
         }
  
@@ -2801,6 +2842,8 @@ end inside square brackets.
                                 if (f->indents)
                                         fprintf(trace, "%c%d", f->starts_indented?':':'.',
                                                 f->indents);
+                               if (f->starts_newline)
+                                       fputs("/", trace);
                                 fputs(" ", trace);
                         }
                         parser_trace_state(trace, f, states);
@@ -2814,6 +2857,8 @@ end inside square brackets.
                 if (n->indents)
                         fprintf(trace, "%c%d", n->starts_indented?':':'.',
                                 n->indents);
+               if (n->starts_newline)
+                       fputs("/", trace);
                 fputs("]", trace);
         }
author	NeilBrown <neilb@suse.de>
	Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)
committer	NeilBrown <neilb@suse.de>
	Thu, 2 Oct 2014 10:42:04 +0000 (20:42 +1000)