parsergen: calculate and record "min_prefix" for each state.

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 93065928492d0d164433fbce946e47e814f8c2d5..45fb09e5228f26ce3240a8daa46f1d6d092ffa7d 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -869,28 +869,32 @@ changes happen.
                 }
         }
  
-### Setting `can_eol`
+### Setting `can_eol` and `line_like`
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
  which states can start a "line-like" section of code.  We ignore
  newlines when there is an indent since the most recent start of a
-line-like section.
+line-like symbol.
  
-To know what is line-like, we first need to know which symbols can end
-a line-like section, which is precisely those which can end with a
-newline token.  These symbols don't necessarily alway end with a
-newline, but they can.  Hence they are not described as "lines" but
-only "line-like".
+To know which symbols are line-like, we first need to know which
+symbols start with a NEWLINE token.  Any symbol which is followed by a
+NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
+Certainly when trying to parse one of these we must take not of NEWLINEs.
  
-Clearly the `TK_newline` token can end with a newline.  Any symbol
-which is the head of a production that contains a line-ending symbol
-followed only by nullable symbols is also a line-ending symbol.  We
-use a new field `can_eol` to record this attribute of symbols, and
-compute it in a repetitive manner similar to `set_nullable`.
+Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
+which is the head of a production that contains a starts-with-NEWLINE
+symbol preceeded only by nullable symbols is also a
+starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
+this attribute of symbols, and compute it in a repetitive manner
+similar to `set_nullable`.
+
+Once we have that, we can determine which symbols are `line_like` be
+seeing which are followed by a `can_eol` symbol in any production.
  
  ###### symbol fields
         int can_eol;
+       int line_like;
  
  ###### functions
         static void set_can_eol(struct grammar *g)
@@ -907,7 +911,7 @@ compute it in a repetitive manner similar to `set_nullable`.
                                 if (pr->head->can_eol)
                                         continue;
  
-                               for (s = pr->body_size - 1; s >= 0; s--) {
+                               for (s = 0 ; s < pr->body_size; s++) {
                                         if (pr->body[s]->can_eol) {
                                                 pr->head->can_eol = 1;
                                                 check_again = 1;
@@ -920,6 +924,19 @@ compute it in a repetitive manner similar to `set_nullable`.
                 }
         }
  
+       static void set_line_like(struct grammar *g)
+       {
+               int p;
+               for (p = 0; p < g->production_count; p++) {
+                       struct production *pr = g->productions[p];
+                       int s;
+
+                       for (s = 1; s < pr->body_size; s++)
+                               if (pr->body[s]->can_eol)
+                                       pr->body[s-1]->line_like = 1;
+               }
+       }
+
  ### Building the `first` sets
  
  When calculating what can follow a particular non-terminal, we will need to
@@ -1161,6 +1178,7 @@ should happen, so we don't need to search a second time.
                 struct symset go_to;
                 char completed;
                 char starts_line;
+               int min_prefix;
         };
  
  ###### grammar fields
@@ -1198,7 +1216,7 @@ recalculated and their LA sets updated.
  them to a data structure, of freeing them.
  
         static int add_itemset(struct grammar *g, struct symset ss,
-                              enum grammar_type type, int starts_line)
+                              enum grammar_type type)
         {
                 struct itemset **where, *is;
                 int i;
@@ -1210,7 +1228,6 @@ them to a data structure, of freeing them.
                         is->items = ss;
                         is->next = *where;
                         is->go_to = INIT_DATASET;
-                       is->starts_line = starts_line;
                         *where = is;
                         return is->state;
                 }
@@ -1252,6 +1269,8 @@ be supplemented by the LA set for the item which produce the new item.
  
  We also collect a set of all symbols which follow "DOT" (in `done`) as this
  is used in the next stage.
+If any of these symbols are flagged as starting a line, then this
+state must be a `starts_line` state so now is a good time to record that.
  
  NOTE: precedence handling should happen here - I haven't written this yet
  though.
@@ -1266,11 +1285,17 @@ though.
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
  
+               if (is->min_prefix == 0 ||
+                   (bs > 0 && bs < is->min_prefix))
+                       is->min_prefix = bs;
                 if (bs == pr->body_size)
                         continue;
                 s = pr->body[bs];
-               if (symset_find(&done, s->num) < 0)
+               if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
+                       if (s->line_like)
+                               is->starts_line = 1;
+               }
                 if (s->type != Nonterminal)
                         continue;
                 again = 1;
@@ -1324,15 +1349,11 @@ with a pre-existing itemset).
         for (i = 0; i < done.cnt; i++) {
                 int j;
                 unsigned short state;
-               int starts_line = 0;
                 struct symbol *sym = g->symtab[done.syms[i]];
                 struct symset newitemset = INIT_SYMSET;
                 if (type >= LALR)
                         newitemset = INIT_DATASET;
  
-               if (sym->can_eol ||
-                   (sym->nullable && is->starts_line))
-                       starts_line = 1;
                 for (j = 0; j < is->items.cnt; j++) {
                         int itm = is->items.syms[j];
                         int p = item_prod(itm);
@@ -1365,7 +1386,7 @@ with a pre-existing itemset).
                                 }
                         }
                 }
-               state = add_itemset(g, newitemset, type, starts_line);
+               state = add_itemset(g, newitemset, type);
                 if (symset_find(&is->go_to, done.syms[i]) < 0)
                         symset_add(&is->go_to, done.syms[i], state);
         }
@@ -1389,7 +1410,7 @@ with `TK_eof` as the LA set.
                 }
                 // production 0, offset 0 (with no data)
                 symset_add(&first, item_num(0, 0), la);
-               add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
+               add_itemset(g, first, type);
                 for (again = 0, is = g->items;
                      is;
                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
@@ -1450,6 +1471,7 @@ changeover point in `first_nonterm`.
  
                 set_nullable(g);
                 set_can_eol(g);
+               set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
  
@@ -1481,7 +1503,8 @@ all the tables that have been generated, plus a description of any conflicts.
  
  Firstly we have the complete list of symbols, together with the
  "FIRST" set if that was generated.  We add a mark to each symbol to
-show if it can end in a newline (`>`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it implies the start of a
+line (`<`), or if it is nullable (`.`).
  
  ###### functions
  
@@ -1498,9 +1521,10 @@ show if it can end in a newline (`>`), or if it is nullable (`.`).
                         if (!s)
                                 continue;
  
-                       printf(" %c%c%3d%c: ",
+                       printf(" %c%c%c%3d%c: ",
                                s->nullable ? '.':' ',
                                s->can_eol ? '>':' ',
+                              s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
                         if (s->precedence)
@@ -1609,7 +1633,8 @@ Now we can report all the item sets complete with items, LA sets, and GO TO.
                 for (s = 0; s < g->states; s++) {
                         int j;
                         struct itemset *is = g->statetab[s];
-                       printf("  Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
+                       printf("  Itemset %d:%s min prefix=%d\n",
+                              s, is->starts_line?" (startsline)":"", is->min_prefix);
                         for (j = 0; j < is->items.cnt; j++) {
                                 report_item(g, is->items.syms[j]);
                                 if (is->items.data != NO_DATA)
@@ -1873,6 +1898,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short reduce_sym;
                 short shift_sym;
                 short starts_line;
+               short min_prefix;
         };
  
  
@@ -1929,15 +1955,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                         }
  
                         if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
                                         i, is->go_to.cnt, i, prod,
                                         g->productions[prod]->body_size,
                                         g->productions[prod]->head->num,
-                                       is->starts_line);
+                                       is->starts_line, is->min_prefix);
                         else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
                                         i, is->go_to.cnt, i, shift_sym,
-                                       is->starts_line);
+                                       is->starts_line, is->min_prefix);
                 }
                 fprintf(f, "};\n\n");
         }
@@ -2415,10 +2441,14 @@ The `indents` count and the `starts_indented` flag track the line
  indents in the symbol.  These are used to allow indent information to
  guide parsing and error recovery.
  
+`since_newline` tracks how many stack frames since the last
+start-of-line (whether indented or not).  So if `since_newline` is
+zero, then this symbol is at the start of a line.
+
  `newline_permitted` keeps track of whether newlines should be ignored
  or not, and `starts_line` records if this state stated on a newline.
  
-The stack is more properly seen as alternating states and symbols -
+The stack is most properly seen as alternating states and symbols -
  states, like the 'DOT' in items, are between symbols.  Each frame in
  our stack holds a state and the symbol that was before it.  The
  bottom of stack holds the start state, but no symbol, as nothing came
@@ -2434,6 +2464,7 @@ before the beginning.
                         short sym;
                         short starts_indented;
                         short indents;
+                       short since_newline;
                 } *stack;
                 void **asn_stack;
                 int stack_size;
@@ -2462,6 +2493,15 @@ stack is empty, it always chooses zero as the next state.
  So `shift` finds the next state.  If that succeed it extends the allocations
  if needed and pushes all the information onto the stacks.
  
+Newlines are permitted after a starts_line state until an internal
+indent.  So we need to find the topmost state which `starts_line` and
+see if there are any indents other than immediately after it.
+
+So we walk down:
+
+-  if state starts_line, then newlines_permitted.
+-  if any non-initial indents, newlines not permitted
+
  ###### parser functions
  
         static int shift(struct parser *p, struct frame *next,
@@ -2483,14 +2523,22 @@ if needed and pushes all the information onto the stacks.
                                            * sizeof(p->asn_stack[0]));
                 }
                 next->state = newstate;
-               next->newline_permitted = 0;
-               if (p->tos)
+               if (states[newstate].starts_line)
+                       next->newline_permitted = 1;
+               else if (next->indents)
+                       next->newline_permitted = 0;
+               else if (p->tos)
                         next->newline_permitted =
                                 p->stack[p->tos-1].newline_permitted;
-               if (next->indents)
+               else
                         next->newline_permitted = 0;
-               if (states[newstate].starts_line)
-                       next->newline_permitted = 1;
+
+               if (next->since_newline) {
+                       if (p->tos)
+                               next->since_newline = p->stack[p->tos-1].since_newline + 1;
+                       else
+                               next->since_newline = 1;
+               }
                 p->stack[p->tos] = *next;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
@@ -2512,7 +2560,10 @@ removed. It is called _after_ we reduce a production, just before we
         {
                 int i;
                 p->tos -= num;
-               next->starts_indented = p->stack[p->tos].starts_indented;
+               next->starts_indented =
+                       p->stack[p->tos].starts_indented;
+               next->since_newline =
+                       p->stack[p->tos].since_newline;
                 next->indents = 0;
                 for (i = 0; i < num; i++) {
                         next->indents += p->stack[p->tos+i].indents;
@@ -2606,6 +2657,7 @@ since the last state which could have been at the start of a line.
                         if (next.sym == TK_in) {
                                 next.starts_indented = 1;
                                 next.indents = 1;
+                               next.since_newline = 0;
                                 free(tk);
                                 tk = NULL;
                                 parser_trace_action(trace, "Record");
@@ -2622,6 +2674,8 @@ since the last state which could have been at the start of a line.
                                                         tos->newline_permitted = 1;
                                                 else if (p.tos > 1)
                                                         tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
+                                               else
+                                                       tos->newline_permitted = 0;
                                         }
                                         free(tk);
                                         tk = NULL;
@@ -2632,20 +2686,25 @@ since the last state which could have been at the start of a line.
                                 // will fail).
                         }
                         if (next.sym == TK_newline) {
-                               if (! tos->newline_permitted) {
+                               if (!tos->newline_permitted) {
                                         free(tk);
                                         tk = NULL;
                                         parser_trace_action(trace, "Discard");
                                         continue;
                                 }
+                               if (states[tos->state].reduce_size > 0 &&
+                                   states[tos->state].reduce_size < tos->since_newline)
+                                       goto force_reduce;
                         }
                         if (shift(&p, &next, tk, states)) {
-                               tk = NULL;
+                               next.since_newline = !(tk->num == TK_newline);
                                 next.starts_indented = 0;
                                 next.indents = 0;
+                               tk = NULL;
                                 parser_trace_action(trace, "Shift");
                                 continue;
                         }
+               force_reduce:
                         if (states[tos->state].reduce_prod >= 0) {
                                 void **body;
                                 void *res;
@@ -2666,6 +2725,7 @@ since the last state which could have been at the start of a line.
                                 else {
                                         frame.indents = next.indents;
                                         frame.starts_indented = frame.indents;
+                                       frame.since_newline = 1;
                                         next.indents = 0;
                                         next.starts_indented = 0;
                                 }
@@ -2685,6 +2745,7 @@ since the last state which could have been at the start of a line.
                                 struct frame frame = { 0 };
                                 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
                                 frame.sym = states[tos->state].shift_sym;
+                               frame.since_newline = 1;
                                 shift(&p, &frame, tok_copy(*tk), states);
                                 // FIXME need to report this error somehow
                                 parser_trace_action(trace, "Synthesize");
@@ -2774,7 +2835,7 @@ end inside square brackets.
                 if (states[f->state].starts_line)
                         fprintf(trace, "s");
                 if (f->newline_permitted)
-                       fprintf(trace, "n");
+                       fprintf(trace, "n%d", f->newline_permitted);
                 fprintf(trace, ") ");
         }
  
@@ -2801,6 +2862,8 @@ end inside square brackets.
                                 if (f->indents)
                                         fprintf(trace, "%c%d", f->starts_indented?':':'.',
                                                 f->indents);
+                               if (f->since_newline == 0)
+                                       fputs("/", trace);
                                 fputs(" ", trace);
                         }
                         parser_trace_state(trace, f, states);
@@ -2814,6 +2877,8 @@ end inside square brackets.
                 if (n->indents)
                         fprintf(trace, "%c%d", n->starts_indented?':':'.',
                                 n->indents);
+               if (n->since_newline == 0)
+                       fputs("/", trace);
                 fputs("]", trace);
         }