parsergen: calculate and record "min_prefix" for each state.

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 497b3cb12261d4e84d0d2949e3b84b24fab6a0cc..45fb09e5228f26ce3240a8daa46f1d6d092ffa7d 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -869,29 +869,32 @@ changes happen.
                 }
         }
  
-### Setting `can_eol` and `starts_line`
+### Setting `can_eol` and `line_like`
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
  which states can start a "line-like" section of code.  We ignore
  newlines when there is an indent since the most recent start of a
-line-like section.
+line-like symbol.
  
-To know what is line-like, we first need to know which symbols can end
-a line-like section, which is precisely those which can end with a
-newline token.  These symbols don't necessarily alway end with a
-newline, but they can.  Hence they are not described as "lines" but
-only "line-like".
+To know which symbols are line-like, we first need to know which
+symbols start with a NEWLINE token.  Any symbol which is followed by a
+NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
+Certainly when trying to parse one of these we must take not of NEWLINEs.
  
-Clearly the `TK_newline` token can end with a newline.  Any symbol
-which is the head of a production that contains a line-ending symbol
-followed only by nullable symbols is also a line-ending symbol.  We
-use a new field `can_eol` to record this attribute of symbols, and
-compute it in a repetitive manner similar to `set_nullable`.
+Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
+which is the head of a production that contains a starts-with-NEWLINE
+symbol preceeded only by nullable symbols is also a
+starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
+this attribute of symbols, and compute it in a repetitive manner
+similar to `set_nullable`.
+
+Once we have that, we can determine which symbols are `line_like` be
+seeing which are followed by a `can_eol` symbol in any production.
  
  ###### symbol fields
         int can_eol;
-       int starts_line;
+       int line_like;
  
  ###### functions
         static void set_can_eol(struct grammar *g)
@@ -908,7 +911,7 @@ compute it in a repetitive manner similar to `set_nullable`.
                                 if (pr->head->can_eol)
                                         continue;
  
-                               for (s = pr->body_size - 1; s >= 0; s--) {
+                               for (s = 0 ; s < pr->body_size; s++) {
                                         if (pr->body[s]->can_eol) {
                                                 pr->head->can_eol = 1;
                                                 check_again = 1;
@@ -921,16 +924,16 @@ compute it in a repetitive manner similar to `set_nullable`.
                 }
         }
  
-       static void set_starts_line(struct grammar *g)
+       static void set_line_like(struct grammar *g)
         {
                 int p;
                 for (p = 0; p < g->production_count; p++) {
                         struct production *pr = g->productions[p];
                         int s;
  
-                       for (s = 0; s < pr->body_size - 1; s++)
+                       for (s = 1; s < pr->body_size; s++)
                                 if (pr->body[s]->can_eol)
-                                       pr->body[s+1]->starts_line = 1;
+                                       pr->body[s-1]->line_like = 1;
                 }
         }
  
@@ -1175,6 +1178,7 @@ should happen, so we don't need to search a second time.
                 struct symset go_to;
                 char completed;
                 char starts_line;
+               int min_prefix;
         };
  
  ###### grammar fields
@@ -1281,12 +1285,15 @@ though.
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
  
+               if (is->min_prefix == 0 ||
+                   (bs > 0 && bs < is->min_prefix))
+                       is->min_prefix = bs;
                 if (bs == pr->body_size)
                         continue;
                 s = pr->body[bs];
                 if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
-                       if (s->starts_line)
+                       if (s->line_like)
                                 is->starts_line = 1;
                 }
                 if (s->type != Nonterminal)
@@ -1464,7 +1471,7 @@ changeover point in `first_nonterm`.
  
                 set_nullable(g);
                 set_can_eol(g);
-               set_starts_line(g);
+               set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
  
@@ -1517,7 +1524,7 @@ line (`<`), or if it is nullable (`.`).
                         printf(" %c%c%c%3d%c: ",
                                s->nullable ? '.':' ',
                                s->can_eol ? '>':' ',
-                              s->starts_line ? '<':' ',
+                              s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
                         if (s->precedence)
@@ -1626,7 +1633,8 @@ Now we can report all the item sets complete with items, LA sets, and GO TO.
                 for (s = 0; s < g->states; s++) {
                         int j;
                         struct itemset *is = g->statetab[s];
-                       printf("  Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
+                       printf("  Itemset %d:%s min prefix=%d\n",
+                              s, is->starts_line?" (startsline)":"", is->min_prefix);
                         for (j = 0; j < is->items.cnt; j++) {
                                 report_item(g, is->items.syms[j]);
                                 if (is->items.data != NO_DATA)
@@ -1890,6 +1898,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short reduce_sym;
                 short shift_sym;
                 short starts_line;
+               short min_prefix;
         };
  
  
@@ -1946,15 +1955,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                         }
  
                         if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
                                         i, is->go_to.cnt, i, prod,
                                         g->productions[prod]->body_size,
                                         g->productions[prod]->head->num,
-                                       is->starts_line);
+                                       is->starts_line, is->min_prefix);
                         else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
                                         i, is->go_to.cnt, i, shift_sym,
-                                       is->starts_line);
+                                       is->starts_line, is->min_prefix);
                 }
                 fprintf(f, "};\n\n");
         }
@@ -2514,16 +2523,16 @@ So we walk down:
                                            * sizeof(p->asn_stack[0]));
                 }
                 next->state = newstate;
-               next->newline_permitted = 0;
-               if (p->tos)
-                       next->newline_permitted =
-                               (p->stack[p->tos-1].newline_permitted?:-1)+1;
-               if (next->indents > next->starts_indented)
-                       next->newline_permitted = 0;
-               if (next->indents && next->newline_permitted > 2)
-                       next->newline_permitted = 0;
                 if (states[newstate].starts_line)
                         next->newline_permitted = 1;
+               else if (next->indents)
+                       next->newline_permitted = 0;
+               else if (p->tos)
+                       next->newline_permitted =
+                               p->stack[p->tos-1].newline_permitted;
+               else
+                       next->newline_permitted = 0;
+
                 if (next->since_newline) {
                         if (p->tos)
                                 next->since_newline = p->stack[p->tos-1].since_newline + 1;
@@ -2664,7 +2673,9 @@ since the last state which could have been at the start of a line.
                                                 if (states[tos->state].starts_line)
                                                         tos->newline_permitted = 1;
                                                 else if (p.tos > 1)
-                                                       tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1;
+                                                       tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
+                                               else
+                                                       tos->newline_permitted = 0;
                                         }
                                         free(tk);
                                         tk = NULL;
@@ -2675,7 +2686,7 @@ since the last state which could have been at the start of a line.
                                 // will fail).
                         }
                         if (next.sym == TK_newline) {
-                               if (! tos->newline_permitted) {
+                               if (!tos->newline_permitted) {
                                         free(tk);
                                         tk = NULL;
                                         parser_trace_action(trace, "Discard");