parsergen: don't report expected shift/reduce conflicts.

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 0d7832491444975e85ef8b28c50df7e29246f5ab..3c816fa009a23b080597800272f1600123470702 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -1174,12 +1174,29 @@ can just compare the symset and the data values together.
                         a.data[i] - b.data[i];
         }
  
+It will be helpful to know if an itemset has been "completed" or not,
+particularly for LALR where itemsets get merged, at which point they
+need to be considered for completion again.  So a `completed` flag is needed.
+
+For correct handling of `TK_newline` when parsing, we will need to
+know which states (itemsets) can occur at the start of a line, so we
+will record a `starts_line` flag too.
+
+Finally, for handling `TK_out` we need to know whether a production in the
+current state started *before* the most recent indent.  A state
+doesn't usually keep details of individual productions, so we need to
+add one extra detail. `min_prefix` is the smallest non-zero number of
+symbols *before* DOT in any production in an itemset.  This will allow
+us to determine if the most recent indent is sufficiently recent
+to cancel it against a `TK_out`.  If it was seen longer ago than the
+`min_prefix`, and if the current state cannot be reduced, then the
+indented section must have ended in the middle of a syntactic unit, so
+an error must be signaled.
+
  And now we can build the list of itemsets.  The lookup routine returns
  both a success flag and a pointer to where in the list an insert
  should happen, so we don't need to search a second time.
  
-FIXME: document min_prefix
-
  ###### declarations
         struct itemset {
                 struct itemset *next;
@@ -1779,6 +1796,15 @@ terminals to items where that terminal could be shifted and another
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
+As a special case, if we find a SHIFT/REDUCE conflict, where a
+terminal that could be shifted is in the lookahead set of some
+reducible item, then check whether the reducible item also has
+`TK_newline` in its lookahead set.  If it does, then a newline will
+force a reduction, but anything else can reasonably be shifted, so
+that isn't really a conflict.  Such apparent conflicts do not get
+reported.  This will not affect a "traditional" grammar that does not
+include newlines as tokens.
+
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
                 int i;
@@ -1806,7 +1832,7 @@ is in look-ahead.  We report when we get conflicts between the two.
                                                 symset_add(&shifts, sym, itm);
                                 }
                         }
-                       /* Now look for reduction and conflicts */
+                       /* Now look for reductions and conflicts */
                         for (j = 0; j < is->items.cnt; j++) {
                                 unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
@@ -1824,7 +1850,7 @@ is in look-ahead.  We report when we get conflicts between the two.
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
-                                       if (pos >= 0) {
+                                       if (pos >= 0 && symset_find(&la, TK_newline) < 0) {
                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
                                                 prtxt(g->symtab[la.syms[k]]->name);
                                                 printf(":\n");
@@ -1950,7 +1976,6 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short reduce_prod;
                 short reduce_size;
                 short reduce_sym;
-               short shift_sym;
                 short starts_line;
                 short min_prefix;
         };
@@ -1984,23 +2009,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                 for (i = 0; i < g->states; i++) {
                         struct itemset *is = g->statetab[i];
                         int j, prod = -1, prod_len;
-                       int shift_sym = -1;
-                       int shift_len = 0, shift_remain = 0;
+
                         for (j = 0; j < is->items.cnt; j++) {
                                 int itm = is->items.syms[j];
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
  
-                               if (bp < pr->body_size) {
-                                       if (shift_sym < 0 ||
-                                           (shift_len == bp && shift_remain > pr->body_size - bp)) {
-                                               shift_sym = pr->body[bp]->num;
-                                               shift_len = bp;
-                                               shift_remain = pr->body_size - bp;
-                                       }
+                               if (bp < pr->body_size)
                                         continue;
-                               }
                                 /* This is what we reduce */
                                 if (prod < 0 || prod_len < pr->body_size) {
                                         prod = p;
@@ -2009,14 +2026,14 @@ The go to table is stored in a simple array of `sym` and corresponding
                         }
  
                         if (prod >= 0)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
+                               fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n",
                                         i, is->go_to.cnt, i, prod,
                                         g->productions[prod]->body_size,
                                         g->productions[prod]->head->num,
                                         is->starts_line, is->min_prefix);
                         else
-                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
-                                       i, is->go_to.cnt, i, shift_sym,
+                               fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+                                       i, is->go_to.cnt, i,
                                         is->starts_line, is->min_prefix);
                 }
                 fprintf(f, "};\n\n");
@@ -2695,8 +2712,8 @@ within the stack.  If we can reduce some symbols that are all since
  the most recent indent, then we do that first.  If the minimum prefix
  of the current state then extends back before the most recent indent,
  that indent can be cancelled.  If the minimum prefix is shorter then
-the indent is premature and we must start error handling, which
-currently doesn't work at all.
+the indent had ended prematurely and we must start error handling, which
+is still a work-in-progress.
  
  `TK_newline` tokens are ignored unless the top stack frame records
  that they are permitted.  In that case they will not be considered for
@@ -2705,9 +2722,32 @@ the most recent start of line.  This is how a newline forcible
  terminates any line-like structure - we try to reduce down to at most
  one symbol for each line where newlines are allowed.
  
+When, during error handling, we discard tokens read in, we want to keep
+discarding until we see one that is recognised.  If we had a full set
+of LR(1) grammar states, this would mean looking in the look-ahead set,
+but we don't keep a full look-ahead set.  We only record the subset
+that leads to SHIFT.  We can, however, deduce the look-ahead set by
+looking at the SHIFT subsets for all states that we can get to by
+reducing zero or more times.  So we need a little function which
+checks if a given token is in any of these look-ahead sets.
+
  ###### parser includes
         #include "parser.h"
+
  ###### parser_run
+
+       static int in_lookahead(struct token *tk, const struct state *states, int state) // 1 if search() finds tk->num in `state` or in a state reached by following reduce_sym; else 0
+       {
+               while (state >= 0) { // a negative value from search() below ends the walk
+                       if (search(&states[state], tk->num) >= 0) // tk->num has an entry in this state's table
+                               return 1;
+                       if (states[state].reduce_prod < 0) // the generated table stores -1 when the state has no production to reduce
+                               return 0;
+                       state = search(&states[state], states[state].reduce_sym); // NOTE(review): assumes search() yields the state entered on reduce_sym, looked up from this same state - confirm
+               }
+               return 0;
+       }
+
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
@@ -2773,8 +2813,8 @@ one symbol for each line where newlines are allowed.
                                         parser_trace_action(trace, "Cancel");
                                         continue;
                                 }
-                               // fall through and force a REDUCE (as 'shift'
-                               // will fail).
+                               // fall through to error handling as both SHIFT and REDUCE
+                               // will fail.
                         }
                         if (tk->num == TK_newline) {
                                 if (!tos->newline_permitted) {
@@ -2822,16 +2862,6 @@ one symbol for each line where newlines are allowed.
                                 parser_trace_action(trace, "Reduce");
                                 continue;
                         }
-                       if (tk->num == TK_out) {
-                               // Indent problem - synthesise tokens to get us
-                               // out of here.
-                               fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
-                               shift(&p, states[tos->state].shift_sym,
-                                     0, 1, tok_copy(*tk), states);
-                               // FIXME need to report this error somehow
-                               parser_trace_action(trace, "Synthesize");
-                               continue;
-                       }
                         /* Error. We walk up the stack until we
                          * find a state which will accept TK_error.
                          * We then shift in TK_error and see what state
@@ -2843,9 +2873,9 @@ one symbol for each line where newlines are allowed.
                         short indents = 0, start_of_line;
  
                         err_tk = tok_copy(*tk);
-                       while (shift(&p, TK_error, 0, 0,
-                                    err_tk, states) == 0
-                              && p.tos > 0)
+                       while (p.tos > 0 &&
+                              shift(&p, TK_error, 0, 0,
+                                    err_tk, states) == 0)
                                 // discard this state
                                 indents += pop(&p, 1, &start_of_line, do_free);
                         if (p.tos == 0) {
@@ -2854,7 +2884,7 @@ one symbol for each line where newlines are allowed.
                                 break;
                         }
                         tos = &p.stack[p.tos-1];
-                       while (search(&states[tos->state], tk->num) < 0 &&
+                       while (!in_lookahead(tk, states, tos->state) &&
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
@@ -2867,11 +2897,7 @@ one symbol for each line where newlines are allowed.
                                         // FIXME update since_indent here
                                 }
                         }
-                       if (p.tos == 0 && tk->num == TK_eof)
-                               break;
-                       tos = &p.stack[p.tos-1];
                         tos->indents += indents;
-                       exit(1);
                 }
                 free(tk);
                 pop(&p, p.tos, NULL, do_free);