parsergen: avoid infinite loop on error.

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 3f36df9a6873c3c94f2f127d8a0ca6419e50603c..cdb5f82204cb670d87d570e96177d52eb1aa5e41 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -400,6 +400,21 @@ The `<` may be included for symbols declared as storing a reference
  (not a structure) and means that the reference is being moved out, so
  it will not automatically be freed.
  
+Symbols that are left-recursive are a little special.  These are symbols
+that both the head of a production and the first body symbol of the same
+production.  They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure.  In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended.  When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended.  So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
+
  While building productions we will need to add to an array which needs to
  grow dynamically.
  
@@ -460,6 +475,7 @@ Now we have all the bits we need to parse a full production.
  
  ###### symbol fields
         int first_production;
+       int left_recursive;
  
  ###### functions
         static char *parse_production(struct grammar *g,
@@ -519,6 +535,11 @@ Now we have all the bits we need to parse a full production.
                         }
                         tk = token_next(state);
                 }
+               if (p.body_size >= 2 &&
+                   p.body[0] == p.head &&
+                   p.body[1]->num != TK_newline)
+                       p.head->left_recursive = 1;
+
                 if (tk.num != TK_newline && tk.num != TK_eof) {
                         err = "stray tokens at end of line";
                         goto abort;
@@ -1524,6 +1545,11 @@ changeover point in `first_nonterm`.
                                 snum++;
                         }
                 g->first_nonterm = snum;
+               for (s = g->syms; s; s = s->next)
+                       if (s->num < 0 && s->type != Virtual) {
+                               s->num = snum;
+                               snum++;
+                       }
                 for (s = g->syms; s; s = s->next)
                         if (s->num < 0) {
                                 s->num = snum;
@@ -1562,7 +1588,7 @@ all the tables that have been generated, plus a description of any conflicts.
                 if (g->follow)
                         report_follow(g);
                 report_itemsets(g);
-               return report_conflicts(g, type);
+               return report_conflicts(g, type) + report_left_recursive(g);
         }
  
  Firstly we have the complete list of symbols, together with the
@@ -1883,6 +1909,43 @@ but handled internally.
                 return cnt;
         }
  
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production.  So we need to
+report these when asked.
+
+###### functions
+
+       static int report_left_recursive(struct grammar *g)
+       {
+               int p;
+               int bad_left_recursive = 0;
+
+               for (p = 0; p < g->production_count; p++) {
+                       struct production *pr = g->productions[p];
+                       int sn;
+
+                       for (sn = 0; sn < pr->body_size - 1; sn++) {
+                               struct symbol *s = pr->body[sn];
+
+                               if (s->left_recursive == 1 &&
+                                   s != pr->head) {
+                                       if (!bad_left_recursive) {
+                                               bad_left_recursive = 1;
+                                               printf("Misplaced left recursive symbols.\n");
+                                       }
+                                       printf("  ");
+                                       prtxt(s->name);
+                                       printf(" in production [%d]\n", p);
+                                       s->left_recursive = 2;
+                               }
+                       }
+               }
+               return bad_left_recursive;
+       }
+
  ## Generating the parser
  
  The exported part of the parser is the `parse_XX` function, where the name
@@ -1953,7 +2016,7 @@ numbers right.
                 for (i = TK_reserved;
                      i < g->num_syms;
                      i++)
-                       if (g->symtab[i]->type != Terminal)
+                       if (g->symtab[i]->type == Nonterminal)
                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                         g->symtab[i]->name.txt);
                 fprintf(f, "};\n\n");
@@ -2716,8 +2779,12 @@ accepted the input and can finish.
  We return whatever `asn` was returned by reducing production zero.
  
  If we can neither shift nor reduce we have an error to handle.  We pop
-single entries off the stack until we can shift the `TK_error` symbol, then
-drop input tokens until we find one we can shift into the new error state.
+single entries off the stack until we can shift the `TK_error` symbol,
+then drop input tokens until we find one we can shift into the new error
+state.  We need to ensure that something is dropped or shifted after an
+error, or we could get into an infinite loop, only shifting in
+`TK_error`, then reducing.  So we track if there has been a shift since
+the last error, and if not the next error always discards one token.
  
  When we find `TK_in` and `TK_out` tokens which report indents we need
  to handle them directly as the grammar cannot express what we want to
@@ -2761,11 +2828,11 @@ in the thing that preceed:
  Here the NEWLINE will be shifted because nothing can be reduced until
  the `if` is seen.
  
-When, during error handling, we discard token read in, we want to keep
+When during error handling we discard tokens read in, we want to keep
  discarding until we see one that is recognised.  If we had a full set
-of LR(1) grammar states, this will mean looking in the look-ahead set,
+of LR(1) grammar states, this would mean looking in the look-ahead set,
  but we don't keep a full look-ahead set.  We only record the subset
-that leads to SHIFT.  We can, however, deduce the look-ahead set but
+that leads to SHIFT.  We can, however, deduce the look-ahead set by
  looking at the SHIFT subsets for all states that we can get to by
  reducing zero or more times.  So we need a little function which
  checks if a given token is in any of these look-ahead sets.
@@ -2797,6 +2864,7 @@ checks if a given token is in any of these look-ahead sets.
                 struct parser p = { 0 };
                 struct token *tk = NULL;
                 int accepted = 0;
+               int shift_since_err = 1;
                 void *ret = NULL;
  
                 shift(&p, TK_eof, 0, 1, NULL, states);
@@ -2868,6 +2936,7 @@ checks if a given token is in any of these look-ahead sets.
                                         goto force_reduce;
                         }
                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
+                               shift_since_err = 1;
                                 tk = NULL;
                                 parser_trace_action(trace, "Shift");
                                 continue;
@@ -2932,11 +3001,22 @@ checks if a given token is in any of these look-ahead sets.
                                 // no state accepted TK_error
                                 break;
                         }
+                       if (!shift_since_err) {
+                               /* must discard at least one token to avoid
+                                * infinite loop.
+                                */
+                               if (tk->num == TK_eof)
+                                       break;
+                               free(tk);
+                               tk = tok_copy(token_next(tokens));
+                       }
+                       shift_since_err = 0;
                         tos = &p.stack[p.tos-1];
                         while (!in_lookahead(tk, states, tos->state) &&
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
+                               shift_since_err = 1;
                                 if (tk->num == TK_in)
                                         indents += 1;
                                 if (tk->num == TK_out) {