parsergen: add action tables when needed.

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 076f3823941a20c4abb05237da892bc726d70266..e6dcccbf34bc46f4bcca75dca9cb0d7f325d4d55 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -1191,10 +1191,27 @@ And now we can build the list of itemsets.  The lookup routine returns
  both a success flag and a pointer to where in the list an insert should
  happen, so we don't need to search a second time.
  
  both a success flag and a pointer to where in the list an insert should
  happen, so we don't need to search a second time.
  
+Each state will often have at most one reducible production, but
+occasionally will have more.  If there are more we will need a full
+action table (merged in the go to table).  If there is one, we record it
+in the state.  If none, we record that too.  These special values will
+be needed both in the generator and in the parser, so they need to be
+declared twice.
+
+###### exported types
+       enum {
+               NONE_REDUCIBLE = -1,
+               MANY_REDUCIBLE = -2,
+       };
  ###### declarations
  ###### declarations
+       enum {
+               NONE_REDUCIBLE = -1,
+               MANY_REDUCIBLE = -2,
+       };
         struct itemset {
                 struct itemset *next;
                 short state;
         struct itemset {
                 struct itemset *next;
                 short state;
+               short reducible_prod; /* or NONE_REDUCIBLE or MANY_REDUCIBLE */
                 struct symset items;
                 struct symset go_to;
                 enum assoc assoc;
                 struct symset items;
                 struct symset go_to;
                 enum assoc assoc;
@@ -1318,8 +1335,15 @@ so the item is ineffective.
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
  
                 struct symset LA = INIT_SYMSET;
                 unsigned short sn = 0;
  
-               if (bs == pr->body_size)
+               if (i == 0)
+                       is->reducible_prod = NONE_REDUCIBLE;
+               if (bs == pr->body_size) {
+                       if (is->reducible_prod == NONE_REDUCIBLE)
+                               is->reducible_prod = p;
+                       else
+                               is->reducible_prod = MANY_REDUCIBLE;
                         continue;
                         continue;
+               }
                 s = pr->body[bs];
                 if (s->precedence && is->precedence &&
                     is->precedence > s->precedence)
                 s = pr->body[bs];
                 if (s->precedence && is->precedence &&
                     is->precedence > s->precedence)
@@ -1888,6 +1912,7 @@ grammar file, so they are generated first.
         {
                 gen_known(f, g);
                 gen_non_term(f, g);
         {
                 gen_known(f, g);
                 gen_non_term(f, g);
+               gen_action(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
                 gen_reductions(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
                 gen_reductions(f, g);
@@ -1943,6 +1968,53 @@ The table of nonterminals used for tracing is a similar array.
                 fprintf(f, "};\n\n");
         }
  
                 fprintf(f, "};\n\n");
         }
  
+### The Action table
+
+For this parser we do not have a separate action table.  The action
+table normally maps terminals to one of the actions SHIFT, REDUCE(p), or
+ACCEPT.  We can find which terminals to SHIFT via a lookup in the go to
+table, and most states have at most one reducible production and in that
+case to effect that reduction for any non-shiftable terminal in the
+look-ahead.
+
+However, with SLR, LALR, and canonical-LR grammars, there may
+occasionally be multiple possible reductions which we choose between
+based on the look-ahead.  To handle this possibility we allow reductions
+to be stored in the go to table.  An extra flag indicates if an entry is
+a state to go to, or a production to reduce.  We add the entries with
+that flag here, if necessary.
+
+###### functions
+       #define IS_REDUCTION (0x8000)
+
+       static void gen_action(FILE *f, struct grammar *g)
+       {
+               int i, j, k;
+               for (i = 0; i < g->states; i++) {
+                       struct itemset *is = g->statetab[i];
+                       struct symset *gt = &is->go_to;
+
+                       if (is->reducible_prod != MANY_REDUCIBLE)
+                               continue;
+                       for (j = 0; j < is->items.cnt; j++) {
+                               int itm = is->items.syms[j];
+                               int p = item_prod(itm);
+                               int bp = item_dot(itm);
+                               struct production *pr = g->productions[p];
+                               struct symset la;
+                               if (bp != pr->body_size)
+                                       continue;
+                               /* This is a reducible item */
+                               if (g->follow)
+                                       la = g->follow[pr->head->num];
+                               else
+                                       la = set_find(g, is->items.data[j]);
+                               for (k = 0 ; k < la.cnt; k++)
+                                       symset_add(gt, la.syms[k], p | IS_REDUCTION);
+                       }
+               }
+       }
+
  ### States, reductions, and the go to tables.
  
  For each state we record the go to table and the reducible production if
  ### States, reductions, and the go to tables.
  
  For each state we record the go to table and the reducible production if
@@ -1961,7 +2033,8 @@ The go to table is stored in a simple array of `sym` and corresponding
  
         struct lookup {
                 short sym;
  
         struct lookup {
                 short sym;
-               short state;
+               short state_or_reduction:15;
+               unsigned short is_reduction:1;
         };
         struct reduction {
                 short size;
         };
         struct reduction {
                 short size;
@@ -1969,9 +2042,9 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short result_size;
         };
         struct state {
                 short result_size;
         };
         struct state {
+               short reduce_prod;      // or NONE_REDUCIBLE or MANY_REDUCIBLE
                 short go_to_cnt;
                 short go_to_cnt;
-               const struct lookup * go_to;
-               short reduce_prod;
+               const struct lookup * go_to; // includes reductions
         };
  
  ###### functions
         };
  
  ###### functions
@@ -1988,9 +2061,14 @@ The go to table is stored in a simple array of `sym` and corresponding
                                 continue;
                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
                                 i);
                                 continue;
                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
                                 i);
-                       for (j = 0; j < gt.cnt; j++)
-                               fprintf(f, "\t{ %d, %d },\n",
-                                       gt.syms[j], gt.data[j]);
+                       for (j = 0; j < gt.cnt; j++) {
+                               if (gt.data[j] & IS_REDUCTION)
+                                       fprintf(f, "\t{ %d, %d, 1 },\n",
+                                               gt.syms[j], gt.data[j]&~IS_REDUCTION);
+                               else
+                                       fprintf(f, "\t{ %d, %d, 0 },\n",
+                                               gt.syms[j], gt.data[j]);
+                       }
                         fprintf(f, "};\n");
                 }
         }
                         fprintf(f, "};\n");
                 }
         }
@@ -2022,27 +2100,13 @@ The go to table is stored in a simple array of `sym` and corresponding
                 fprintf(f, "static const struct state states[] = {\n");
                 for (i = 0; i < g->states; i++) {
                         struct itemset *is = g->statetab[i];
                 fprintf(f, "static const struct state states[] = {\n");
                 for (i = 0; i < g->states; i++) {
                         struct itemset *is = g->statetab[i];
-                       int j, prod = -1, prod_len;
-
-                       for (j = 0; j < is->items.cnt; j++) {
-                               int itm = is->items.syms[j];
-                               int p = item_prod(itm);
-                               int bp = item_dot(itm);
-                               struct production *pr = g->productions[p];
+                       int prod = is->reducible_prod;
  
  
-                               if (bp < pr->body_size)
-                                       continue;
-                               /* This is what we reduce - choose longest */
-                               if (prod < 0 || prod_len < pr->body_size) {
-                                       prod = p;
-                                       prod_len = pr->body_size;
-                               }
-                       }
                         if (is->go_to.cnt)
                         if (is->go_to.cnt)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, %d },\n",
-                                       i, is->go_to.cnt, i, prod);
+                               fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
+                                       i, prod, is->go_to.cnt, i);
                         else
                         else
-                               fprintf(f, "\t[%d] = { 0, NULL, %d },\n", i, prod);
+                               fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
                 }
                 fprintf(f, "};\n\n");
         }
                 }
                 fprintf(f, "};\n\n");
         }
@@ -2584,7 +2648,7 @@ table.
  
  ###### parser functions
  
  
  ###### parser functions
  
-       static int search(const struct state *l, int sym)
+       static int search(const struct state *l, int sym, int reduction)
         {
                 int lo = 0;
                 int hi = l->go_to_cnt;
         {
                 int lo = 0;
                 int hi = l->go_to_cnt;
@@ -2598,10 +2662,11 @@ table.
                         else
                                 hi = mid;
                 }
                         else
                                 hi = mid;
                 }
-               if (l->go_to[lo].sym == sym)
-                       return l->go_to[lo].state;
-               else
+               if (l->go_to[lo].sym != sym)
                         return -1;
                         return -1;
+               if (!reduction == !l->go_to[lo].is_reduction)
+                       return l->go_to[lo].state_or_reduction;
+               return -1;
         }
  
  ### Memory allocation
         }
  
  ### Memory allocation
@@ -2688,25 +2753,6 @@ stack is empty, it always chooses zero as the next state.
  So `shift` finds the next state.  If that succeeds it extends the
  allocations if needed and pushes all the information onto the stacks.
  
  So `shift` finds the next state.  If that succeeds it extends the
  allocations if needed and pushes all the information onto the stacks.
  
-An extra complication is added to `shift` by the `EOL` token.  This
-token must be generated when a `NEWLINE` is seen, but an `EOL` is
-expected.  When this happens, the `NEWLINE` is NOT consumed, so multiple
-EOL can appear before a NEWLINE.  To indicate that the token was shifted
-by not consumed, `shift` can return the special value `2`.  The token
-number for `EOL` cannot be statically declared, so when the parser
-starts we need to look through the array of non-terminals to find the
-EOL.
-
-###### parser state
-       int tk_eol;
-
-###### find eol
-       p.tk_eol = 0;
-       while (strcmp(non_term[p.tk_eol], "EOL") != 0)
-               p.tk_eol += 1;
-       p.tk_eol += TK_reserved + config->known_count;
-
-
  ###### parser functions
  
         static int shift(struct parser *p,
  ###### parser functions
  
         static int shift(struct parser *p,
@@ -2714,27 +2760,12 @@ EOL.
                          const struct state states[])
         {
                 struct frame next = {0};
                          const struct state states[])
         {
                 struct frame next = {0};
-               int ret;
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
-                                sym)
+                                sym, 0)
                         : 0;
                         : 0;
-               if (newstate >= 0)
-                       ret = 1;
-               else if (sym != TK_newline)
+               if (newstate < 0)
                         return 0;
                         return 0;
-               else {
-                       // have a NEWLINE, might need an EOL
-                       sym = p->tk_eol;
-                       newstate = p->tos
-                               ? search(&states[p->stack[p->tos-1].state],
-                                        sym)
-                               : 0;
-                       if (newstate < 0)
-                               return 0;
-                       ret = 2;
-                       asn = tok_copy(*(struct token*)asn);
-               }
  
                 if (p->tos >= p->stack_size) {
                         p->stack_size += 10;
  
                 if (p->tos >= p->stack_size) {
                         p->stack_size += 10;
@@ -2749,7 +2780,7 @@ EOL.
                 p->stack[p->tos] = next;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
                 p->stack[p->tos] = next;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
-               return ret;
+               return 1;
         }
  
  `pop` primarily moves the top of stack (`tos`) back down the required
         }
  
  `pop` primarily moves the top of stack (`tos`) back down the required
@@ -2807,9 +2838,27 @@ of indentation.
  ###### parser state
         unsigned long ignored_indents;
  
  ###### parser state
         unsigned long ignored_indents;
  
-NEWLINE/EOL is ignored when in an indented section of text which was not
+NEWLINE is ignored when in an indented section of text which was not
  explicitly expected by the grammar.  So if the most recent indent is
  explicitly expected by the grammar.  So if the most recent indent is
-ignored, so is any EOL token.
+ignored, so is any NEWLINE token.
+
+If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL
+token instead.  If that succeeds, we make a new copy of the NEWLINE
+token and continue.  This allows a NEWLINE to appear to be preceded by
+an indefinite number of EOL tokens.
+
+The token number for `EOL` cannot be statically declared, so when the
+parser starts we need to look through the array of non-terminals to find
+the EOL.
+
+###### parser state
+       int tk_eol;
+
+###### find eol
+       p.tk_eol = 0;
+       while (strcmp(non_term[p.tk_eol], "EOL") != 0)
+               p.tk_eol += 1;
+       p.tk_eol += TK_reserved + config->known_count;
  
  For other tokens, we shift the next token if that is possible, otherwise
  we try to reduce a production.
  
  For other tokens, we shift the next token if that is possible, otherwise
  we try to reduce a production.
@@ -2827,29 +2876,42 @@ we try to reduce a production.
                 continue;
         }
  
                 continue;
         }
  
-       switch (shift(&p, tk->num, tk, states)) {
-       case 1:
+       if (shift(&p, tk->num, tk, states)) {
                 if (tk->num == TK_out)
                         p.ignored_indents >>= 1;
                 if (tk->num == TK_in)
                         p.ignored_indents <<= 1;
  
                 if (tk->num == TK_out)
                         p.ignored_indents >>= 1;
                 if (tk->num == TK_in)
                         p.ignored_indents <<= 1;
  
+               parser_trace_action(trace, "Shift");
                 tk = NULL;
                 tk = NULL;
-               /* fallthrough */
-       case 2:
-               parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
                 ## did shift
                 continue;
                 ## did shift
                 continue;
+       } else if (tk->num == TK_newline &&
+                  shift(&p, p.tk_eol, tk, states)) {
+               tk = tok_copy(*tk);
+               parser_trace_action(trace, "ShiftEOL");
+               continue;
         }
  
         }
  
-       if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
-               /* No indent expected here and reduce is not mandatory, so ignore IN */
-               free(tk);
-               tk = NULL;
-               p.ignored_indents <<= 1;
-               p.ignored_indents |= 1;
-               parser_trace_action(trace, "Ignore");
-               continue;
+       if (tk->num == TK_in) {
+               int should_reduce;
+               if (states[tos->state].reduce_prod == MANY_REDUCIBLE)
+                       /* Only reduce if TK_in in lookahead */
+                       should_reduce = (search(&states[tos->state], TK_in, 1) >= 0);
+               else
+                       /* Only reduce if we cannot shift anything */
+                       should_reduce = (states[tos->state].go_to_cnt == 0);
+               if (!should_reduce) {
+                       /* No indent expected here and reduce is not indicated,
+                        * so ignore IN
+                        */
+                       free(tk);
+                       tk = NULL;
+                       p.ignored_indents <<= 1;
+                       p.ignored_indents |= 1;
+                       parser_trace_action(trace, "Ignore");
+                       continue;
+               }
         }
  
  We have already discussed the bulk of the handling of a "reduce" action,
         }
  
  We have already discussed the bulk of the handling of a "reduce" action,
@@ -2865,24 +2927,26 @@ report that that input has been accepted.
  ###### parser vars
  
         void *ret = NULL;
  ###### parser vars
  
         void *ret = NULL;
+       int reduction;
  
  ###### try reduce
  
  
  ###### try reduce
  
-       if (states[tos->state].reduce_prod >= 0) {
+       reduction = states[tos->state].reduce_prod;
+       if (reduction == MANY_REDUCIBLE)
+               reduction = search(&states[tos->state], tk->num, 1);
+       if (reduction >= 0) {
                 void **body;
                 void *res;
                 void **body;
                 void *res;
-               const struct state *nextstate = &states[tos->state];
-               int prod = nextstate->reduce_prod;
-               int size = reductions[prod].size;
-               int res_size = reductions[prod].result_size;
+               int size = reductions[reduction].size;
+               int res_size = reductions[reduction].result_size;
  
                 body = p.asn_stack + (p.tos - size);
                 res = res_size ? calloc(1, res_size) : NULL;
  
                 body = p.asn_stack + (p.tos - size);
                 res = res_size ? calloc(1, res_size) : NULL;
-               res_size = do_reduce(prod, body, config, res);
-               if (res_size != reductions[prod].result_size)
+               res_size = do_reduce(reduction, body, config, res);
+               if (res_size != reductions[reduction].result_size)
                         abort();
                 pop(&p, size, do_free);
                         abort();
                 pop(&p, size, do_free);
-               if (!shift(&p, reductions[prod].sym, res, states)) {
+               if (!shift(&p, reductions[reduction].sym, res, states)) {
                         accepted = 1;
                         ret = res;
                         parser_trace_action(trace, "Accept");
                         accepted = 1;
                         ret = res;
                         parser_trace_action(trace, "Accept");
@@ -3176,3 +3240,73 @@ exits with an error.
         355//113
  
         error
         355//113
  
         error
+
+## A second example
+
+I had originally thought that the LR05 style grammar would suit me there
+should never be a sequence of token that means either of two different
+things depending on what comes follows it.  Without that ambiguity, the
+extra strength of LALR, or even SLR, would not be needed.  However I
+discovered this was too simplistic.  There are times when it is quite
+reasonable for the empty string, or maybe a NEWLINE, to have different
+meanings depending on the next token.  For this, LR05 is not sufficient,
+so I added full action-table support.
+
+This example tests the selection of REDUCE actions based on the
+look-ahead symbol, using a trivial grammar where the empty string can
+reasonably mean different things depending on what it precedes.
+
+
+####### File: parsergen.mk
+       demos :: do_lalr_demo
+       lalr_demo.c lalr_demo.h : parsergen parsergen.mdc
+               ./parsergen --LALR --tag demo -o lalr_demo parsergen.mdc
+       lalr_demo: lalr_demo.o libparser.o libscanner.o libmdcode.o
+               $(CC) -o lalr_demo lalr_demo.c libparser.o libscanner.o libmdcode.o -licuuc
+       do_lalr_demo: lalr_demo
+               NOTRACE=1 ./lalr_demo
+               @echo 'Should produce "start of line, empty sign, empty sigl"'
+
+####### demo: grammar
+
+       $TERM $ @ + -
+       line -> ${ printf("start of line"); }$
+               | line term
+       term -> sigl IDENTIFIER
+               | sign NUMBER
+       sigl ->   $
+               | @
+               | ${ printf(", empty sigl"); }$
+       sign ->   +
+               | -
+               | ${ printf(", empty sign"); }$
+
+####### demo: header
+
+       //
+
+####### demo: code
+
+       #include <stdlib.h>
+       #include <stdio.h>
+       #include "mdcode.h"
+       #include "scanner.h"
+       #include "parser.h"
+       #include "lalr_demo.h"
+       int main(int argc, char *argv[])
+       {
+               char test[] = "````\n$a -1 5 b\n";
+               struct section *table = code_extract(test, test+sizeof(test)-1, NULL);
+               struct token_config config = {
+                       .ignored = (1 << TK_line_comment)
+                                | (1 << TK_newline)
+                                | (1 << TK_in)
+                                | (1 << TK_out),
+                       .number_chars = "",
+                       .word_start = "",
+                       .word_cont = "",
+               };
+               parse_lalr_demo(table->code, &config, getenv("NOTRACE") ? NULL : stderr);
+               printf("\n");
+               exit(0);
+       }