]> ocean-lang.org Git - ocean/commitdiff
parsergen: add action tables when needed.
authorNeilBrown <neil@brown.name>
Sun, 14 Nov 2021 05:07:15 +0000 (16:07 +1100)
committerNeilBrown <neil@brown.name>
Sun, 14 Nov 2021 05:07:15 +0000 (16:07 +1100)
In most cases there is at most one reducible production per state, and
that is all we previously handled.
However occasionally it can be useful to have more than one, triggered
by different look-ahead symbols.

With this patch, we add entries to the go_to table in that case.  The
go_to table now has a flag to indicate if the symbol maps to a state (in
which case it can be SHIFTed if a terminal), or to a production (in
which case it triggers a reduction).

If the in-state production has the special value MANY_REDUCIBLE, when
the parser performs a lookup to see which production, if any, should be
reduced.

Signed-off-by: NeilBrown <neil@brown.name>
csrc/parsergen.mdc

index d9467442eeab1dfec4720f7a2b4f5c6666d43867..e6dcccbf34bc46f4bcca75dca9cb0d7f325d4d55 100644 (file)
@@ -1191,10 +1191,27 @@ And now we can build the list of itemsets.  The lookup routine returns
 both a success flag and a pointer to where in the list an insert should
 happen, so we don't need to search a second time.
 
+Each state will often have at most one reducible production, but
+occasionally will have more.  If there are more we will need a full
+action table (merged in the go to table).  If there is one, we record it
+in the state.  If none, we record that too.  These special values will
+be needed both in the generator and in the parser, so they need to be
+declared twice.
+
+###### exported types
+       enum {
+               NONE_REDUCIBLE = -1,
+               MANY_REDUCIBLE = -2,
+       };
 ###### declarations
+       enum {
+               NONE_REDUCIBLE = -1,
+               MANY_REDUCIBLE = -2,
+       };
        struct itemset {
                struct itemset *next;
                short state;
+               short reducible_prod; /* or NONE_REDUCIBLE or MANY_REDUCIBLE */
                struct symset items;
                struct symset go_to;
                enum assoc assoc;
@@ -1318,8 +1335,15 @@ so the item is ineffective.
                struct symset LA = INIT_SYMSET;
                unsigned short sn = 0;
 
-               if (bs == pr->body_size)
+               if (i == 0)
+                       is->reducible_prod = NONE_REDUCIBLE;
+               if (bs == pr->body_size) {
+                       if (is->reducible_prod == NONE_REDUCIBLE)
+                               is->reducible_prod = p;
+                       else
+                               is->reducible_prod = MANY_REDUCIBLE;
                        continue;
+               }
                s = pr->body[bs];
                if (s->precedence && is->precedence &&
                    is->precedence > s->precedence)
@@ -1888,6 +1912,7 @@ grammar file, so they are generated first.
        {
                gen_known(f, g);
                gen_non_term(f, g);
+               gen_action(f, g);
                gen_goto(f, g);
                gen_states(f, g);
                gen_reductions(f, g);
@@ -1943,6 +1968,53 @@ The table of nonterminals used for tracing is a similar array.
                fprintf(f, "};\n\n");
        }
 
+### The Action table
+
+For this parser we do not have a separate action table.  The action
+table normally maps terminals to one of the actions SHIFT, REDUCE(p), or
+ACCEPT.  We can find which terminals to SHIFT via a lookup in the go to
+table, and most states have at most one reducible production and in that
+case to effect that reduction for any non-shiftable terminal in the
+look-ahead.
+
+However, with SLR, LALR, and canonical-LR grammars, there may
+occasionally be multiple possible reductions which we choose between
+based on the look-ahead.  To handle this possibility we allow reductions
+to be stored in the go to table.  An extra flag indicates if an entry is
+a state to go to, or a production to reduce.  We add the entries with
+that flag here, if necessary.
+
+###### functions
+       #define IS_REDUCTION (0x8000)
+
+       static void gen_action(FILE *f, struct grammar *g)
+       {
+               int i, j, k;
+               for (i = 0; i < g->states; i++) {
+                       struct itemset *is = g->statetab[i];
+                       struct symset *gt = &is->go_to;
+
+                       if (is->reducible_prod != MANY_REDUCIBLE)
+                               continue;
+                       for (j = 0; j < is->items.cnt; j++) {
+                               int itm = is->items.syms[j];
+                               int p = item_prod(itm);
+                               int bp = item_dot(itm);
+                               struct production *pr = g->productions[p];
+                               struct symset la;
+                               if (bp != pr->body_size)
+                                       continue;
+                               /* This is a reducible item */
+                               if (g->follow)
+                                       la = g->follow[pr->head->num];
+                               else
+                                       la = set_find(g, is->items.data[j]);
+                               for (k = 0 ; k < la.cnt; k++)
+                                       symset_add(gt, la.syms[k], p | IS_REDUCTION);
+                       }
+               }
+       }
+
 ### States, reductions, and the go to tables.
 
 For each state we record the go to table and the reducible production if
@@ -1961,7 +2033,8 @@ The go to table is stored in a simple array of `sym` and corresponding
 
        struct lookup {
                short sym;
-               short state;
+               short state_or_reduction:15;
+               unsigned short is_reduction:1;
        };
        struct reduction {
                short size;
@@ -1969,9 +2042,9 @@ The go to table is stored in a simple array of `sym` and corresponding
                short result_size;
        };
        struct state {
-               short reduce_prod;
+               short reduce_prod;      // or NONE_REDUCIBLE or MANY_REDUCIBLE
                short go_to_cnt;
-               const struct lookup * go_to;
+               const struct lookup * go_to; // includes reductions
        };
 
 ###### functions
@@ -1988,9 +2061,14 @@ The go to table is stored in a simple array of `sym` and corresponding
                                continue;
                        fprintf(f, "static const struct lookup goto_%d[] = {\n",
                                i);
-                       for (j = 0; j < gt.cnt; j++)
-                               fprintf(f, "\t{ %d, %d },\n",
-                                       gt.syms[j], gt.data[j]);
+                       for (j = 0; j < gt.cnt; j++) {
+                               if (gt.data[j] & IS_REDUCTION)
+                                       fprintf(f, "\t{ %d, %d, 1 },\n",
+                                               gt.syms[j], gt.data[j]&~IS_REDUCTION);
+                               else
+                                       fprintf(f, "\t{ %d, %d, 0 },\n",
+                                               gt.syms[j], gt.data[j]);
+                       }
                        fprintf(f, "};\n");
                }
        }
@@ -2022,22 +2100,8 @@ The go to table is stored in a simple array of `sym` and corresponding
                fprintf(f, "static const struct state states[] = {\n");
                for (i = 0; i < g->states; i++) {
                        struct itemset *is = g->statetab[i];
-                       int j, prod = -1, prod_len;
-
-                       for (j = 0; j < is->items.cnt; j++) {
-                               int itm = is->items.syms[j];
-                               int p = item_prod(itm);
-                               int bp = item_dot(itm);
-                               struct production *pr = g->productions[p];
+                       int prod = is->reducible_prod;
 
-                               if (bp < pr->body_size)
-                                       continue;
-                               /* This is what we reduce - choose longest */
-                               if (prod < 0 || prod_len < pr->body_size) {
-                                       prod = p;
-                                       prod_len = pr->body_size;
-                               }
-                       }
                        if (is->go_to.cnt)
                                fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
                                        i, prod, is->go_to.cnt, i);
@@ -2584,7 +2648,7 @@ table.
 
 ###### parser functions
 
-       static int search(const struct state *l, int sym)
+       static int search(const struct state *l, int sym, int reduction)
        {
                int lo = 0;
                int hi = l->go_to_cnt;
@@ -2598,10 +2662,11 @@ table.
                        else
                                hi = mid;
                }
-               if (l->go_to[lo].sym == sym)
-                       return l->go_to[lo].state;
-               else
+               if (l->go_to[lo].sym != sym)
                        return -1;
+               if (!reduction == !l->go_to[lo].is_reduction)
+                       return l->go_to[lo].state_or_reduction;
+               return -1;
        }
 
 ### Memory allocation
@@ -2697,7 +2762,7 @@ allocations if needed and pushes all the information onto the stacks.
                struct frame next = {0};
                int newstate = p->tos
                        ? search(&states[p->stack[p->tos-1].state],
-                                sym)
+                                sym, 0)
                        : 0;
                if (newstate < 0)
                        return 0;
@@ -2828,14 +2893,25 @@ we try to reduce a production.
                continue;
        }
 
-       if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
-               /* No indent expected here and reduce is not mandatory, so ignore IN */
-               free(tk);
-               tk = NULL;
-               p.ignored_indents <<= 1;
-               p.ignored_indents |= 1;
-               parser_trace_action(trace, "Ignore");
-               continue;
+       if (tk->num == TK_in) {
+               int should_reduce;
+               if (states[tos->state].reduce_prod == MANY_REDUCIBLE)
+                       /* Only reduce if TK_in in lookahead */
+                       should_reduce = (search(&states[tos->state], TK_in, 1) >= 0);
+               else
+                       /* Only reduce if we cannot shift anything */
+                       should_reduce = (states[tos->state].go_to_cnt == 0);
+               if (!should_reduce) {
+                       /* No indent expected here and reduce is not indicated,
+                        * so ignore IN
+                        */
+                       free(tk);
+                       tk = NULL;
+                       p.ignored_indents <<= 1;
+                       p.ignored_indents |= 1;
+                       parser_trace_action(trace, "Ignore");
+                       continue;
+               }
        }
 
 We have already discussed the bulk of the handling of a "reduce" action,
@@ -2851,24 +2927,26 @@ report that that input has been accepted.
 ###### parser vars
 
        void *ret = NULL;
+       int reduction;
 
 ###### try reduce
 
-       if (states[tos->state].reduce_prod >= 0) {
+       reduction = states[tos->state].reduce_prod;
+       if (reduction == MANY_REDUCIBLE)
+               reduction = search(&states[tos->state], tk->num, 1);
+       if (reduction >= 0) {
                void **body;
                void *res;
-               const struct state *nextstate = &states[tos->state];
-               int prod = nextstate->reduce_prod;
-               int size = reductions[prod].size;
-               int res_size = reductions[prod].result_size;
+               int size = reductions[reduction].size;
+               int res_size = reductions[reduction].result_size;
 
                body = p.asn_stack + (p.tos - size);
                res = res_size ? calloc(1, res_size) : NULL;
-               res_size = do_reduce(prod, body, config, res);
-               if (res_size != reductions[prod].result_size)
+               res_size = do_reduce(reduction, body, config, res);
+               if (res_size != reductions[reduction].result_size)
                        abort();
                pop(&p, size, do_free);
-               if (!shift(&p, reductions[prod].sym, res, states)) {
+               if (!shift(&p, reductions[reduction].sym, res, states)) {
                        accepted = 1;
                        ret = res;
                        parser_trace_action(trace, "Accept");
@@ -3162,3 +3240,73 @@ exits with an error.
        355//113
 
        error
+
+## A second example
+
+I had originally thought that the LR05 style grammar would suit me there
+should never be a sequence of token that means either of two different
+things depending on what comes follows it.  Without that ambiguity, the
+extra strength of LALR, or even SLR, would not be needed.  However I
+discovered this was too simplistic.  There are times when it is quite
+reasonable for the empty string, or maybe a NEWLINE, to have different
+meanings depending on the next token.  For this, LR05 is not sufficient,
+so I added full action-table support.
+
+This example tests the selection of REDUCE actions based on the
+look-ahead symbol, using a trivial grammar where the empty string can
+reasonably mean different things depending on what it precedes.
+
+
+####### File: parsergen.mk
+       demos :: do_lalr_demo
+       lalr_demo.c lalr_demo.h : parsergen parsergen.mdc
+               ./parsergen --LALR --tag demo -o lalr_demo parsergen.mdc
+       lalr_demo: lalr_demo.o libparser.o libscanner.o libmdcode.o
+               $(CC) -o lalr_demo lalr_demo.c libparser.o libscanner.o libmdcode.o -licuuc
+       do_lalr_demo: lalr_demo
+               NOTRACE=1 ./lalr_demo
+               @echo 'Should produce "start of line, empty sign, empty sigl"'
+
+####### demo: grammar
+
+       $TERM $ @ + -
+       line -> ${ printf("start of line"); }$
+               | line term
+       term -> sigl IDENTIFIER
+               | sign NUMBER
+       sigl ->   $
+               | @
+               | ${ printf(", empty sigl"); }$
+       sign ->   +
+               | -
+               | ${ printf(", empty sign"); }$
+
+####### demo: header
+
+       //
+
+####### demo: code
+
+       #include <stdlib.h>
+       #include <stdio.h>
+       #include "mdcode.h"
+       #include "scanner.h"
+       #include "parser.h"
+       #include "lalr_demo.h"
+       int main(int argc, char *argv[])
+       {
+               char test[] = "````\n$a -1 5 b\n";
+               struct section *table = code_extract(test, test+sizeof(test)-1, NULL);
+               struct token_config config = {
+                       .ignored = (1 << TK_line_comment)
+                                | (1 << TK_newline)
+                                | (1 << TK_in)
+                                | (1 << TK_out),
+                       .number_chars = "",
+                       .word_start = "",
+                       .word_cont = "",
+               };
+               parse_lalr_demo(table->code, &config, getenv("NOTRACE") ? NULL : stderr);
+               printf("\n");
+               exit(0);
+       }