both a success flag and a pointer to where in the list an insert should
happen, so we don't need to search a second time.
+Each state will often have at most one reducible production, but
+occasionally will have more. If there are more we will need a full
+action table (merged in the go to table). If there is one, we record it
+in the state. If none, we record that too. These special values will
+be needed both in the generator and in the parser, so they need to be
+declared twice.
+
+###### exported types
+ enum {
+ NONE_REDUCIBLE = -1,
+ MANY_REDUCIBLE = -2,
+ };
###### declarations
+ enum {
+ NONE_REDUCIBLE = -1,
+ MANY_REDUCIBLE = -2,
+ };
struct itemset {
struct itemset *next;
short state;
+ short reducible_prod; /* or NONE_REDUCIBLE or MANY_REDUCIBLE */
struct symset items;
struct symset go_to;
enum assoc assoc;
struct symset LA = INIT_SYMSET;
unsigned short sn = 0;
- if (bs == pr->body_size)
+ if (i == 0)
+ is->reducible_prod = NONE_REDUCIBLE;
+ if (bs == pr->body_size) {
+ if (is->reducible_prod == NONE_REDUCIBLE)
+ is->reducible_prod = p;
+ else
+ is->reducible_prod = MANY_REDUCIBLE;
continue;
+ }
s = pr->body[bs];
if (s->precedence && is->precedence &&
is->precedence > s->precedence)
{
gen_known(f, g);
gen_non_term(f, g);
+ gen_action(f, g);
gen_goto(f, g);
gen_states(f, g);
gen_reductions(f, g);
fprintf(f, "};\n\n");
}
+### The Action table
+
+For this parser we do not have a separate action table. The action
+table normally maps terminals to one of the actions SHIFT, REDUCE(p), or
+ACCEPT. We can find which terminals to SHIFT via a lookup in the go to
+table, and most states have at most one reducible production and in that
+case to effect that reduction for any non-shiftable terminal in the
+look-ahead.
+
+However, with SLR, LALR, and canonical-LR grammars, there may
+occasionally be multiple possible reductions which we choose between
+based on the look-ahead. To handle this possibility we allow reductions
+to be stored in the go to table. An extra flag indicates if an entry is
+a state to go to, or a production to reduce. We add the entries with
+that flag here, if necessary.
+
+###### functions
+ #define IS_REDUCTION (0x8000)
+
+ static void gen_action(FILE *f, struct grammar *g)
+ {
+ int i, j, k;
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ struct symset *gt = &is->go_to;
+
+ if (is->reducible_prod != MANY_REDUCIBLE)
+ continue;
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_dot(itm);
+ struct production *pr = g->productions[p];
+ struct symset la;
+ if (bp != pr->body_size)
+ continue;
+ /* This is a reducible item */
+ if (g->follow)
+ la = g->follow[pr->head->num];
+ else
+ la = set_find(g, is->items.data[j]);
+ for (k = 0 ; k < la.cnt; k++)
+ symset_add(gt, la.syms[k], p | IS_REDUCTION);
+ }
+ }
+ }
+
### States, reductions, and the go to tables.
For each state we record the go to table and the reducible production if
struct lookup {
short sym;
- short state;
+ short state_or_reduction:15;
+ unsigned short is_reduction:1;
};
struct reduction {
short size;
short result_size;
};
struct state {
- short reduce_prod;
+ short reduce_prod; // or NONE_REDUCIBLE or MANY_REDUCIBLE
short go_to_cnt;
- const struct lookup * go_to;
+ const struct lookup * go_to; // includes reductions
};
###### functions
continue;
fprintf(f, "static const struct lookup goto_%d[] = {\n",
i);
- for (j = 0; j < gt.cnt; j++)
- fprintf(f, "\t{ %d, %d },\n",
- gt.syms[j], gt.data[j]);
+ for (j = 0; j < gt.cnt; j++) {
+ if (gt.data[j] & IS_REDUCTION)
+ fprintf(f, "\t{ %d, %d, 1 },\n",
+ gt.syms[j], gt.data[j]&~IS_REDUCTION);
+ else
+ fprintf(f, "\t{ %d, %d, 0 },\n",
+ gt.syms[j], gt.data[j]);
+ }
fprintf(f, "};\n");
}
}
fprintf(f, "static const struct state states[] = {\n");
for (i = 0; i < g->states; i++) {
struct itemset *is = g->statetab[i];
- int j, prod = -1, prod_len;
-
- for (j = 0; j < is->items.cnt; j++) {
- int itm = is->items.syms[j];
- int p = item_prod(itm);
- int bp = item_dot(itm);
- struct production *pr = g->productions[p];
+ int prod = is->reducible_prod;
- if (bp < pr->body_size)
- continue;
- /* This is what we reduce - choose longest */
- if (prod < 0 || prod_len < pr->body_size) {
- prod = p;
- prod_len = pr->body_size;
- }
- }
if (is->go_to.cnt)
fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
i, prod, is->go_to.cnt, i);
###### parser functions
- static int search(const struct state *l, int sym)
+ static int search(const struct state *l, int sym, int reduction)
{
int lo = 0;
int hi = l->go_to_cnt;
else
hi = mid;
}
- if (l->go_to[lo].sym == sym)
- return l->go_to[lo].state;
- else
+ if (l->go_to[lo].sym != sym)
return -1;
+ if (!reduction == !l->go_to[lo].is_reduction)
+ return l->go_to[lo].state_or_reduction;
+ return -1;
}
### Memory allocation
struct frame next = {0};
int newstate = p->tos
? search(&states[p->stack[p->tos-1].state],
- sym)
+ sym, 0)
: 0;
if (newstate < 0)
return 0;
continue;
}
- if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
- /* No indent expected here and reduce is not mandatory, so ignore IN */
- free(tk);
- tk = NULL;
- p.ignored_indents <<= 1;
- p.ignored_indents |= 1;
- parser_trace_action(trace, "Ignore");
- continue;
+ if (tk->num == TK_in) {
+ int should_reduce;
+ if (states[tos->state].reduce_prod == MANY_REDUCIBLE)
+ /* Only reduce if TK_in in lookahead */
+ should_reduce = (search(&states[tos->state], TK_in, 1) >= 0);
+ else
+ /* Only reduce if we cannot shift anything */
+ should_reduce = (states[tos->state].go_to_cnt == 0);
+ if (!should_reduce) {
+ /* No indent expected here and reduce is not indicated,
+ * so ignore IN
+ */
+ free(tk);
+ tk = NULL;
+ p.ignored_indents <<= 1;
+ p.ignored_indents |= 1;
+ parser_trace_action(trace, "Ignore");
+ continue;
+ }
}
We have already discussed the bulk of the handling of a "reduce" action,
###### parser vars
void *ret = NULL;
+ int reduction;
###### try reduce
- if (states[tos->state].reduce_prod >= 0) {
+ reduction = states[tos->state].reduce_prod;
+ if (reduction == MANY_REDUCIBLE)
+ reduction = search(&states[tos->state], tk->num, 1);
+ if (reduction >= 0) {
void **body;
void *res;
- const struct state *nextstate = &states[tos->state];
- int prod = nextstate->reduce_prod;
- int size = reductions[prod].size;
- int res_size = reductions[prod].result_size;
+ int size = reductions[reduction].size;
+ int res_size = reductions[reduction].result_size;
body = p.asn_stack + (p.tos - size);
res = res_size ? calloc(1, res_size) : NULL;
- res_size = do_reduce(prod, body, config, res);
- if (res_size != reductions[prod].result_size)
+ res_size = do_reduce(reduction, body, config, res);
+ if (res_size != reductions[reduction].result_size)
abort();
pop(&p, size, do_free);
- if (!shift(&p, reductions[prod].sym, res, states)) {
+ if (!shift(&p, reductions[reduction].sym, res, states)) {
accepted = 1;
ret = res;
parser_trace_action(trace, "Accept");
355//113
error
+
+## A second example
+
+I had originally thought that the LR05 style grammar would suit me there
+should never be a sequence of token that means either of two different
+things depending on what comes follows it. Without that ambiguity, the
+extra strength of LALR, or even SLR, would not be needed. However I
+discovered this was too simplistic. There are times when it is quite
+reasonable for the empty string, or maybe a NEWLINE, to have different
+meanings depending on the next token. For this, LR05 is not sufficient,
+so I added full action-table support.
+
+This example tests the selection of REDUCE actions based on the
+look-ahead symbol, using a trivial grammar where the empty string can
+reasonably mean different things depending on what it precedes.
+
+
+####### File: parsergen.mk
+ demos :: do_lalr_demo
+ lalr_demo.c lalr_demo.h : parsergen parsergen.mdc
+ ./parsergen --LALR --tag demo -o lalr_demo parsergen.mdc
+ lalr_demo: lalr_demo.o libparser.o libscanner.o libmdcode.o
+ $(CC) -o lalr_demo lalr_demo.c libparser.o libscanner.o libmdcode.o -licuuc
+ do_lalr_demo: lalr_demo
+ NOTRACE=1 ./lalr_demo
+ @echo 'Should produce "start of line, empty sign, empty sigl"'
+
+####### demo: grammar
+
+ $TERM $ @ + -
+ line -> ${ printf("start of line"); }$
+ | line term
+ term -> sigl IDENTIFIER
+ | sign NUMBER
+ sigl -> $
+ | @
+ | ${ printf(", empty sigl"); }$
+ sign -> +
+ | -
+ | ${ printf(", empty sign"); }$
+
+####### demo: header
+
+ //
+
+####### demo: code
+
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ #include "parser.h"
+ #include "lalr_demo.h"
+ int main(int argc, char *argv[])
+ {
+ char test[] = "````\n$a -1 5 b\n";
+ struct section *table = code_extract(test, test+sizeof(test)-1, NULL);
+ struct token_config config = {
+ .ignored = (1 << TK_line_comment)
+ | (1 << TK_newline)
+ | (1 << TK_in)
+ | (1 << TK_out),
+ .number_chars = "",
+ .word_start = "",
+ .word_cont = "",
+ };
+ parse_lalr_demo(table->code, &config, getenv("NOTRACE") ? NULL : stderr);
+ printf("\n");
+ exit(0);
+ }