From c1e21eb4619c6e86bfebf324eaf6becbc25450bd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 2 Oct 2014 20:42:04 +1000 Subject: [PATCH] parsergen: various updates. - add starts_line flag for symbols - add starts_newline flag for stack frames and related changes Signed-off-by: NeilBrown --- csrc/parsergen.mdc | 83 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 19 deletions(-) diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 9306592..2aa296d 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -869,7 +869,7 @@ changes happen. } } -### Setting `can_eol` +### Setting `can_eol` and `starts_line` In order to be able to ignore newline tokens when not relevant, but still include them in the parse when needed, we will need to know @@ -891,6 +891,7 @@ compute it in a repetitive manner similar to `set_nullable`. ###### symbol fields int can_eol; + int starts_line; ###### functions static void set_can_eol(struct grammar *g) @@ -920,6 +921,19 @@ compute it in a repetitive manner similar to `set_nullable`. } } + static void set_starts_line(struct grammar *g) + { + int p; + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int s; + + for (s = 0; s < pr->body_size - 1; s++) + if (pr->body[s]->can_eol) + pr->body[s+1]->starts_line = 1; + } + } + ### Building the `first` sets When calculating what can follow a particular non-terminal, we will need to @@ -1198,7 +1212,7 @@ recalculated and their LA sets updated. them to a data structure, of freeing them. static int add_itemset(struct grammar *g, struct symset ss, - enum grammar_type type, int starts_line) + enum grammar_type type) { struct itemset **where, *is; int i; @@ -1210,7 +1224,6 @@ them to a data structure, of freeing them. is->items = ss; is->next = *where; is->go_to = INIT_DATASET; - is->starts_line = starts_line; *where = is; return is->state; } @@ -1252,6 +1265,8 @@ be supplemented by the LA set for the item which produce the new item. We also collect a set of all symbols which follow "DOT" (in `done`) as this is used in the next stage. +If any of these symbols are flagged as starting a line, then this +state must be a `starts_line` state so now is a good time to record that. NOTE: precedence handling should happen here - I haven't written this yet though. @@ -1269,8 +1284,11 @@ though. if (bs == pr->body_size) continue; s = pr->body[bs]; - if (symset_find(&done, s->num) < 0) + if (symset_find(&done, s->num) < 0) { symset_add(&done, s->num, 0); + if (s->starts_line) + is->starts_line = 1; + } if (s->type != Nonterminal) continue; again = 1; @@ -1324,15 +1342,11 @@ with a pre-existing itemset). for (i = 0; i < done.cnt; i++) { int j; unsigned short state; - int starts_line = 0; struct symbol *sym = g->symtab[done.syms[i]]; struct symset newitemset = INIT_SYMSET; if (type >= LALR) newitemset = INIT_DATASET; - if (sym->can_eol || - (sym->nullable && is->starts_line)) - starts_line = 1; for (j = 0; j < is->items.cnt; j++) { int itm = is->items.syms[j]; int p = item_prod(itm); @@ -1365,7 +1379,7 @@ with a pre-existing itemset). } } } - state = add_itemset(g, newitemset, type, starts_line); + state = add_itemset(g, newitemset, type); if (symset_find(&is->go_to, done.syms[i]) < 0) symset_add(&is->go_to, done.syms[i], state); } @@ -1389,7 +1403,7 @@ with `TK_eof` as the LA set. } // production 0, offset 0 (with no data) symset_add(&first, item_num(0, 0), la); - add_itemset(g, first, type, g->productions[0]->body[0]->can_eol); + add_itemset(g, first, type); for (again = 0, is = g->items; is; is = is->next ?: again ? (again = 0, g->items) : NULL) { @@ -1450,6 +1464,7 @@ changeover point in `first_nonterm`. set_nullable(g); set_can_eol(g); + set_starts_line(g); if (type >= SLR) build_first(g); @@ -1481,7 +1496,8 @@ all the tables that have been generated, plus a description of any conflicts. Firstly we have the complete list of symbols, together with the "FIRST" set if that was generated. We add a mark to each symbol to -show if it can end in a newline (`>`), or if it is nullable (`.`). +show if it can end in a newline (`>`), if it implies the start of a +line (`<`), or if it is nullable (`.`). ###### functions @@ -1498,9 +1514,10 @@ show if it can end in a newline (`>`), or if it is nullable (`.`). if (!s) continue; - printf(" %c%c%3d%c: ", + printf(" %c%c%c%3d%c: ", s->nullable ? '.':' ', s->can_eol ? '>':' ', + s->starts_line ? '<':' ', s->num, symtypes[s->type]); prtxt(s->name); if (s->precedence) @@ -2415,10 +2432,14 @@ The `indents` count and the `starts_indented` flag track the line indents in the symbol. These are used to allow indent information to guide parsing and error recovery. +`since_newline` tracks how many stack frames since the last +start-of-line (whether indented or not). So if `since_newline` is +zero, then this symbol is at the start of a line. + `newline_permitted` keeps track of whether newlines should be ignored or not, and `starts_line` records if this state stated on a newline. -The stack is more properly seen as alternating states and symbols - +The stack is most properly seen as alternating states and symbols - states, like the 'DOT' in items, are between symbols. Each frame in our stack holds a state and the symbol that was before it. The bottom of stack holds the start state, but no symbol, as nothing came @@ -2434,6 +2455,7 @@ before the beginning. short sym; short starts_indented; short indents; + short starts_newline; } *stack; void **asn_stack; int stack_size; @@ -2462,6 +2484,15 @@ stack is empty, it always chooses zero as the next state. So `shift` finds the next state. If that succeed it extends the allocations if needed and pushes all the information onto the stacks. +Newlines are permitted after a starts_line state until an internal +indent. So we need to find the topmost state which `starts_line` and +see if there are any indents other than immediately after it. + +So we walk down: + +- if state starts_line, then newlines_permitted. +- if any non-initial indents, newlines not permitted + ###### parser functions static int shift(struct parser *p, struct frame *next, @@ -2486,8 +2517,10 @@ if needed and pushes all the information onto the stacks. next->newline_permitted = 0; if (p->tos) next->newline_permitted = - p->stack[p->tos-1].newline_permitted; - if (next->indents) + (p->stack[p->tos-1].newline_permitted?:-1)+1; + if (next->indents > next->starts_indented) + next->newline_permitted = 0; + if (next->indents && next->newline_permitted > 2) next->newline_permitted = 0; if (states[newstate].starts_line) next->newline_permitted = 1; @@ -2512,7 +2545,10 @@ removed. It is called _after_ we reduce a production, just before we { int i; p->tos -= num; - next->starts_indented = p->stack[p->tos].starts_indented; + next->starts_indented = + p->stack[p->tos].starts_indented; + next->starts_newline = + p->stack[p->tos].starts_newline; next->indents = 0; for (i = 0; i < num; i++) { next->indents += p->stack[p->tos+i].indents; @@ -2594,6 +2630,7 @@ since the last state which could have been at the start of a line. int accepted = 0; void *ret = NULL; + next.starts_newline = 1; shift(&p, &next, NULL, states); while (!accepted) { struct token *err_tk; @@ -2606,6 +2643,7 @@ since the last state which could have been at the start of a line. if (next.sym == TK_in) { next.starts_indented = 1; next.indents = 1; + next.starts_newline = 1; free(tk); tk = NULL; parser_trace_action(trace, "Record"); @@ -2621,7 +2659,7 @@ since the last state which could have been at the start of a line. if (states[tos->state].starts_line) tos->newline_permitted = 1; else if (p.tos > 1) - tos->newline_permitted = p.stack[p.tos-2].newline_permitted; + tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1; } free(tk); tk = NULL; @@ -2640,9 +2678,11 @@ since the last state which could have been at the start of a line. } } if (shift(&p, &next, tk, states)) { - tk = NULL; + next.starts_newline = + tk->num == TK_newline; next.starts_indented = 0; next.indents = 0; + tk = NULL; parser_trace_action(trace, "Shift"); continue; } @@ -2666,6 +2706,7 @@ since the last state which could have been at the start of a line. else { frame.indents = next.indents; frame.starts_indented = frame.indents; + frame.starts_newline = 0; next.indents = 0; next.starts_indented = 0; } @@ -2774,7 +2815,7 @@ end inside square brackets. if (states[f->state].starts_line) fprintf(trace, "s"); if (f->newline_permitted) - fprintf(trace, "n"); + fprintf(trace, "n%d", f->newline_permitted); fprintf(trace, ") "); } @@ -2801,6 +2842,8 @@ end inside square brackets. if (f->indents) fprintf(trace, "%c%d", f->starts_indented?':':'.', f->indents); + if (f->starts_newline) + fputs("/", trace); fputs(" ", trace); } parser_trace_state(trace, f, states); @@ -2814,6 +2857,8 @@ end inside square brackets. if (n->indents) fprintf(trace, "%c%d", n->starts_indented?':':'.', n->indents); + if (n->starts_newline) + fputs("/", trace); fputs("]", trace); } -- 2.43.0