}
}
-### Setting `can_eol`
+### Setting `can_eol` and `starts_line`
In order to be able to ignore newline tokens when not relevant, but
still include them in the parse when needed, we will need to know
###### symbol fields
int can_eol;
+ int starts_line;
###### functions
static void set_can_eol(struct grammar *g)
}
}
+ /*
+  * Walk the body of every production and, for each pair of adjacent
+  * symbols, set `starts_line` on the second one whenever the first
+  * one has `can_eol` set.  This reads `can_eol`, so it is run after
+  * set_can_eol() has filled that field in.
+  */
+ static void set_starts_line(struct grammar *g)
+ {
+ 	int pnum;
+
+ 	for (pnum = 0; pnum < g->production_count; pnum++) {
+ 		struct production *prod = g->productions[pnum];
+ 		int idx;
+
+ 		/* The last body symbol has no successor, so stop one short. */
+ 		for (idx = 0; idx < prod->body_size - 1; idx++) {
+ 			if (!prod->body[idx]->can_eol)
+ 				continue;
+ 			prod->body[idx + 1]->starts_line = 1;
+ 		}
+ 	}
+ }
+
### Building the `first` sets
When calculating what can follow a particular non-terminal, we will need to
them to a data structure, or freeing them.
static int add_itemset(struct grammar *g, struct symset ss,
- enum grammar_type type, int starts_line)
+ enum grammar_type type)
{
struct itemset **where, *is;
int i;
is->items = ss;
is->next = *where;
is->go_to = INIT_DATASET;
- is->starts_line = starts_line;
*where = is;
return is->state;
}
We also collect a set of all symbols which follow "DOT" (in `done`) as this
is used in the next stage.
+If any of these symbols are flagged as starting a line, then this
+state must be a `starts_line` state so now is a good time to record that.
NOTE: precedence handling should happen here - I haven't written this yet
though.
if (bs == pr->body_size)
continue;
s = pr->body[bs];
- if (symset_find(&done, s->num) < 0)
+ if (symset_find(&done, s->num) < 0) {
symset_add(&done, s->num, 0);
+ if (s->starts_line)
+ is->starts_line = 1;
+ }
if (s->type != Nonterminal)
continue;
again = 1;
for (i = 0; i < done.cnt; i++) {
int j;
unsigned short state;
- int starts_line = 0;
struct symbol *sym = g->symtab[done.syms[i]];
struct symset newitemset = INIT_SYMSET;
if (type >= LALR)
newitemset = INIT_DATASET;
- if (sym->can_eol ||
- (sym->nullable && is->starts_line))
- starts_line = 1;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
}
}
}
- state = add_itemset(g, newitemset, type, starts_line);
+ state = add_itemset(g, newitemset, type);
if (symset_find(&is->go_to, done.syms[i]) < 0)
symset_add(&is->go_to, done.syms[i], state);
}
}
// production 0, offset 0 (with no data)
symset_add(&first, item_num(0, 0), la);
- add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
+ add_itemset(g, first, type);
for (again = 0, is = g->items;
is;
is = is->next ?: again ? (again = 0, g->items) : NULL) {
set_nullable(g);
set_can_eol(g);
+ set_starts_line(g);
if (type >= SLR)
build_first(g);
Firstly we have the complete list of symbols, together with the
"FIRST" set if that was generated. We add a mark to each symbol to
-show if it can end in a newline (`>`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it implies the start of a
+line (`<`), or if it is nullable (`.`).
###### functions
if (!s)
continue;
- printf(" %c%c%3d%c: ",
+ printf(" %c%c%c%3d%c: ",
s->nullable ? '.':' ',
s->can_eol ? '>':' ',
+ s->starts_line ? '<':' ',
s->num, symtypes[s->type]);
prtxt(s->name);
if (s->precedence)
indents in the symbol. These are used to allow indent information to
guide parsing and error recovery.
+`starts_newline` records whether the symbol in this frame is at the
+start of a line (whether indented or not).  So if `starts_newline` is
+non-zero, then this symbol is at the start of a line.
+
`newline_permitted` keeps track of whether newlines should be ignored
or not, and `starts_line` records if this state started on a newline.
-The stack is more properly seen as alternating states and symbols -
+The stack is most properly seen as alternating states and symbols -
states, like the 'DOT' in items, are between symbols. Each frame in
our stack holds a state and the symbol that was before it. The
bottom of stack holds the start state, but no symbol, as nothing came
short sym;
short starts_indented;
short indents;
+ short starts_newline;
} *stack;
void **asn_stack;
int stack_size;
So `shift` finds the next state.  If that succeeds it extends the allocations
if needed and pushes all the information onto the stacks.
+Newlines are permitted after a starts_line state until an internal
+indent. So we need to find the topmost state which `starts_line` and
+see if there are any indents other than immediately after it.
+
+So we walk down:
+
+- if state starts_line, then newlines_permitted.
+- if any non-initial indents, newlines not permitted
+
###### parser functions
static int shift(struct parser *p, struct frame *next,
next->newline_permitted = 0;
if (p->tos)
next->newline_permitted =
- p->stack[p->tos-1].newline_permitted;
- if (next->indents)
+ (p->stack[p->tos-1].newline_permitted?:-1)+1;
+ if (next->indents > next->starts_indented)
+ next->newline_permitted = 0;
+ if (next->indents && next->newline_permitted > 2)
next->newline_permitted = 0;
if (states[newstate].starts_line)
next->newline_permitted = 1;
{
int i;
p->tos -= num;
- next->starts_indented = p->stack[p->tos].starts_indented;
+ next->starts_indented =
+ p->stack[p->tos].starts_indented;
+ next->starts_newline =
+ p->stack[p->tos].starts_newline;
next->indents = 0;
for (i = 0; i < num; i++) {
next->indents += p->stack[p->tos+i].indents;
int accepted = 0;
void *ret = NULL;
+ next.starts_newline = 1;
shift(&p, &next, NULL, states);
while (!accepted) {
struct token *err_tk;
if (next.sym == TK_in) {
next.starts_indented = 1;
next.indents = 1;
+ next.starts_newline = 1;
free(tk);
tk = NULL;
parser_trace_action(trace, "Record");
if (states[tos->state].starts_line)
tos->newline_permitted = 1;
else if (p.tos > 1)
- tos->newline_permitted = p.stack[p.tos-2].newline_permitted;
+ tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1;
}
free(tk);
tk = NULL;
}
}
if (shift(&p, &next, tk, states)) {
- tk = NULL;
+ next.starts_newline =
+ tk->num == TK_newline;
next.starts_indented = 0;
next.indents = 0;
+ tk = NULL;
parser_trace_action(trace, "Shift");
continue;
}
else {
frame.indents = next.indents;
frame.starts_indented = frame.indents;
+ frame.starts_newline = 0;
next.indents = 0;
next.starts_indented = 0;
}
if (states[f->state].starts_line)
fprintf(trace, "s");
if (f->newline_permitted)
- fprintf(trace, "n");
+ fprintf(trace, "n%d", f->newline_permitted);
fprintf(trace, ") ");
}
if (f->indents)
fprintf(trace, "%c%d", f->starts_indented?':':'.',
f->indents);
+ if (f->starts_newline)
+ fputs("/", trace);
fputs(" ", trace);
}
parser_trace_state(trace, f, states);
if (n->indents)
fprintf(trace, "%c%d", n->starts_indented?':':'.',
n->indents);
+ if (n->starts_newline)
+ fputs("/", trace);
fputs("]", trace);
}