}
}
-### Setting `can_eol`
+### Setting `can_eol` and `line_like`
In order to be able to ignore newline tokens when not relevant, but
still include them in the parse when needed, we will need to know
which states can start a "line-like" section of code. We ignore
newlines when there is an indent since the most recent start of a
-line-like section.
+line-like symbol.
-To know what is line-like, we first need to know which symbols can end
-a line-like section, which is precisely those which can end with a
-newline token. These symbols don't necessarily alway end with a
-newline, but they can. Hence they are not described as "lines" but
-only "line-like".
+To know which symbols are line-like, we first need to know which
+symbols start with a NEWLINE token. Any symbol which is followed by a
+NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
+Certainly when trying to parse one of these we must take not of NEWLINEs.
-Clearly the `TK_newline` token can end with a newline. Any symbol
-which is the head of a production that contains a line-ending symbol
-followed only by nullable symbols is also a line-ending symbol. We
-use a new field `can_eol` to record this attribute of symbols, and
-compute it in a repetitive manner similar to `set_nullable`.
+Clearly the `TK_newline` token can start with a NEWLINE. Any symbol
+which is the head of a production that contains a starts-with-NEWLINE
+symbol preceeded only by nullable symbols is also a
+starts-with-NEWLINE symbol. We use a new field `can_eol` to record
+this attribute of symbols, and compute it in a repetitive manner
+similar to `set_nullable`.
+
+Once we have that, we can determine which symbols are `line_like` be
+seeing which are followed by a `can_eol` symbol in any production.
###### symbol fields
int can_eol;
+ int line_like;
###### functions
static void set_can_eol(struct grammar *g)
if (pr->head->can_eol)
continue;
- for (s = pr->body_size - 1; s >= 0; s--) {
+ for (s = 0 ; s < pr->body_size; s++) {
if (pr->body[s]->can_eol) {
pr->head->can_eol = 1;
check_again = 1;
}
}
+ static void set_line_like(struct grammar *g)
+ {
+ int p;
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int s;
+
+ for (s = 1; s < pr->body_size; s++)
+ if (pr->body[s]->can_eol)
+ pr->body[s-1]->line_like = 1;
+ }
+ }
+
### Building the `first` sets
When calculating what can follow a particular non-terminal, we will need to
struct symset go_to;
char completed;
char starts_line;
+ int min_prefix;
};
###### grammar fields
them to a data structure, of freeing them.
static int add_itemset(struct grammar *g, struct symset ss,
- enum grammar_type type, int starts_line)
+ enum grammar_type type)
{
struct itemset **where, *is;
int i;
is->items = ss;
is->next = *where;
is->go_to = INIT_DATASET;
- is->starts_line = starts_line;
*where = is;
return is->state;
}
We also collect a set of all symbols which follow "DOT" (in `done`) as this
is used in the next stage.
+If any of these symbols are flagged as starting a line, then this
+state must be a `starts_line` state so now is a good time to record that.
NOTE: precedence handling should happen here - I haven't written this yet
though.
struct symset LA = INIT_SYMSET;
unsigned short sn = 0;
+ if (is->min_prefix == 0 ||
+ (bs > 0 && bs < is->min_prefix))
+ is->min_prefix = bs;
if (bs == pr->body_size)
continue;
s = pr->body[bs];
- if (symset_find(&done, s->num) < 0)
+ if (symset_find(&done, s->num) < 0) {
symset_add(&done, s->num, 0);
+ if (s->line_like)
+ is->starts_line = 1;
+ }
if (s->type != Nonterminal)
continue;
again = 1;
for (i = 0; i < done.cnt; i++) {
int j;
unsigned short state;
- int starts_line = 0;
struct symbol *sym = g->symtab[done.syms[i]];
struct symset newitemset = INIT_SYMSET;
if (type >= LALR)
newitemset = INIT_DATASET;
- if (sym->can_eol ||
- (sym->nullable && is->starts_line))
- starts_line = 1;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
}
}
}
- state = add_itemset(g, newitemset, type, starts_line);
+ state = add_itemset(g, newitemset, type);
if (symset_find(&is->go_to, done.syms[i]) < 0)
symset_add(&is->go_to, done.syms[i], state);
}
}
// production 0, offset 0 (with no data)
symset_add(&first, item_num(0, 0), la);
- add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
+ add_itemset(g, first, type);
for (again = 0, is = g->items;
is;
is = is->next ?: again ? (again = 0, g->items) : NULL) {
set_nullable(g);
set_can_eol(g);
+ set_line_like(g);
if (type >= SLR)
build_first(g);
Firstly we have the complete list of symbols, together with the
"FIRST" set if that was generated. We add a mark to each symbol to
-show if it can end in a newline (`>`), or if it is nullable (`.`).
+show if it can end in a newline (`>`), if it implies the start of a
+line (`<`), or if it is nullable (`.`).
###### functions
if (!s)
continue;
- printf(" %c%c%3d%c: ",
+ printf(" %c%c%c%3d%c: ",
s->nullable ? '.':' ',
s->can_eol ? '>':' ',
+ s->line_like ? '<':' ',
s->num, symtypes[s->type]);
prtxt(s->name);
if (s->precedence)
for (s = 0; s < g->states; s++) {
int j;
struct itemset *is = g->statetab[s];
- printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
+ printf(" Itemset %d:%s min prefix=%d\n",
+ s, is->starts_line?" (startsline)":"", is->min_prefix);
for (j = 0; j < is->items.cnt; j++) {
report_item(g, is->items.syms[j]);
if (is->items.data != NO_DATA)
short reduce_sym;
short shift_sym;
short starts_line;
+ short min_prefix;
};
}
if (prod >= 0)
- fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
+ fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
i, is->go_to.cnt, i, prod,
g->productions[prod]->body_size,
g->productions[prod]->head->num,
- is->starts_line);
+ is->starts_line, is->min_prefix);
else
- fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
+ fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
i, is->go_to.cnt, i, shift_sym,
- is->starts_line);
+ is->starts_line, is->min_prefix);
}
fprintf(f, "};\n\n");
}
freeing function. The symbol leads us to the right free function through
`do_free`.
-The `indents` count and the `starts_indented` flag track the line
-indents in the symbol. These are used to allow indent information to
-guide parsing and error recovery.
+The `indents` count tracks the line indents in the symbol. These are
+used to allow indent information to guide parsing and error recovery.
+
+`since_newline` tracks how many stack frames since the last
+start-of-line (whether indented or not). So if `since_newline` is
+zero, then this symbol is at the start of a line.
`newline_permitted` keeps track of whether newlines should be ignored
or not, and `starts_line` records if this state stated on a newline.
-As well as the stack of frames we have a `next` frame which is
-assembled from the incoming token and other information prior to
-pushing it onto the stack.
+The stack is most properly seen as alternating states and symbols -
+states, like the 'DOT' in items, are between symbols. Each frame in
+our stack holds a state and the symbol that was before it. The
+bottom of stack holds the start state, but no symbol, as nothing came
+before the beginning.
###### parser functions
struct parser {
struct frame {
short state;
+ short newline_permitted;
+
short sym;
- short starts_indented;
short indents;
- short newline_permitted;
- } *stack, next;
+ short since_newline;
+ short since_indent;
+ } *stack;
void **asn_stack;
int stack_size;
int tos;
reduce a production we will pop off entries corresponding to the body
symbols, then push on an item for the head of the production. This last is
exactly the same process as shifting in a terminal so we use the same
-function for both.
+function for both. In both cases we provide a stack frame which
+contains the symbol to shift and related indent information.
To simplify other code we arrange for `shift` to fail if there is no `goto`
state for the symbol. This is useful in basic parsing due to our design
that we shift when we can, and reduce when we cannot. So the `shift`
function reports if it could.
+`shift` is also used to push state zero onto the stack, so if the
+stack is empty, it always chooses zero as the next state.
+
So `shift` finds the next state. If that succeed it extends the allocations
if needed and pushes all the information onto the stacks.
+Newlines are permitted after a starts_line state until an internal
+indent. So we need to find the topmost state which `starts_line` and
+see if there are any indents other than immediately after it.
+
+So we walk down:
+
+- if state starts_line, then newlines_permitted.
+- if any non-initial indents, newlines not permitted
+
###### parser functions
static int shift(struct parser *p,
+ short sym, short indents, short start_of_line,
void *asn,
const struct state states[])
{
// Push an entry onto the stack
- int newstate = search(&states[p->next.state], p->next.sym);
+ struct frame next = {0};
+ int newstate = p->tos
+ ? search(&states[p->stack[p->tos-1].state],
+ sym)
+ : 0;
if (newstate < 0)
return 0;
if (p->tos >= p->stack_size) {
p->asn_stack = realloc(p->asn_stack, p->stack_size
* sizeof(p->asn_stack[0]));
}
- p->stack[p->tos] = p->next;
+ next.sym = sym;
+ next.indents = indents;
+ next.state = newstate;
+ if (states[newstate].starts_line)
+ next.newline_permitted = 1;
+ else if (indents)
+ next.newline_permitted = 0;
+ else if (p->tos)
+ next.newline_permitted =
+ p->stack[p->tos-1].newline_permitted;
+ else
+ next.newline_permitted = 0;
+
+ if (!start_of_line) {
+ if (p->tos)
+ next.since_newline = p->stack[p->tos-1].since_newline + 1;
+ else
+ next.since_newline = 1;
+ }
+ if (indents)
+ next.since_indent = 0;
+ else if (p->tos)
+ next.since_indent = p->stack[p->tos-1].since_indent + 1;
+ else
+ next.since_indent = 1;
+
+ p->stack[p->tos] = next;
p->asn_stack[p->tos] = asn;
p->tos++;
- p->next.state = newstate;
- p->next.indents = 0;
- p->next.starts_indented = 0;
- // if new state doesn't start a line, we inherit newline_permitted status
- if (states[newstate].starts_line)
- p->next.newline_permitted = 1;
return 1;
}
-`pop` simply moves the top of stack (`tos`) back down the required amount
-and frees any `asn` entries that need to be freed. It is called _after_ we
-reduce a production, just before we `shift` the nonterminal in.
+`pop` primarily moves the top of stack (`tos`) back down the required
+amount and frees any `asn` entries that need to be freed. It also
+collects a summary of the indents in the symbols that are being
+removed. It is called _after_ we reduce a production, just before we
+`shift` the nonterminal in.
+
+`pop` is only called if there are entries to remove, so `num` is never zero.
###### parser functions
- static void pop(struct parser *p, int num,
- void(*do_free)(short sym, void *asn))
+ static int pop(struct parser *p, int num,
+ short *start_of_line,
+ void(*do_free)(short sym, void *asn))
{
int i;
+ short indents = 0;
+ int sol = 0;
+
p->tos -= num;
for (i = 0; i < num; i++) {
- p->next.indents += p->stack[p->tos+i].indents;
+ sol |= !p->stack[p->tos+1].since_newline;
+ indents += p->stack[p->tos+i].indents;
do_free(p->stack[p->tos+i].sym,
p->asn_stack[p->tos+i]);
}
-
- if (num) {
- p->next.state = p->stack[p->tos].state;
- p->next.starts_indented = p->stack[p->tos].starts_indented;
- p->next.newline_permitted = p->stack[p->tos].newline_permitted;
- if (p->next.indents > p->next.starts_indented)
- p->next.newline_permitted = 0;
- }
+ if (start_of_line)
+ *start_of_line = sol;
+ return indents;
}
### Memory allocation
### The heart of the parser.
-Now we have the parser. If we can shift, we do. If not and we can reduce
-we do. If the production we reduced was production zero, then we have
+Now we have the parser. If we can shift, we do, though newlines and
+reducing indenting may block that. If not and we can reduce we do.
+If the production we reduced was production zero, then we have
accepted the input and can finish.
We return whatever `asn` was returned by reducing production zero.
int accepted = 0;
void *ret = NULL;
- p.next.newline_permitted = states[0].starts_line;
+ shift(&p, TK_eof, 0, 1, NULL, states);
while (!accepted) {
struct token *err_tk;
+ struct frame *tos = &p.stack[p.tos-1];
if (!tk)
tk = tok_copy(token_next(tokens));
- p.next.sym = tk->num;
- if (trace)
- parser_trace(trace, &p, tk, states, non_term, config->known_count);
-
- if (p.next.sym == TK_in) {
- p.next.starts_indented = 1;
- p.next.indents = 1;
+ parser_trace(trace, &p,
+ tk, states, non_term, config->known_count);
+
+ if (tk->num == TK_in) {
+ tos->indents += 1;
+ tos->since_newline = 0;
+ tos->since_indent = 0;
+ if (!states[tos->state].starts_line)
+ tos->newline_permitted = 0;
free(tk);
tk = NULL;
+ parser_trace_action(trace, "Record");
continue;
}
- if (p.next.sym == TK_out) {
- if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
- (p.stack[p.tos-1].indents == 1 &&
- states[p.next.state].reduce_size > 1)) {
- p.stack[p.tos-1].indents -= 1;
- if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
- // no internal indent any more, reassess 'newline_permitted'
- if (states[p.stack[p.tos-1].state].starts_line)
- p.stack[p.tos-1].newline_permitted = 1;
- else if (p.tos > 1)
- p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
+ if (tk->num == TK_out) {
+ if (states[tos->state].reduce_size >= 0 &&
+ states[tos->state].reduce_size <= tos->since_indent)
+ goto force_reduce;
+ if (states[tos->state].min_prefix >= tos->since_indent) {
+ // OK to cancel
+ struct frame *in = tos - tos->since_indent;
+ in->indents -= 1;
+ if (in->indents == 0) {
+ /* Reassess since_indent and newline_permitted */
+ if (in > p.stack) {
+ in->since_indent = in[-1].since_indent + 1;
+ in->newline_permitted = in[-1].newline_permitted;
+ } else {
+ in->since_indent = 0;
+ in->newline_permitted = 0;
+ }
+ if (states[in->state].starts_line)
+ in->newline_permitted = 1;
+ while (in < tos) {
+ in += 1;
+ in->since_indent = in[-1].since_indent + 1;
+ if (states[in->state].starts_line)
+ in->newline_permitted = 1;
+ else
+ in->newline_permitted = in[-1].newline_permitted;
+ }
}
free(tk);
tk = NULL;
+ parser_trace_action(trace, "Cancel");
continue;
}
// fall through and force a REDUCE (as 'shift'
// will fail).
}
- if (p.next.sym == TK_newline) {
- if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
+ if (tk->num == TK_newline) {
+ if (!tos->newline_permitted) {
free(tk);
tk = NULL;
+ parser_trace_action(trace, "Discard");
continue;
}
+ if (tos->since_newline > 1 &&
+ states[tos->state].reduce_size >= 0 &&
+ states[tos->state].reduce_size <= tos->since_newline)
+ goto force_reduce;
}
- if (shift(&p, tk, states)) {
+ if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
tk = NULL;
+ parser_trace_action(trace, "Shift");
continue;
}
- if (states[p.next.state].reduce_prod >= 0) {
+ force_reduce:
+ if (states[tos->state].reduce_prod >= 0) {
void **body;
- int prod = states[p.next.state].reduce_prod;
- int size = states[p.next.state].reduce_size;
+ void *res;
+ const struct state *nextstate = &states[tos->state];
+ int prod = nextstate->reduce_prod;
+ int size = nextstate->reduce_size;
int bufsize;
static char buf[16*1024];
- p.next.sym = states[p.next.state].reduce_sym;
+ short indents, start_of_line;
- body = p.asn_stack +
- (p.tos - states[p.next.state].reduce_size);
+ body = p.asn_stack + (p.tos - size);
bufsize = do_reduce(prod, body, config, buf);
- pop(&p, size, do_free);
- shift(&p, memdup(buf, bufsize), states);
- if (prod == 0)
+ if (size)
+ indents = pop(&p, size, &start_of_line,
+ do_free);
+ else {
+ indents = 0;
+ start_of_line = 0;
+ }
+ res = memdup(buf, bufsize);
+ memset(buf, 0, bufsize);
+ if (!shift(&p, nextstate->reduce_sym,
+ indents, start_of_line,
+ res, states)) {
+ if (prod != 0) abort();
accepted = 1;
+ ret = res;
+ }
+ parser_trace_action(trace, "Reduce");
continue;
}
if (tk->num == TK_out) {
// Indent problem - synthesise tokens to get us
// out of here.
- fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
- p.next.sym = states[p.next.state].shift_sym;
- shift(&p, tok_copy(*tk), states);
+ fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
+ shift(&p, states[tos->state].shift_sym,
+ 0, 1, tok_copy(*tk), states);
// FIXME need to report this error somehow
+ parser_trace_action(trace, "Synthesize");
continue;
}
/* Error. We walk up the stack until we
* Then we discard input tokens until
* we find one that is acceptable.
*/
+ parser_trace_action(trace, "ERROR");
+ short indents = 0, start_of_line;
err_tk = tok_copy(*tk);
- p.next.sym = TK_error;
- while (shift(&p, err_tk, states) == 0
+ while (shift(&p, TK_error, 0, 0,
+ err_tk, states) == 0
&& p.tos > 0)
// discard this state
- pop(&p, 1, do_free);
+ indents += pop(&p, 1, &start_of_line, do_free);
if (p.tos == 0) {
free(err_tk);
// no state accepted TK_error
break;
}
- while (search(&states[p.next.state], tk->num) < 0 &&
+ tos = &p.stack[p.tos-1];
+ while (search(&states[tos->state], tk->num) < 0 &&
tk->num != TK_eof) {
free(tk);
tk = tok_copy(token_next(tokens));
if (tk->num == TK_in)
- p.next.indents += 1;
+ indents += 1;
if (tk->num == TK_out) {
- if (p.next.indents == 0)
+ if (indents == 0)
break;
- p.next.indents -= 1;
+ indents -= 1;
+ // FIXME update since_indent here
}
}
if (p.tos == 0 && tk->num == TK_eof)
break;
+ tos = &p.stack[p.tos-1];
+ tos->indents += indents;
}
free(tk);
- if (accepted)
- ret = p.asn_stack[0];
- else
- pop(&p, p.tos, do_free);
+ if (p.tos)
+ pop(&p, p.tos, NULL, do_free);
free(p.asn_stack);
free(p.stack);
return ret;
static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
{
fprintf(trace, "(%d", f->state);
- if (f->indents)
- fprintf(trace, "%c%d", f->starts_indented?':':'.',
- f->indents);
if (states[f->state].starts_line)
fprintf(trace, "s");
if (f->newline_permitted)
- fprintf(trace, "n");
+ fprintf(trace, "n%d", f->since_newline);
fprintf(trace, ") ");
}
const char *non_term[], int knowns)
{
int i;
+ if (!trace)
+ return;
for (i = 0; i < p->tos; i++) {
- int sym = p->stack[i].sym;
- parser_trace_state(trace, &p->stack[i], states);
- if (sym < TK_reserved &&
- reserved_words[sym] != NULL)
- fputs(reserved_words[sym], trace);
- else if (sym < TK_reserved + knowns) {
- struct token *t = p->asn_stack[i];
- text_dump(trace, t->txt, 20);
- } else
- fputs(non_term[sym - TK_reserved - knowns],
- trace);
- fputs(" ", trace);
+ struct frame *f = &p->stack[i];
+ if (i) {
+ int sym = f->sym;
+ if (sym < TK_reserved &&
+ reserved_words[sym] != NULL)
+ fputs(reserved_words[sym], trace);
+ else if (sym < TK_reserved + knowns) {
+ struct token *t = p->asn_stack[i];
+ text_dump(trace, t->txt, 20);
+ } else
+ fputs(non_term[sym - TK_reserved - knowns],
+ trace);
+ if (f->indents)
+ fprintf(trace, ".%d", f->indents);
+ if (f->since_newline == 0)
+ fputs("/", trace);
+ fputs(" ", trace);
+ }
+ parser_trace_state(trace, f, states);
}
- parser_trace_state(trace, &p->next, states);
- fprintf(trace, " [");
+ fprintf(trace, "[");
if (tk->num < TK_reserved &&
reserved_words[tk->num] != NULL)
fputs(reserved_words[tk->num], trace);
else
text_dump(trace, tk->txt, 20);
- fputs("]\n", trace);
+ fputs("]", trace);
+ }
+
+ void parser_trace_action(FILE *trace, char *action)
+ {
+ if (trace)
+ fprintf(trace, " - %s\n", action);
}
# A Worked Example