write out C code files which can be compiled to make a parser.
"2D support" means that indentation and line breaks can be significant.
-Indent tokens (IN and OUT) and end-of-line (EOL) tokens can be used to
-describe the grammar and the parser can selectively ignore these where
-they aren't relevant.
+Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
+be used to describe the grammar and the parser can selectively ignore
+these where they aren't relevant.
There are several distinct sections.
Productions are comprised primarily of symbols - terminal and
non-terminal. We do not make a syntactic distinction between the two,
-though non-terminals must be identifiers. Non-terminal symbols are
-those which appear in the head of a production, terminal symbols are
-those which don't. There are also "virtual" symbols used for precedence
-marking discussed later, and sometimes we won't know what type a symbol
-is yet.
+though non-terminals must be identifiers while terminals can also be
+marks. Non-terminal symbols are those which appear in the head of a
+production, terminal symbols are those which don't. There are also
+"virtual" symbols used for precedence marking discussed later, and
+sometimes we won't know what type a symbol is yet.
To help with code safety it is possible to declare the terminal symbols.
If this is done, then any symbol used in a production that does not
production can declare that it inherits the precedence of a given
virtual symbol.
-This use for `$$` precludes it from being used as a symbol in the
+This use of `$$` precludes it from being used as a symbol in the
described language. Two other symbols: `${` and `}$` are also
unavailable.
found += 1;
t = token_next(ts);
}
- if (found == 0)
+ if (found == 0) {
err = "No symbols given on precedence/TERM line";
goto abort;
+ }
return NULL;
abort:
while (t.num != TK_newline && t.num != TK_eof)
tk = token_next(state);
while (tk.num == TK_ident || tk.num == TK_mark) {
struct symbol *bs = sym_find(g, tk.txt);
- if (bs->type == Unknown) {
- if (!g->terminals_declared)
- bs->type = Terminal;
- }
if (bs->type == Virtual) {
err = "Virtual symbol not permitted in production";
goto abort;
goto abort;
}
token_close(state);
- if (g->terminals_declared) {
- struct symbol *s;
- int errs = 0;
- for (s = g->syms; s; s = s->next) {
- if (s->type != Unknown)
- continue;
- errs += 1;
- fprintf(stderr, "Token %.*s not declared\n",
- s->name.len, s->name.txt);
- }
- if (errs) {
- free(g); // FIXME free content
- g = NULL;
+
+ struct symbol *s;
+ for (s = g->syms; s; s = s->next) {
+ if (s->type != Unknown)
+ continue;
+ if (!g->terminals_declared) {
+ s->type = Terminal;
+ continue;
}
+ err = "not declared";
+ fprintf(stderr, "Token %.*s not declared\n",
+ s->name.len, s->name.txt);
+ }
+ if (err) {
+ free(g); // FIXME free content
+ g = NULL;
}
return g;
abort:
list of unique sets, each assigned a number.
Once we have the data structures in hand to manage these sets and lists,
-we can start setting the 'nullable' flag, build the 'FIRST' sets, and
-then create the item sets which define the various states.
+we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
+sets, and then create the item sets which define the various states.
### Symbol sets.
known words added and then is used with the `code_node` to initialize the
scanner.
-`parse_XX` then calls the library function `parser_run` to actually complete
-the parse. This needs the `states` table and functions to call the various
-pieces of code provided in the grammar file, so they are generated first.
+`parse_XX` then calls the library function `parser_run` to actually
+complete the parse. This needs the `states` table, the `reductions`
+table and functions to call the various pieces of code provided in the
+grammar file, so they are generated first.
###### parser_generate
gen_non_term(f, g);
gen_goto(f, g);
gen_states(f, g);
+ gen_reductions(f, g);
gen_reduce(f, g, file, pre_reduce);
gen_free(f, g);
fprintf(f, "\tconfig->words_marks = known;\n");
fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
fprintf(f, "\ttokens = token_open(code, config);\n");
- fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
+ fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n");
fprintf(f, "\ttoken_close(tokens);\n");
fprintf(f, "\treturn rv;\n");
fprintf(f, "}\n\n");
fprintf(f, "};\n\n");
}
-### States and the goto tables.
+### States, reductions, and the go to tables.
-For each state we record the goto table and details of the reducible
-production if there is one.
-Some of the details of the reducible production are stored in the
-`do_reduce` function to come later. Here we store the production
-number, the body size (useful for stack management), and the resulting
-symbol (useful for knowing how to free data later).
+For each state we record the go to table and the reducible production if
+there is one, the details of which are in a separate table of
+reductions. Some of the details of the reducible production are stored
+in the `do_reduce` function to come later. In the go to table we store
+the production number and in the reductions table: the body size (useful
+for stack management), the resulting symbol (useful for knowing how to
+free data later), and the size of the resulting asn object (useful for
+preallocation space.
The go to table is stored in a simple array of `sym` and corresponding
`state`.
short sym;
short state;
};
+ struct reduction {
+ short size;
+ short sym;
+ short result_size;
+ };
struct state {
+ short reduce_prod;
short go_to_cnt;
const struct lookup * go_to;
- short reduce_prod;
- short reduce_size;
- short reduce_sym;
- short result_size;
};
###### functions
}
}
+ static void gen_reductions(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_reductions\"\n");
+ fprintf(f, "static const struct reduction reductions[] = {\n");
+ for (i = 0; i < g->production_count; i++) {
+ struct production *pr = g->productions[i];
+ struct symbol *hd = pr->head;
+ fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num);
+ if (hd->struct_name.txt == NULL)
+ fprintf(f, "0 },\n");
+ else
+ fprintf(f, "sizeof(struct %.*s%s) },\n",
+ hd->struct_name.len,
+ hd->struct_name.txt,
+ hd->isref ? "*" : "");
+ }
+ fprintf(f, "};\n\n");
+ }
+
static void gen_states(FILE *f, struct grammar *g)
{
int i;
}
}
if (is->go_to.cnt)
- fprintf(f, "\t[%d] = { %d, goto_%d, ",
- i, is->go_to.cnt, i);
+ fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
+ i, prod, is->go_to.cnt, i);
else
- fprintf(f, "\t[%d] = { 0, NULL, ", i);
- if (prod >= 0) {
- struct production *pr = g->productions[prod];
- struct symbol *hd = pr->head;
- fprintf(f, "%d, %d, %d, ",
- prod, pr->body_size, pr->head->num);
- if (hd->struct_name.txt == NULL)
- fprintf(f, "0 },\n");
- else
- fprintf(f, "sizeof(struct %.*s%s) },\n",
- hd->struct_name.len,
- hd->struct_name.txt,
- hd->isref ? "*" : "");
- } else
- fprintf(f, "-1, -1, -1, -1 },\n");
+ fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
}
fprintf(f, "};\n\n");
}
`do_reduce` which runs the code that was included with the production,
if any.
-This code needs to be able to store data somewhere. Rather than
-requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
-buffer and have `do_reduce` return the size to be saved.
+This code needs to be able to store data somewhere so we record the size
+of the data expected with each state so it can be allocated before
+`do_reduce` is called.
In order for the code to access "global" context, we pass in the
"config" pointer that was passed to the parser function. If the `struct
Having analysed the grammar and generated all the tables, we only need
the shift/reduce engine to bring it all together.
-### Goto table lookup
+### Go to table lookup
-The parser generator has nicely provided us with goto tables sorted by
+The parser generator has nicely provided us with go to tables sorted by
symbol number. We need a binary search function to find a symbol in the
table.
So we dynamically grow an array as required but never bother to
shrink it down again.
-We keep the stack as two separate allocations. One, `asn_stack` stores
+We keep the stack as two separate allocations. One, `asn_stack`, stores
the "abstract syntax nodes" which are created by each reduction. When
we call `do_reduce` we need to pass an array of the `asn`s of the body
of the production, and by keeping a separate `asn` stack, we can just
pass a pointer into this stack.
The other allocation stores all other stack fields of which there are
-several. The `state` is the most important one and guides the parsing
+two. The `state` is the most important one and guides the parsing
process. The `sym` is nearly unnecessary as it is implicit in the
state. However when we want to free entries from the `asn_stack`, it
helps to know what type they are so we can call the right freeing
before the beginning. As we need to store some value, `TK_eof` is used
to mark the beginning of the file as well as the end.
-Indents (IN) are sometimes shifted and sometimes only accounted.
-Whatever decision is made must apply equally to the matching OUT. To
-ensure this we keep a stack of bits in `ignored_indents` and
-`indent_depth`. When we process an IN, we record if it was ignored.
-When we see an out, we pop of the relavant bit and use it to decide how
-to handle the OUT.
-
###### parser functions
struct parser {
function for both. In both cases we provide the symbol. The state is
deduced from the current top-of-stack state and the new symbol.
-To simplify other code we arrange for `shift` to fail if there is no `goto`
+To simplify other code we arrange for `shift` to fail if there is no `go to`
state for the symbol. This is useful in basic parsing due to our design
that we shift when we can, and reduce when we cannot. So the `shift`
function reports if it could.
So `shift` finds the next state. If that succeeds it extends the
allocations if needed and pushes all the information onto the stacks.
+An extra complication is added to `shift` by the `EOL` token. This
+token must be generated when a `NEWLINE` is seen, but an `EOL` is
+expected. When this happens, the `NEWLINE` is NOT consumed, so multiple
+EOL can appear before a NEWLINE. To indicate that the token was shifted
+by not consumed, `shift` can return the special value `2`. The token
+number for `EOL` cannot be statically declared, so when the parser
+starts we need to look through the array of non-terminals to find the
+EOL.
+
+###### parser state
+ int tk_eol;
+
+###### find eol
+ p.tk_eol = 0;
+ while (strcmp(non_term[p.tk_eol], "EOL") != 0)
+ p.tk_eol += 1;
+ p.tk_eol += TK_reserved + config->known_count;
+
+
###### parser functions
static int shift(struct parser *p,
const struct state states[])
{
struct frame next = {0};
+ int ret;
int newstate = p->tos
? search(&states[p->stack[p->tos-1].state],
sym)
: 0;
- if (newstate < 0)
+ if (newstate >= 0)
+ ret = 1;
+ else if (sym != TK_newline)
return 0;
+ else {
+ // have a NEWLINE, might need an EOL
+ sym = p->tk_eol;
+ newstate = p->tos
+ ? search(&states[p->stack[p->tos-1].state],
+ sym)
+ : 0;
+ if (newstate < 0)
+ return 0;
+ ret = 2;
+ asn = tok_copy(*(struct token*)asn);
+ }
+
if (p->tos >= p->stack_size) {
p->stack_size += 10;
p->stack = realloc(p->stack, p->stack_size
p->stack[p->tos] = next;
p->asn_stack[p->tos] = asn;
p->tos++;
- return 1;
+ return ret;
}
`pop` primarily moves the top of stack (`tos`) back down the required
### The heart of the parser.
Now we have the parser. For each token we might shift it, trigger a
-reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE) might
-also be ignored. Ignoring tokens is combined with shifting.
+reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL)
+might also be ignored. Ignoring tokens is combined with shifting.
###### parser vars
}
Indents are ignored unless they can be shifted onto the stack
-immediately. The end of an indented section - the OUT token - is
-ignored precisely when the indent was ignored. To keep track of this we
-need a small stack of flags, which is easily stored as bits in an
-`unsigned long`. This will never overflow and the scanner only allows
-20 levels of indentation.
+immediately or nothing can be shifted (in which case we reduce, and try
+again). The end of an indented section - the OUT token - is ignored
+precisely when the indent was ignored. To keep track of this we need a
+small stack of flags, which is easily stored as bits in an `unsigned
+long`. This will never overflow and the scanner only allows 20 levels
+of indentation.
###### parser state
unsigned long ignored_indents;
- int indent_depth;
-NEWLINE is ignored when in an indented section of text which was not
+NEWLINE/EOL is ignored when in an indented section of text which was not
explicitly expected by the grammar. So if the most recent indent is
ignored, so is any EOL token.
###### try shift or ignore
if ((tk->num == TK_newline || tk->num == TK_out) &&
- (p.ignored_indents & (1 << p.indent_depth))) {
+ (p.ignored_indents & 1)) {
/* indented, so ignore OUT and NEWLINE */
if (tk->num == TK_out)
- p.indent_depth -= 1;
+ p.ignored_indents >>= 1;
free(tk);
tk = NULL;
parser_trace_action(trace, "Ignore");
continue;
}
- if (tk->num == TK_newline) {
- if (1) {
- free(tk);
- tk = NULL;
- parser_trace_action(trace, "Discard");
- continue;
- }
- }
- if (shift(&p, tk->num, tk, states)) {
+ switch (shift(&p, tk->num, tk, states)) {
+ case 1:
if (tk->num == TK_out)
- p.indent_depth -= 1;
- if (tk->num == TK_in) {
- p.indent_depth += 1;
- p.ignored_indents &= ~(1 << p.indent_depth);
- }
+ p.ignored_indents >>= 1;
+ if (tk->num == TK_in)
+ p.ignored_indents <<= 1;
+
tk = NULL;
- parser_trace_action(trace, "Shift");
+ /* fallthrough */
+ case 2:
+ parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
## did shift
continue;
}
- if (tk->num == TK_in) {
- /* No indent expected here, so ignore IN */
+ if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
+ /* No indent expected here and reduce is not mandatory, so ignore IN */
free(tk);
tk = NULL;
- p.indent_depth += 1;
- p.ignored_indents |= (1 << p.indent_depth);
+ p.ignored_indents <<= 1;
+ p.ignored_indents |= 1;
parser_trace_action(trace, "Ignore");
continue;
}
void *res;
const struct state *nextstate = &states[tos->state];
int prod = nextstate->reduce_prod;
- int size = nextstate->reduce_size;
- int res_size = nextstate->result_size;
+ int size = reductions[prod].size;
+ int res_size = reductions[prod].result_size;
body = p.asn_stack + (p.tos - size);
res = res_size ? calloc(1, res_size) : NULL;
res_size = do_reduce(prod, body, config, res);
- if (res_size != nextstate->result_size)
+ if (res_size != reductions[prod].result_size)
abort();
pop(&p, size, do_free);
- if (!shift(&p, nextstate->reduce_sym, res, states)) {
+ if (!shift(&p, reductions[prod].sym, res, states)) {
accepted = 1;
ret = res;
parser_trace_action(trace, "Accept");
void *parser_run(struct token_state *tokens,
const struct state states[],
+ const struct reduction reductions[],
int (*do_reduce)(int, void**, struct token_config*, void*),
void (*do_free)(short, void*),
FILE *trace, const char *non_term[],
{
## parser vars
+ ## find eol
+
## heart of parser
free(tk);
###### exported functions
void *parser_run(struct token_state *tokens,
const struct state states[],
+ const struct reduction reductions[],
int (*do_reduce)(int, void**, struct token_config*, void*),
void (*do_free)(short, void*),
FILE *trace, const char *non_term[],