marking discussed later, and sometimes we won't know what type a symbol
is yet.
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
###### forward declarations
enum symtype { Unknown, Virtual, Terminal, Nonterminal };
char *symtypes = "UVTN";
###### symbol fields
enum symtype type;
+###### grammar fields
+ int terminals_declared;
Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
table of known symbols and the resulting parser will report them as
### Data types and precedence.
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line. If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` the the line declares terminals without
precedence, otherwise it specifies a data type.
The data type name is simply stored and applied to the head of all
struct token t = token_next(ts);
char *err;
enum assoc assoc;
+ int term = 0;
int found;
if (t.num != TK_ident) {
assoc = Right;
else if (text_is(t.txt, "NON"))
assoc = Non;
- else {
+ else if (text_is(t.txt, "TERM")) {
+ term = 1;
+ g->terminals_declared = 1;
+ } else {
g->current_type = t.txt;
g->type_isref = isref;
if (text_is(t.txt, "void"))
goto abort;
}
- // This is a precedence line, need some symbols.
+ // This is a precedence or TERM line, need some symbols.
found = 0;
g->prec_levels += 1;
t = token_next(ts);
err = "$$ must be followed by a word";
goto abort;
}
+ if (term) {
+ err = "Virtual symbols not permitted on $TERM line";
+ goto abort;
+ }
} else if (t.num != TK_ident &&
t.num != TK_mark) {
err = "Illegal token in precedence line";
}
s = sym_find(g, t.txt);
if (s->type != Unknown) {
- err = "Symbols in precedence line must not already be known.";
+ err = "Symbols in precedence/TERM line must not already be known.";
goto abort;
}
s->type = type;
- s->precedence = g->prec_levels;
- s->assoc = assoc;
+ if (!term) {
+ s->precedence = g->prec_levels;
+ s->assoc = assoc;
+ }
found += 1;
t = token_next(ts);
}
if (found == 0)
- err = "No symbols given on precedence line";
+ err = "No symbols given on precedence/TERM line";
goto abort;
return NULL;
abort:
(not a structure) and means that the reference is being moved out, so
it will not automatically be freed.
+Symbols that are left-recursive are a little special. These are symbols
+that both the head of a production and the first body symbol of the same
+production. They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure. In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended. When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended. So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
+
While building productions we will need to add to an array which needs to
grow dynamically.
###### symbol fields
int first_production;
+ int left_recursive;
###### functions
static char *parse_production(struct grammar *g,
tk = token_next(state);
while (tk.num == TK_ident || tk.num == TK_mark) {
struct symbol *bs = sym_find(g, tk.txt);
- if (bs->type == Unknown)
- bs->type = Terminal;
+ if (bs->type == Unknown) {
+ if (!g->terminals_declared)
+ bs->type = Terminal;
+ }
if (bs->type == Virtual) {
err = "Virtual symbol not permitted in production";
goto abort;
vs = sym_find(g, tk.txt);
if (vs->num == TK_newline)
p.line_like = 1;
+ else if (vs->num == TK_out)
+ p.line_like = 2;
else if (vs->precedence == 0) {
err = "symbol after $$ must have precedence";
goto abort;
}
tk = token_next(state);
}
+ if (p.body_size >= 2 &&
+ p.body[0] == p.head &&
+ p.body[1]->num != TK_newline)
+ p.head->left_recursive = 1;
+
if (tk.num != TK_newline && tk.num != TK_eof) {
err = "stray tokens at end of line";
goto abort;
} else if (tk.num == TK_mark
&& text_is(tk.txt, "$*")) {
err = dollar_line(state, g, 1);
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "//")) {
+ while (tk.num != TK_newline &&
+ tk.num != TK_eof)
+ tk = token_next(state);
} else {
err = "Unrecognised token at start of line.";
}
goto abort;
}
token_close(state);
+ if (g->terminals_declared) {
+ struct symbol *s;
+ int errs = 0;
+ for (s = g->syms; s; s = s->next) {
+ if (s->type != Unknown)
+ continue;
+ errs += 1;
+ fprintf(stderr, "Token %.*s not declared\n",
+ s->name.len, s->name.txt);
+ }
+ if (errs) {
+ free(g);
+ g = NULL;
+ }
+ }
return g;
abort:
fprintf(stderr, "Error at line %d: %s\n",
continue;
if (symset_find(&done, s->num) < 0) {
symset_add(&done, s->num, 0);
- if (s->line_like)
- is->starts_line = 1;
}
if (s->type != Nonterminal)
continue;
+ if (s->line_like)
+ is->starts_line = 1;
again = 1;
if (type >= LALR) {
// Need the LA set.
}
sn = save_set(g, LA);
LA = set_find(g, sn);
- symset_add(&LAnl, TK_newline, 0);
+ if (symset_find(&LA, TK_newline))
+ symset_add(&LAnl, TK_newline, 0);
snnl = save_set(g, LAnl);
LAnl = set_find(g, snnl);
}
snum++;
}
g->first_nonterm = snum;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0 && s->type != Virtual) {
+ s->num = snum;
+ snum++;
+ }
for (s = g->syms; s; s = s->next)
if (s->num < 0) {
s->num = snum;
if (g->follow)
report_follow(g);
report_itemsets(g);
- return report_conflicts(g, type);
+ return report_conflicts(g, type) + report_left_recursive(g);
}
Firstly we have the complete list of symbols, together with the
printf(" [%d%s]", s->precedence,
assoc_names[s->assoc]);
}
- if (pr->line_like)
+ if (pr->line_like == 1)
printf(" $$NEWLINE");
+ else if (pr->line_like)
+ printf(" $$OUT");
printf("\n");
}
which maps terminals to items that could be reduced when the terminal
is in look-ahead. We report when we get conflicts between the two.
-As a special case, if we find a SHIFT/REDUCE conflict, where a
-terminal that could be shifted is in the lookahead set of some
-reducable item, then set check if the reducable item also have
-`TK_newline` in its lookahead set. If it does, then a newline will
-force the reduction, but anything else can reasonably be shifted, so
-that isn't really a conflict. Such apparent conflicts do not get
-counted, and are reported as non-critical. This will not affect a
-"traditional" grammar that does not include newlines as token.
+As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
+terminal, we ignore it. NEWLINES are handled specially with its own
+rules for when to shift and when to reduce. Conflicts are expected,
+but handled internally.
static int conflicts_slr(struct grammar *g, enum grammar_type type)
{
for (k = 0; k < la.cnt; k++) {
int pos = symset_find(&shifts, la.syms[k]);
if (pos >= 0 && la.syms[k] != TK_newline) {
- if (symset_find(&la, TK_newline) < 0) {
- printf(" State %d has SHIFT/REDUCE conflict on ", i);
- cnt++;
- } else
- printf(" State %d has non-critical SHIFT/REDUCE conflict on ", i);
- prtxt(g->symtab[la.syms[k]]->name);
+ printf(" State %d has SHIFT/REDUCE conflict on ", i);
+ cnt++;
+ prtxt(g->symtab[la.syms[k]]->name);
printf(":\n");
report_item(g, shifts.data[pos]);
report_item(g, itm);
return cnt;
}
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production. So we need to
+report these when asked.
+
+###### functions
+
+ static int report_left_recursive(struct grammar *g)
+ {
+ int p;
+ int bad_left_recursive = 0;
+
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int sn;
+
+ for (sn = 0; sn < pr->body_size - 1; sn++) {
+ struct symbol *s = pr->body[sn];
+
+ if (s->left_recursive == 1 &&
+ s != pr->head) {
+ if (!bad_left_recursive) {
+ bad_left_recursive = 1;
+ printf("Misplaced left recursive symbols.\n");
+ }
+ printf(" ");
+ prtxt(s->name);
+ printf(" in production [%d]\n", p);
+ s->left_recursive = 2;
+ }
+ }
+ }
+ return bad_left_recursive;
+ }
+
## Generating the parser
The exported part of the parser is the `parse_XX` function, where the name
fprintf(f, "\tstruct token_state *tokens;\n");
fprintf(f, "\tconfig->words_marks = known;\n");
fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
- fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
fprintf(f, "\ttokens = token_open(code, config);\n");
fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
fprintf(f, "\ttoken_close(tokens);\n");
for (i = TK_reserved;
i < g->num_syms;
i++)
- if (g->symtab[i]->type != Terminal)
+ if (g->symtab[i]->type == Nonterminal)
fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
g->symtab[i]->name.txt);
fprintf(f, "};\n\n");
We return whatever `asn` was returned by reducing production zero.
If we can neither shift nor reduce we have an error to handle. We pop
-single entries off the stack until we can shift the `TK_error` symbol, then
-drop input tokens until we find one we can shift into the new error state.
+single entries off the stack until we can shift the `TK_error` symbol,
+then drop input tokens until we find one we can shift into the new error
+state. We need to ensure that something is dropped or shifted after an
+error, or we could get into an infinite loop, only shifting in
+`TK_error`, then reducing. So we track if there has been a shift since
+the last error, and if not the next error always discards one token.
When we find `TK_in` and `TK_out` tokens which report indents we need
to handle them directly as the grammar cannot express what we want to
Here the NEWLINE will be shifted because nothing can be reduced until
the `if` is seen.
-When, during error handling, we discard token read in, we want to keep
+When during error handling we discard tokens read in, we want to keep
discarding until we see one that is recognised. If we had a full set
-of LR(1) grammar states, this will mean looking in the look-ahead set,
+of LR(1) grammar states, this would mean looking in the look-ahead set,
but we don't keep a full look-ahead set. We only record the subset
-that leads to SHIFT. We can, however, deduce the look-ahead set but
+that leads to SHIFT. We can, however, deduce the look-ahead set by
looking at the SHIFT subsets for all states that we can get to by
reducing zero or more times. So we need a little function which
checks if a given token is in any of these look-ahead sets.
struct parser p = { 0 };
struct token *tk = NULL;
int accepted = 0;
+ int shift_since_err = 1;
void *ret = NULL;
shift(&p, TK_eof, 0, 1, NULL, states);
goto force_reduce;
}
if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
+ shift_since_err = 1;
tk = NULL;
parser_trace_action(trace, "Shift");
continue;
force_reduce:
if (states[tos->state].reduce_prod >= 0 &&
states[tos->state].newline_only &&
- tk->num != TK_newline && tk->num != TK_eof && tk->num != TK_out) {
- /* Anything other than newline in an error as this
- * production must end at EOL
+ !(tk->num == TK_newline ||
+ tk->num == TK_eof ||
+ tk->num == TK_out ||
+ (tos->indents == 0 && tos->since_newline == 0))) {
+ /* Anything other than newline or out or eof
+ * in an error unless we are already at start
+ * of line, as this production must end at EOL.
*/
} else if (states[tos->state].reduce_prod >= 0) {
void **body;
// no state accepted TK_error
break;
}
+ if (!shift_since_err) {
+ /* must discard at least one token to avoid
+ * infinite loop.
+ */
+ if (tk->num == TK_eof)
+ break;
+ free(tk);
+ tk = tok_copy(token_next(tokens));
+ }
+ shift_since_err = 0;
tos = &p.stack[p.tos-1];
while (!in_lookahead(tk, states, tos->state) &&
tk->num != TK_eof) {
free(tk);
tk = tok_copy(token_next(tokens));
+ shift_since_err = 1;
if (tk->num == TK_in)
indents += 1;
if (tk->num == TK_out) {
fputs(reserved_words[tk->num], trace);
else
text_dump(trace, tk->txt, 20);
- fputs("]", trace);
+ fprintf(trace, ":%d:%d]", tk->line, tk->col);
}
void parser_trace_action(FILE *trace, char *action)
# calc: header
- #include "number.h"
+ #include "parse_number.h"
// what do we use for a demo-grammar? A calculator of course.
struct number {
mpq_t val;
#include <string.h>
#include "mdcode.h"
#include "scanner.h"
- #include "number.h"
#include "parser.h"
#include "calc.h"
struct section *s;
struct token_config config = {
.ignored = (1 << TK_line_comment)
- | (1 << TK_block_comment)
| (1 << TK_in)
| (1 << TK_out),
.number_chars = ".,_+-",
# calc: grammar
$LEFT + -
- $LEFT * /
+ $LEFT * / //
Session -> Session Line
| Line
| Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
| Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+ | Expression // Expression ${ {
+ mpz_t z0, z1, z2;
+ mpq_init($0.val);
+ mpz_init(z0); mpz_init(z1); mpz_init(z2);
+ mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+ mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+ mpz_tdiv_q(z0, z1, z2);
+ mpq_set_z($0.val, z0);
+ mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+ } }$
| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
10 * 9 / 2
1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+ 355//113
+
error