`parsergen` program built from the C code in this file can extract
that grammar directly from this file and process it.
-
###### File: parsergen.c
#include <unistd.h>
#include <stdlib.h>
start with the tag is ignored, and the tag is striped from the rest. So
`--tag Foo`
means that the three needed sections must be `Foo: header`, `Foo: code`,
-and `Foo: grammar`.
+and `Foo: grammar`. The tag `calc` is used to extract the same calculator
+from this file.
[mdcode]: mdcode.html
[scanner]: scanner.html
struct production {
unsigned short precedence;
enum assoc assoc;
+ char line_like;
## production fields
};
struct grammar {
marking discussed later, and sometimes we won't know what type a symbol
is yet.
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
###### forward declarations
enum symtype { Unknown, Virtual, Terminal, Nonterminal };
char *symtypes = "UVTN";
###### symbol fields
enum symtype type;
+###### grammar fields
+ int terminals_declared;
Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
table of known symbols and the resulting parser will report them as
### Data types and precedence.
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line. If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` the the line declares terminals without
precedence, otherwise it specifies a data type.
The data type name is simply stored and applied to the head of all
listed and may be inherited by any production which uses the symbol. A
production inherits from the last symbol which has a precedence.
+The symbols on the first precedence line have the lowest precedence.
+Subsequent lines introduce symbols with higher precedence.
+
###### grammar fields
struct text current_type;
int type_isref;
struct token t = token_next(ts);
char *err;
enum assoc assoc;
+ int term = 0;
int found;
if (t.num != TK_ident) {
assoc = Right;
else if (text_is(t.txt, "NON"))
assoc = Non;
- else {
+ else if (text_is(t.txt, "TERM")) {
+ term = 1;
+ g->terminals_declared = 1;
+ } else {
g->current_type = t.txt;
g->type_isref = isref;
if (text_is(t.txt, "void"))
goto abort;
}
- // This is a precedence line, need some symbols.
+ // This is a precedence or TERM line, need some symbols.
found = 0;
g->prec_levels += 1;
t = token_next(ts);
err = "$$ must be followed by a word";
goto abort;
}
+ if (term) {
+ err = "Virtual symbols not permitted on $TERM line";
+ goto abort;
+ }
} else if (t.num != TK_ident &&
t.num != TK_mark) {
err = "Illegal token in precedence line";
}
s = sym_find(g, t.txt);
if (s->type != Unknown) {
- err = "Symbols in precedence line must not already be known.";
+ err = "Symbols in precedence/TERM line must not already be known.";
goto abort;
}
s->type = type;
- s->precedence = g->prec_levels;
- s->assoc = assoc;
+ if (!term) {
+ s->precedence = g->prec_levels;
+ s->assoc = assoc;
+ }
found += 1;
+ t = token_next(ts);
}
if (found == 0)
- err = "No symbols given on precedence line";
+ err = "No symbols given on precedence/TERM line";
goto abort;
return NULL;
abort:
(not a structure) and means that the reference is being moved out, so
it will not automatically be freed.
+Symbols that are left-recursive are a little special. These are symbols
+that both the head of a production and the first body symbol of the same
+production. They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure. In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended. When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended. So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
+
While building productions we will need to add to an array which needs to
grow dynamically.
return code;
}
-Now we have all the bit we need to parse a full production.
+Now we have all the bits we need to parse a full production.
###### includes
#include <memory.h>
struct symbol **body;
int body_size;
struct text code;
+ int code_line;
###### symbol fields
int first_production;
+ int left_recursive;
###### functions
static char *parse_production(struct grammar *g,
tk = token_next(state);
while (tk.num == TK_ident || tk.num == TK_mark) {
struct symbol *bs = sym_find(g, tk.txt);
- if (bs->type == Unknown)
- bs->type = Terminal;
+ if (bs->type == Unknown) {
+ if (!g->terminals_declared)
+ bs->type = Terminal;
+ }
if (bs->type == Virtual) {
err = "Virtual symbol not permitted in production";
goto abort;
goto abort;
}
vs = sym_find(g, tk.txt);
- if (vs->type != Virtual) {
- err = "symbol after $$ must be virtual";
+ if (vs->num == TK_newline)
+ p.line_like = 1;
+ else if (vs->num == TK_out)
+ p.line_like = 2;
+ else if (vs->precedence == 0) {
+ err = "symbol after $$ must have precedence";
goto abort;
+ } else {
+ p.precedence = vs->precedence;
+ p.assoc = vs->assoc;
}
- p.precedence = vs->precedence;
- p.assoc = vs->assoc;
tk = token_next(state);
}
if (tk.num == TK_open) {
+ p.code_line = tk.line;
p.code = collect_code(state, tk);
if (p.code.txt == NULL) {
err = "code fragment not closed properly";
}
tk = token_next(state);
}
+ if (p.body_size >= 2 &&
+ p.body[0] == p.head &&
+ p.body[1]->num != TK_newline)
+ p.head->left_recursive = 1;
+
if (tk.num != TK_newline && tk.num != TK_eof) {
err = "stray tokens at end of line";
goto abort;
p->head->first_production = g->production_count;
array_add(&g->productions, &g->production_count, p);
-Now we are ready to read in the grammar.
+Now we are ready to read in the grammar. We ignore comments
+and strings so that the marks which introduce them can be
+used as terminals in the grammar. We don't ignore numbers
+even though we don't allow them as that causes the scanner
+to produce errors that the parser is better positioned to handle.
###### grammar_read
static struct grammar *grammar_read(struct code_node *code)
} else if (tk.num == TK_mark
&& text_is(tk.txt, "$*")) {
err = dollar_line(state, g, 1);
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "//")) {
+ while (tk.num != TK_newline &&
+ tk.num != TK_eof)
+ tk = token_next(state);
} else {
err = "Unrecognised token at start of line.";
}
goto abort;
}
token_close(state);
+ if (g->terminals_declared) {
+ struct symbol *s;
+ int errs = 0;
+ for (s = g->syms; s; s = s->next) {
+ if (s->type != Unknown)
+ continue;
+ errs += 1;
+ fprintf(stderr, "Token %.*s not declared\n",
+ s->name.len, s->name.txt);
+ }
+ if (errs) {
+ free(g);
+ g = NULL;
+ }
+ }
return g;
abort:
fprintf(stderr, "Error at line %d: %s\n",
}
We will often want to form the union of two symsets. When we do, we
-will often want to know if anything changed (as they might mean there
+will often want to know if anything changed (as that might mean there
is more work to do). So `symset_union` reports whether anything was
added to the first set. We use a slow quadratic approach as these
-sets don't really get very big. If profiles shows this to be a problem is
+sets don't really get very big. If profiles shows this to be a problem it
can be optimised later.
static int symset_union(struct symset *a, struct symset *b)
return sl->ss;
}
-
### Setting `nullable`
We set `nullable` on the head symbol for any production for which all
}
}
-### Setting `can_eol` and `line_like`
+### Setting `line_like`
In order to be able to ignore newline tokens when not relevant, but
still include them in the parse when needed, we will need to know
newlines when there is an indent since the most recent start of a
line-like symbol.
-To know which symbols are line-like, we first need to know which
-symbols start with a NEWLINE token. Any symbol which is followed by a
-NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
-Certainly when trying to parse one of these we must take not of NEWLINEs.
+A "line_like" symbol is simply any symbol that can derive a NEWLINE.
+If a symbol cannot derive a NEWLINE, then it is only part of a line -
+so is word-like. If it can derive a NEWLINE, then we consider it to
+be like a line.
-Clearly the `TK_newline` token can start with a NEWLINE. Any symbol
-which is the head of a production that contains a starts-with-NEWLINE
-symbol preceeded only by nullable symbols is also a
-starts-with-NEWLINE symbol. We use a new field `can_eol` to record
-this attribute of symbols, and compute it in a repetitive manner
-similar to `set_nullable`.
-
-Once we have that, we can determine which symbols are `line_like` be
-seeing which are followed by a `can_eol` symbol in any production.
+Clearly the `TK_newline` token can derive a NEWLINE. Any symbol which
+is the head of a production that contains a line_like symbol is also a
+line-like symbol. We use a new field `line_like` to record this
+attribute of symbols, and compute it in a repetitive manner similar to
+`set_nullable`.
###### symbol fields
- int can_eol;
int line_like;
###### functions
- static void set_can_eol(struct grammar *g)
+ static void set_line_like(struct grammar *g)
{
int check_again = 1;
- g->symtab[TK_newline]->can_eol = 1;
+ g->symtab[TK_newline]->line_like = 1;
while (check_again) {
int p;
check_again = 0;
struct production *pr = g->productions[p];
int s;
- if (pr->head->can_eol)
+ if (pr->head->line_like)
continue;
for (s = 0 ; s < pr->body_size; s++) {
- if (pr->body[s]->can_eol) {
- pr->head->can_eol = 1;
+ if (pr->body[s]->line_like) {
+ pr->head->line_like = 1;
check_again = 1;
break;
}
- if (!pr->body[s]->nullable)
- break;
}
}
}
}
- static void set_line_like(struct grammar *g)
- {
- int p;
- for (p = 0; p < g->production_count; p++) {
- struct production *pr = g->productions[p];
- int s;
-
- for (s = 1; s < pr->body_size; s++)
- if (pr->body[s]->can_eol)
- pr->body[s-1]->line_like = 1;
- }
- }
-
### Building the `first` sets
When calculating what can follow a particular non-terminal, we will need to
`set_nullable`. This makes use of the fact that `symset_union`
reports if any change happens.
-The core of this which finds the "first" of part of a production body
+The core of this, which finds the "first" of part of a production body,
will be reused for computing the "follow" sets, so we split it out
into a separate function.
a.data[i] - b.data[i];
}
+It will be helpful to know if an itemset has been "completed" or not,
+particularly for LALR where itemsets get merged, at which point they
+need to be consider for completion again. So a `completed` flag is needed.
+
+For correct handling of `TK_newline` when parsing, we will need to
+know which states (itemsets) can occur at the start of a line, so we
+will record a `starts_line` flag too whenever DOT is at the start of a
+`line_like` symbol.
+
+Finally, for handling `TK_out` we need to know whether productions in the
+current state started *before* the most recent indent. A state
+doesn't usually keep details of individual productions, so we need to
+add one extra detail. `min_prefix` is the smallest non-zero number of
+symbols *before* DOT in any production in an itemset. This will allow
+us to determine if the the most recent indent is sufficiently recent
+to cancel it against a `TK_out`. If it was seen longer ago than the
+`min_prefix`, and if the current state cannot be reduced, then the
+indented section must have ended in the middle of a syntactic unit, so
+an error must be signaled.
+
And now we can build the list of itemsets. The lookup routine returns
both a success flag and a pointer to where in the list an insert
should happen, so we don't need to search a second time.
short state;
struct symset items;
struct symset go_to;
+ enum assoc assoc;
+ unsigned short precedence;
char completed;
char starts_line;
int min_prefix;
them to a data structure, of freeing them.
static int add_itemset(struct grammar *g, struct symset ss,
+ enum assoc assoc, unsigned short precedence,
enum grammar_type type)
{
struct itemset **where, *is;
is->state = g->states;
g->states += 1;
is->items = ss;
+ is->assoc = assoc;
+ is->precedence = precedence;
is->next = *where;
is->go_to = INIT_DATASET;
*where = is;
We also collect a set of all symbols which follow "DOT" (in `done`) as this
is used in the next stage.
-If any of these symbols are flagged as starting a line, then this
+If any of these symbols are flagged as `line_like`, then this
state must be a `starts_line` state so now is a good time to record that.
-NOTE: precedence handling should happen here - I haven't written this yet
-though.
+When itemsets are created we assign a precedence to the itemset from
+the complete item, if there is one. We ignore the possibility of
+there being two and don't (currently) handle precedence in such
+grammars. When completing a grammar we ignore any item where DOT is
+followed by a terminal with a precedence lower than that for the
+itemset. Unless the terminal has right associativity, we also ignore
+items where the terminal has the same precedence. The result is that
+unwanted items are still in the itemset, but the terminal doesn't get
+into the go to set, so the item is ineffective.
###### complete itemset
for (i = 0; i < is->items.cnt; i++) {
struct symbol *s;
struct symset LA = INIT_SYMSET;
unsigned short sn = 0;
+ struct symset LAnl = INIT_SYMSET;
+ unsigned short snnl = 0;
if (is->min_prefix == 0 ||
(bs > 0 && bs < is->min_prefix))
if (bs == pr->body_size)
continue;
s = pr->body[bs];
+ if (s->precedence && is->precedence &&
+ is->precedence > s->precedence)
+ /* This terminal has a low precedence and
+ * shouldn't be shifted
+ */
+ continue;
+ if (s->precedence && is->precedence &&
+ is->precedence == s->precedence && s->assoc != Right)
+ /* This terminal has a matching precedence and is
+ * not Right-associative, so we mustn't shift it.
+ */
+ continue;
if (symset_find(&done, s->num) < 0) {
symset_add(&done, s->num, 0);
- if (s->line_like)
- is->starts_line = 1;
}
if (s->type != Nonterminal)
continue;
+ if (s->line_like)
+ is->starts_line = 1;
again = 1;
if (type >= LALR) {
// Need the LA set.
}
sn = save_set(g, LA);
LA = set_find(g, sn);
+ if (symset_find(&LA, TK_newline))
+ symset_add(&LAnl, TK_newline, 0);
+ snnl = save_set(g, LAnl);
+ LAnl = set_find(g, snnl);
}
/* Add productions for this symbol */
int itm = item_num(p2, 0);
int pos = symset_find(&is->items, itm);
if (pos < 0) {
- symset_add(&is->items, itm, sn);
+ if (g->productions[p2]->line_like)
+ symset_add(&is->items, itm, snnl);
+ else
+ symset_add(&is->items, itm, sn);
/* Will have re-ordered, so start
* from beginning again */
i = -1;
} else if (type >= LALR) {
struct symset ss = set_find(g, is->items.data[pos]);
struct symset tmp = INIT_SYMSET;
+ struct symset *la = &LA;
+ if (g->productions[p2]->line_like)
+ la = &LAnl;
symset_union(&tmp, &ss);
- if (symset_union(&tmp, &LA)) {
+ if (symset_union(&tmp, la)) {
is->items.data[pos] = save_set(g, tmp);
i = -1;
- }else
+ } else
symset_free(tmp);
}
}
int j;
unsigned short state;
struct symbol *sym = g->symtab[done.syms[i]];
+ enum assoc assoc = Non;
+ unsigned short precedence = 0;
struct symset newitemset = INIT_SYMSET;
if (type >= LALR)
newitemset = INIT_DATASET;
if (type >= LALR)
la = is->items.data[j];
pos = symset_find(&newitemset, pr->head->num);
+ if (bp + 1 == pr->body_size &&
+ pr->precedence > 0 &&
+ pr->precedence > precedence) {
+ // new itemset is reducible and has a precedence.
+ precedence = pr->precedence;
+ assoc = pr->assoc;
+ }
if (pos < 0)
symset_add(&newitemset, item_num(p, bp+1), la);
else if (type >= LALR) {
}
}
}
- state = add_itemset(g, newitemset, type);
+ state = add_itemset(g, newitemset, assoc, precedence, type);
if (symset_find(&is->go_to, done.syms[i]) < 0)
symset_add(&is->go_to, done.syms[i], state);
}
-All that is left is to crate the initial itemset from production zero, and
+All that is left is to create the initial itemset from production zero, and
with `TK_eof` as the LA set.
###### functions
}
// production 0, offset 0 (with no data)
symset_add(&first, item_num(0, 0), la);
- add_itemset(g, first, type);
+ add_itemset(g, first, Non, 0, type);
for (again = 0, is = g->items;
is;
is = is->next ?: again ? (again = 0, g->items) : NULL) {
snum++;
}
g->first_nonterm = snum;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0 && s->type != Virtual) {
+ s->num = snum;
+ snum++;
+ }
for (s = g->syms; s; s = s->next)
if (s->num < 0) {
s->num = snum;
g->symtab[s->num] = s;
set_nullable(g);
- set_can_eol(g);
set_line_like(g);
if (type >= SLR)
build_first(g);
if (g->follow)
report_follow(g);
report_itemsets(g);
- return report_conflicts(g, type);
+ return report_conflicts(g, type) + report_left_recursive(g);
}
Firstly we have the complete list of symbols, together with the
if (!s)
continue;
- printf(" %c%c%c%3d%c: ",
+ printf(" %c%c%3d%c: ",
s->nullable ? '.':' ',
- s->can_eol ? '>':' ',
s->line_like ? '<':' ',
s->num, symtypes[s->type]);
prtxt(s->name);
if (dot == pr->body_size)
printf(" .");
printf(" [%d]", p);
- if (pr->precedence)
+ if (pr->precedence && dot == pr->body_size)
printf(" (%d%s)", pr->precedence,
assoc_names[pr->assoc]);
+ if (dot < pr->body_size &&
+ pr->body[dot]->precedence) {
+ struct symbol *s = pr->body[dot];
+ printf(" [%d%s]", s->precedence,
+ assoc_names[s->assoc]);
+ }
+ if (pr->line_like == 1)
+ printf(" $$NEWLINE");
+ else if (pr->line_like)
+ printf(" $$OUT");
printf("\n");
}
Then the go to sets:
-
static void report_goto(struct grammar *g, struct symset gt)
{
int i;
for (s = 0; s < g->states; s++) {
int j;
struct itemset *is = g->statetab[s];
- printf(" Itemset %d:%s min prefix=%d\n",
+ printf(" Itemset %d:%s min prefix=%d",
s, is->starts_line?" (startsline)":"", is->min_prefix);
+ if (is->precedence)
+ printf(" %d%s", is->precedence, assoc_names[is->assoc]);
+ printf("\n");
for (j = 0; j < is->items.cnt; j++) {
report_item(g, is->items.syms[j]);
if (is->items.data != NO_DATA)
LR0 conflicts are any state which have both a reducible item and
a shiftable item, or two reducible items.
-LR05 conflicts only occurs if two possibly reductions exist,
+LR05 conflicts only occur if two possible reductions exist,
as shifts always over-ride reductions.
###### conflict functions
which maps terminals to items that could be reduced when the terminal
is in look-ahead. We report when we get conflicts between the two.
+As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
+terminal, we ignore it. NEWLINES are handled specially with its own
+rules for when to shift and when to reduce. Conflicts are expected,
+but handled internally.
+
static int conflicts_slr(struct grammar *g, enum grammar_type type)
{
int i;
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
+ struct symbol *s;
- if (bp < pr->body_size &&
- pr->body[bp]->type == Terminal) {
- /* shiftable */
- int sym = pr->body[bp]->num;
- if (symset_find(&shifts, sym) < 0)
- symset_add(&shifts, sym, itm);
- }
+ if (bp >= pr->body_size ||
+ pr->body[bp]->type != Terminal)
+ /* not shiftable */
+ continue;
+
+ s = pr->body[bp];
+ if (s->precedence && is->precedence)
+ /* Precedence resolves this, so no conflict */
+ continue;
+
+ if (symset_find(&shifts, s->num) < 0)
+ symset_add(&shifts, s->num, itm);
}
- /* Now look for reduction and conflicts */
+ /* Now look for reductions and conflicts */
for (j = 0; j < is->items.cnt; j++) {
unsigned short itm = is->items.syms[j];
int p = item_prod(itm);
int k;
for (k = 0; k < la.cnt; k++) {
int pos = symset_find(&shifts, la.syms[k]);
- if (pos >= 0) {
+ if (pos >= 0 && la.syms[k] != TK_newline) {
printf(" State %d has SHIFT/REDUCE conflict on ", i);
- prtxt(g->symtab[la.syms[k]]->name);
+ cnt++;
+ prtxt(g->symtab[la.syms[k]]->name);
printf(":\n");
report_item(g, shifts.data[pos]);
report_item(g, itm);
- cnt++;
}
pos = symset_find(&reduce, la.syms[k]);
if (pos < 0) {
}
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production. So we need to
+report these when asked.
+
+###### functions
+
+ static int report_left_recursive(struct grammar *g)
+ {
+ int p;
+ int bad_left_recursive = 0;
+
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int sn;
+
+ for (sn = 0; sn < pr->body_size - 1; sn++) {
+ struct symbol *s = pr->body[sn];
+
+ if (s->left_recursive == 1 &&
+ s != pr->head) {
+ if (!bad_left_recursive) {
+ bad_left_recursive = 1;
+ printf("Misplaced left recursive symbols.\n");
+ }
+ printf(" ");
+ prtxt(s->name);
+ printf(" in production [%d]\n", p);
+ s->left_recursive = 2;
+ }
+ }
+ }
+ return bad_left_recursive;
+ }
+
## Generating the parser
The exported part of the parser is the `parse_XX` function, where the name
###### parser_generate
- static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
+ static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
+ struct code_node *pre_reduce)
{
gen_known(f, g);
gen_non_term(f, g);
gen_goto(f, g);
gen_states(f, g);
- gen_reduce(f, g, file);
+ gen_reduce(f, g, file, pre_reduce);
gen_free(f, g);
fprintf(f, "#line 0 \"gen_parser\"\n");
fprintf(f, "\tstruct token_state *tokens;\n");
fprintf(f, "\tconfig->words_marks = known;\n");
fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
- fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
fprintf(f, "\ttokens = token_open(code, config);\n");
fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
fprintf(f, "\ttoken_close(tokens);\n");
### Known words table
The known words table is simply an array of terminal symbols.
-The table of nonterminals used for tracing is a similar array.
+The table of nonterminals used for tracing is a similar array. We
+include virtual symbols in the table of non_terminals to keep the
+numbers right.
###### functions
short reduce_prod;
short reduce_size;
short reduce_sym;
- short shift_sym;
- short starts_line;
+ char starts_line;
+ char newline_only;
short min_prefix;
};
-
###### functions
static void gen_goto(FILE *f, struct grammar *g)
for (i = 0; i < g->states; i++) {
struct itemset *is = g->statetab[i];
int j, prod = -1, prod_len;
- int shift_sym = -1;
- int shift_len = 0, shift_remain = 0;
+
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
- if (bp < pr->body_size) {
- if (shift_sym < 0 ||
- (shift_len == bp && shift_remain > pr->body_size - bp)) {
- shift_sym = pr->body[bp]->num;
- shift_len = bp;
- shift_remain = pr->body_size - bp;
- }
+ if (bp < pr->body_size)
continue;
- }
/* This is what we reduce */
if (prod < 0 || prod_len < pr->body_size) {
prod = p;
}
if (prod >= 0)
- fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
+ fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
i, is->go_to.cnt, i, prod,
g->productions[prod]->body_size,
g->productions[prod]->head->num,
- is->starts_line, is->min_prefix);
+ is->starts_line,
+ g->productions[prod]->line_like,
+ is->min_prefix);
else
- fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
- i, is->go_to.cnt, i, shift_sym,
+ fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
+ i, is->go_to.cnt, i,
is->starts_line, is->min_prefix);
}
fprintf(f, "};\n\n");
The code fragment requires translation when written out. Any `$N` needs to
be converted to a reference either to that buffer (if `$0`) or to the
structure returned by a previous reduction. These pointers need to be cast
-to the appropriate type for each access. All this is handling in
+to the appropriate type for each access. All this is handled in
`gen_code`.
`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
fputs("\n", f);
for (i = 0; i < p->body_size; i++) {
if (p->body[i]->struct_name.txt &&
- p->body[i]->isref &&
- used[i])
+ used[i]) {
// assume this has been copied out
- fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+ if (p->body[i]->isref)
+ fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
+ else
+ fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
+ }
}
free(used);
}
###### functions
- static void gen_reduce(FILE *f, struct grammar *g, char *file)
+ static void gen_reduce(FILE *f, struct grammar *g, char *file,
+ struct code_node *code)
{
int i;
- fprintf(f, "#line 0 \"gen_reduce\"\n");
+ fprintf(f, "#line 1 \"gen_reduce\"\n");
fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
fprintf(f, "{\n");
fprintf(f, "\tint ret_size = 0;\n");
+ if (code)
+ code_node_print(f, code, file);
+ fprintf(f, "#line 4 \"gen_reduce\"\n");
fprintf(f, "\tswitch(prod) {\n");
for (i = 0; i < g->production_count; i++) {
struct production *p = g->productions[i];
fprintf(f, "\tcase %d:\n", i);
- if (p->code.txt)
+ if (p->code.txt) {
+ fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
gen_code(p, f, g);
+ }
if (p->head->struct_name.txt)
fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
For this, the grammar author is required to defined a `free_XX` function for
each structure that is used by a non-terminal. `do_free` will call whichever
is appropriate given a symbol number, and will call `free` (as is
-appropriate for tokens on any terminal symbol.
+appropriate for tokens) on any terminal symbol.
###### functions
To be able to run `mdcode` and `scanner` on the grammar we need to memory
map it.
-One we have extracted the code (with `mdcode`) we expect to find three
+Once we have extracted the code (with `mdcode`) we expect to find three
sections: header, code, and grammar. Anything else that is not
excluded by the `--tag` option is an error.
struct code_node *hdr = NULL;
struct code_node *code = NULL;
struct code_node *gram = NULL;
+ struct code_node *pre_reduce = NULL;
for (s = table; s; s = s->next) {
struct text sec = s->section;
if (tag && !strip_tag(&sec, tag))
code = s->code;
else if (text_is(sec, "grammar"))
gram = s->code;
+ else if (text_is(sec, "reduce"))
+ pre_reduce = s->code;
else {
fprintf(stderr, "Unknown content section: %.*s\n",
s->section.len, s->section.txt);
rv |= 1;
}
-If a headers section is defined, we write it out and include a declaration
+If a "headers" section is defined, we write it out and include a declaration
for the `parse_XX` function so it can be used from separate code.
if (rv == 0 && hdr && outfile) {
if (f) {
if (code)
code_node_print(f, code, infile);
- gen_parser(f, g, infile, name);
+ gen_parser(f, g, infile, name, pre_reduce);
fclose(f);
} else {
fprintf(stderr, "Cannot create %s.c\n",
freeing function. The symbol leads us to the right free function through
`do_free`.
-The `indents` count tracks the line indents with in the symbol or
+The `indents` count tracks the line indents with-in the symbol or
immediately follow it. These are used to allow indent information to
guide parsing and error recovery.
We return whatever `asn` was returned by reducing production zero.
If we can neither shift nor reduce we have an error to handle. We pop
-single entries off the stack until we can shift the `TK_error` symbol, then
-drop input tokens until we find one we can shift into the new error state.
+single entries off the stack until we can shift the `TK_error` symbol,
+then drop input tokens until we find one we can shift into the new error
+state. We need to ensure that something is dropped or shifted after an
+error, or we could get into an infinite loop, only shifting in
+`TK_error`, then reducing. So we track if there has been a shift since
+the last error, and if not the next error always discards one token.
When we find `TK_in` and `TK_out` tokens which report indents we need
to handle them directly as the grammar cannot express what we want to
`TK_in` tokens are easy: we simply update indent count in the top stack frame to
record how many indents there are following the previous token.
-`TK_out` tokens must either be canceled against an indent count
+`TK_out` tokens must be canceled against an indent count
within the stack. If we can reduce some symbols that are all since
the most recent indent, then we do that first. If the minimum prefix
-of the current state then extents back before the most recent indent,
+of the current state then extends back before the most recent indent,
that indent can be cancelled. If the minimum prefix is shorter then
-the indent is premature and we must start error handling, which
-currently doesn't work at all.
+the indent had ended prematurely and we must start error handling, which
+is still a work-in-progress.
`TK_newline` tokens are ignored unless the top stack frame records
that they are permitted. In that case they will not be considered for
shifting if it is possible to reduce some symbols that are all since
-the most recent start of line. This is how a newline forcible
+the most recent start of line. This is how a newline forcibly
terminates any line-like structure - we try to reduce down to at most
one symbol for each line where newlines are allowed.
+A consequence of this is that a rule like
+
+###### Example: newlines - broken
+
+ Newlines ->
+ | NEWLINE Newlines
+ IfStatement -> Newlines if ....
+
+cannot work, as the NEWLINE will never be shifted as the empty string
+will be reduced first. Optional sets of newlines need to be include
+in the thing that preceed:
+
+###### Example: newlines - works
+
+ If -> if
+ | NEWLINE If
+ IfStatement -> If ....
+
+Here the NEWLINE will be shifted because nothing can be reduced until
+the `if` is seen.
+
+When during error handling we discard tokens read in, we want to keep
+discarding until we see one that is recognised. If we had a full set
+of LR(1) grammar states, this would mean looking in the look-ahead set,
+but we don't keep a full look-ahead set. We only record the subset
+that leads to SHIFT. We can, however, deduce the look-ahead set by
+looking at the SHIFT subsets for all states that we can get to by
+reducing zero or more times. So we need a little function which
+checks if a given token is in any of these look-ahead sets.
###### parser includes
#include "parser.h"
+
###### parser_run
+
+ static int in_lookahead(struct token *tk, const struct state *states, int state)
+ {
+ while (state >= 0) {
+ if (search(&states[state], tk->num) >= 0)
+ return 1;
+ if (states[state].reduce_prod < 0)
+ return 0;
+ state = search(&states[state], states[state].reduce_sym);
+ }
+ return 0;
+ }
+
void *parser_run(struct token_state *tokens,
const struct state states[],
int (*do_reduce)(int, void**, struct token_config*, void*),
struct parser p = { 0 };
struct token *tk = NULL;
int accepted = 0;
+ int shift_since_err = 1;
void *ret = NULL;
shift(&p, TK_eof, 0, 1, NULL, states);
parser_trace_action(trace, "Cancel");
continue;
}
- // fall through and force a REDUCE (as 'shift'
- // will fail).
+ // fall through to error handling as both SHIFT and REDUCE
+ // will fail.
}
if (tk->num == TK_newline) {
if (!tos->newline_permitted) {
goto force_reduce;
}
if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
+ shift_since_err = 1;
tk = NULL;
parser_trace_action(trace, "Shift");
continue;
}
force_reduce:
- if (states[tos->state].reduce_prod >= 0) {
+ if (states[tos->state].reduce_prod >= 0 &&
+ states[tos->state].newline_only &&
+ !(tk->num == TK_newline ||
+ tk->num == TK_eof ||
+ tk->num == TK_out ||
+ (tos->indents == 0 && tos->since_newline == 0))) {
+ /* Anything other than newline or out or eof
+ * in an error unless we are already at start
+ * of line, as this production must end at EOL.
+ */
+ } else if (states[tos->state].reduce_prod >= 0) {
void **body;
void *res;
const struct state *nextstate = &states[tos->state];
parser_trace_action(trace, "Reduce");
continue;
}
- if (tk->num == TK_out) {
- // Indent problem - synthesise tokens to get us
- // out of here.
- fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
- shift(&p, states[tos->state].shift_sym,
- 0, 1, tok_copy(*tk), states);
- // FIXME need to report this error somehow
- parser_trace_action(trace, "Synthesize");
- continue;
- }
/* Error. We walk up the stack until we
* find a state which will accept TK_error.
* We then shift in TK_error and see what state
short indents = 0, start_of_line;
err_tk = tok_copy(*tk);
- while (shift(&p, TK_error, 0, 0,
- err_tk, states) == 0
- && p.tos > 0)
+ while (p.tos > 0 &&
+ shift(&p, TK_error, 0, 0,
+ err_tk, states) == 0)
// discard this state
indents += pop(&p, 1, &start_of_line, do_free);
if (p.tos == 0) {
// no state accepted TK_error
break;
}
+ if (!shift_since_err) {
+ /* must discard at least one token to avoid
+ * infinite loop.
+ */
+ if (tk->num == TK_eof)
+ break;
+ free(tk);
+ tk = tok_copy(token_next(tokens));
+ }
+ shift_since_err = 0;
tos = &p.stack[p.tos-1];
- while (search(&states[tos->state], tk->num) < 0 &&
+ while (!in_lookahead(tk, states, tos->state) &&
tk->num != TK_eof) {
free(tk);
tk = tok_copy(token_next(tokens));
+ shift_since_err = 1;
if (tk->num == TK_in)
indents += 1;
if (tk->num == TK_out) {
// FIXME update since_indent here
}
}
- if (p.tos == 0 && tk->num == TK_eof)
- break;
- tos = &p.stack[p.tos-1];
tos->indents += indents;
}
free(tk);
fputs(reserved_words[tk->num], trace);
else
text_dump(trace, tk->txt, 20);
- fputs("]", trace);
+ fprintf(trace, ":%d:%d]", tk->line, tk->col);
}
void parser_trace_action(FILE *trace, char *action)
As `scanner` provides number parsing function using `libgmp` is it not much
work to perform arbitrary rational number calculations.
-This calculator takes one expression, or an equality test per line. The
+This calculator takes one expression, or an equality test, per line. The
results are printed and if any equality test fails, the program exits with
an error.
./parsergen --tag calc -o calc parsergen.mdc
calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
$(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+ calctest : calc
+ ./calc parsergen.mdc
+ demos :: calctest
# calc: header
- #include "number.h"
+ #include "parse_number.h"
// what do we use for a demo-grammar? A calculator of course.
struct number {
mpq_t val;
#include <stdio.h>
#include <malloc.h>
#include <gmp.h>
+ #include <string.h>
#include "mdcode.h"
#include "scanner.h"
- #include "number.h"
#include "parser.h"
#include "calc.h"
free(n);
}
+ static int text_is(struct text t, char *s)
+ {
+ return (strlen(s) == t.len &&
+ strncmp(s, t.txt, t.len) == 0);
+ }
+
int main(int argc, char *argv[])
{
int fd = open(argv[1], O_RDONLY);
int len = lseek(fd, 0, 2);
char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
- struct section *s = code_extract(file, file+len, NULL);
+ struct section *table = code_extract(file, file+len, NULL);
+ struct section *s;
struct token_config config = {
.ignored = (1 << TK_line_comment)
- | (1 << TK_block_comment)
| (1 << TK_in)
| (1 << TK_out),
.number_chars = ".,_+-",
.word_start = "",
.word_cont = "",
};
- parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
- while (s) {
- struct section *t = s->next;
- code_free(s->code);
- free(s);
- s = t;
+ for (s = table; s; s = s->next)
+ if (text_is(s->section, "example: input"))
+ parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+ while (table) {
+ struct section *t = table->next;
+ code_free(table->code);
+ free(table);
+ table = t;
}
exit(0);
}
# calc: grammar
+ $LEFT + -
+ $LEFT * / //
+
Session -> Session Line
| Line
| ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
$number
- Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
- | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
- | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+ Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
+ | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
+ | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
+ | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+ | Expression // Expression ${ {
+ mpz_t z0, z1, z2;
+ mpq_init($0.val);
+ mpz_init(z0); mpz_init(z1); mpz_init(z2);
+ mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+ mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+ mpz_tdiv_q(z0, z1, z2);
+ mpq_set_z($0.val, z0);
+ mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+ } }$
+ | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
+ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
- Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
- | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
- | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+# example: input
- Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
- | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+ 355/113
+ 3.1415926535 - 355/113
+ 2 + 4 * 5
+ 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
+ 10 * 9 / 2
+ 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+
+ 355//113
+
+ error