(not a structure) and means that the reference is being moved out, so
it will not automatically be freed.
+Symbols that are left-recursive are a little special. These are symbols
+that both the head of a production and the first body symbol of the same
+production. They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure. In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended. When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended. So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
+
While building productions we will need to add to an array which needs to
grow dynamically.
###### symbol fields
int first_production;
+ int left_recursive;
###### functions
static char *parse_production(struct grammar *g,
}
tk = token_next(state);
}
+ if (p.body_size >= 2 &&
+ p.body[0] == p.head &&
+ p.body[1]->num != TK_newline)
+ p.head->left_recursive = 1;
+
if (tk.num != TK_newline && tk.num != TK_eof) {
err = "stray tokens at end of line";
goto abort;
} else if (tk.num == TK_mark
&& text_is(tk.txt, "$*")) {
err = dollar_line(state, g, 1);
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "//")) {
+ while (tk.num != TK_newline &&
+ tk.num != TK_eof)
+ tk = token_next(state);
} else {
err = "Unrecognised token at start of line.";
}
continue;
if (symset_find(&done, s->num) < 0) {
symset_add(&done, s->num, 0);
- if (s->line_like)
- is->starts_line = 1;
}
if (s->type != Nonterminal)
continue;
+ if (s->line_like)
+ is->starts_line = 1;
again = 1;
if (type >= LALR) {
// Need the LA set.
snum++;
}
g->first_nonterm = snum;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0 && s->type != Virtual) {
+ s->num = snum;
+ snum++;
+ }
for (s = g->syms; s; s = s->next)
if (s->num < 0) {
s->num = snum;
if (g->follow)
report_follow(g);
report_itemsets(g);
- return report_conflicts(g, type);
+ return report_conflicts(g, type) + report_left_recursive(g);
}
Firstly we have the complete list of symbols, together with the
return cnt;
}
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production. So we need to
+report these when asked.
+
+###### functions
+
+ static int report_left_recursive(struct grammar *g)
+ {
+ int p;
+ int bad_left_recursive = 0;
+
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int sn;
+
+ for (sn = 0; sn < pr->body_size - 1; sn++) {
+ struct symbol *s = pr->body[sn];
+
+ if (s->left_recursive == 1 &&
+ s != pr->head) {
+ if (!bad_left_recursive) {
+ bad_left_recursive = 1;
+ printf("Misplaced left recursive symbols.\n");
+ }
+ printf(" ");
+ prtxt(s->name);
+ printf(" in production [%d]\n", p);
+ s->left_recursive = 2;
+ }
+ }
+ }
+ return bad_left_recursive;
+ }
+
## Generating the parser
The exported part of the parser is the `parse_XX` function, where the name
fprintf(f, "\tstruct token_state *tokens;\n");
fprintf(f, "\tconfig->words_marks = known;\n");
fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
- fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
fprintf(f, "\ttokens = token_open(code, config);\n");
fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
fprintf(f, "\ttoken_close(tokens);\n");
for (i = TK_reserved;
i < g->num_syms;
i++)
- if (g->symtab[i]->type != Terminal)
+ if (g->symtab[i]->type == Nonterminal)
fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
g->symtab[i]->name.txt);
fprintf(f, "};\n\n");
fputs(reserved_words[tk->num], trace);
else
text_dump(trace, tk->txt, 20);
- fputs("]", trace);
+ fprintf(trace, ":%d:%d]", tk->line, tk->col);
}
void parser_trace_action(FILE *trace, char *action)
# calc: header
- #include "number.h"
+ #include "parse_number.h"
// what do we use for a demo-grammar? A calculator of course.
struct number {
mpq_t val;
#include <string.h>
#include "mdcode.h"
#include "scanner.h"
- #include "number.h"
#include "parser.h"
#include "calc.h"
struct section *s;
struct token_config config = {
.ignored = (1 << TK_line_comment)
- | (1 << TK_block_comment)
| (1 << TK_in)
| (1 << TK_out),
.number_chars = ".,_+-",
# calc: grammar
$LEFT + -
- $LEFT * /
+ $LEFT * / //
Session -> Session Line
| Line
| Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
| Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+ | Expression // Expression ${ {
+ mpz_t z0, z1, z2;
+ mpq_init($0.val);
+ mpz_init(z0); mpz_init(z1); mpz_init(z2);
+ mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+ mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+ mpz_tdiv_q(z0, z1, z2);
+ mpq_set_z($0.val, z0);
+ mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+ } }$
| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
10 * 9 / 2
1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+ 355//113
+
error