X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;ds=sidebyside;f=csrc%2Fparsergen.mdc;h=66a41b5f5a297abd1a90510c23d1d40e5e89e676;hb=229d6941cd1da3ba78d38e093dc51246c081a847;hp=3f36df9a6873c3c94f2f127d8a0ca6419e50603c;hpb=850a39a0a761e0af89c15253f075ecd9e9ecc6ee;p=ocean diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 3f36df9..66a41b5 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -151,11 +151,17 @@ those which don't. There are also "virtual" symbols used for precedence marking discussed later, and sometimes we won't know what type a symbol is yet. +To help with code safety it is possible to declare the terminal symbols. +If this is done, then any symbol used in a production that does not +appear in a head and is not declared is treated as an error. + ###### forward declarations enum symtype { Unknown, Virtual, Terminal, Nonterminal }; char *symtypes = "UVTN"; ###### symbol fields enum symtype type; +###### grammar fields + int terminals_declared; Symbols can be either `TK_ident` or `TK_mark`. They are saved in a table of known symbols and the resulting parser will report them as @@ -241,9 +247,10 @@ symbol, but its type might be `Unknown`. ### Data types and precedence. -Data type specification and precedence specification are both -introduced by a dollar sign at the start of the line. If the next -word is `LEFT`, `RIGHT` or `NON`, then the line specifies a +Data type specification, precedence specification, and declaration of +terminals are all introduced by a dollar sign at the start of the line. +If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a +precedence, if it is `TERM` the the line declares terminals without precedence, otherwise it specifies a data type. The data type name is simply stored and applied to the head of all @@ -296,6 +303,7 @@ Subsequent lines introduce symbols with higher precedence. struct token t = token_next(ts); char *err; enum assoc assoc; + int term = 0; int found; if (t.num != TK_ident) { @@ -308,7 +316,10 @@ Subsequent lines introduce symbols with higher precedence. assoc = Right; else if (text_is(t.txt, "NON")) assoc = Non; - else { + else if (text_is(t.txt, "TERM")) { + term = 1; + g->terminals_declared = 1; + } else { g->current_type = t.txt; g->type_isref = isref; if (text_is(t.txt, "void")) @@ -326,7 +337,7 @@ Subsequent lines introduce symbols with higher precedence. goto abort; } - // This is a precedence line, need some symbols. + // This is a precedence or TERM line, need some symbols. found = 0; g->prec_levels += 1; t = token_next(ts); @@ -340,6 +351,10 @@ Subsequent lines introduce symbols with higher precedence. err = "$$ must be followed by a word"; goto abort; } + if (term) { + err = "Virtual symbols not permitted on $TERM line"; + goto abort; + } } else if (t.num != TK_ident && t.num != TK_mark) { err = "Illegal token in precedence line"; @@ -347,17 +362,19 @@ Subsequent lines introduce symbols with higher precedence. } s = sym_find(g, t.txt); if (s->type != Unknown) { - err = "Symbols in precedence line must not already be known."; + err = "Symbols in precedence/TERM line must not already be known."; goto abort; } s->type = type; - s->precedence = g->prec_levels; - s->assoc = assoc; + if (!term) { + s->precedence = g->prec_levels; + s->assoc = assoc; + } found += 1; t = token_next(ts); } if (found == 0) - err = "No symbols given on precedence line"; + err = "No symbols given on precedence/TERM line"; goto abort; return NULL; abort: @@ -400,6 +417,21 @@ The `<` may be included for symbols declared as storing a reference (not a structure) and means that the reference is being moved out, so it will not automatically be freed. +Symbols that are left-recursive are a little special. These are symbols +that both the head of a production and the first body symbol of the same +production. They are problematic when they appear in other productions +elsewhere than at the end, and when indenting is used to describe +structure. In this case there is no way for a smaller indent to ensure +the left-recursive symbol cannot be extended. When it appears at the +end of a production, that production can be reduced to ensure the symbol +isn't extended. So we record left-recursive symbols while reading the +grammar, and produce a warning when reporting the grammar if they are +found in an unsuitable place. + +A symbol that is only left recursive in a production where it is +followed by newline does not cause these problems because the newline +will effectively terminate it. + While building productions we will need to add to an array which needs to grow dynamically. @@ -460,6 +492,7 @@ Now we have all the bits we need to parse a full production. ###### symbol fields int first_production; + int left_recursive; ###### functions static char *parse_production(struct grammar *g, @@ -476,8 +509,10 @@ Now we have all the bits we need to parse a full production. tk = token_next(state); while (tk.num == TK_ident || tk.num == TK_mark) { struct symbol *bs = sym_find(g, tk.txt); - if (bs->type == Unknown) - bs->type = Terminal; + if (bs->type == Unknown) { + if (!g->terminals_declared) + bs->type = Terminal; + } if (bs->type == Virtual) { err = "Virtual symbol not permitted in production"; goto abort; @@ -519,6 +554,11 @@ Now we have all the bits we need to parse a full production. } tk = token_next(state); } + if (p.body_size >= 2 && + p.body[0] == p.head && + p.body[1]->num != TK_newline) + p.head->left_recursive = 1; + if (tk.num != TK_newline && tk.num != TK_eof) { err = "stray tokens at end of line"; goto abort; @@ -648,6 +688,21 @@ to produce errors that the parser is better positioned to handle. goto abort; } token_close(state); + if (g->terminals_declared) { + struct symbol *s; + int errs = 0; + for (s = g->syms; s; s = s->next) { + if (s->type != Unknown) + continue; + errs += 1; + fprintf(stderr, "Token %.*s not declared\n", + s->name.len, s->name.txt); + } + if (errs) { + free(g); + g = NULL; + } + } return g; abort: fprintf(stderr, "Error at line %d: %s\n", @@ -1524,6 +1579,11 @@ changeover point in `first_nonterm`. snum++; } g->first_nonterm = snum; + for (s = g->syms; s; s = s->next) + if (s->num < 0 && s->type != Virtual) { + s->num = snum; + snum++; + } for (s = g->syms; s; s = s->next) if (s->num < 0) { s->num = snum; @@ -1562,7 +1622,7 @@ all the tables that have been generated, plus a description of any conflicts. if (g->follow) report_follow(g); report_itemsets(g); - return report_conflicts(g, type); + return report_conflicts(g, type) + report_left_recursive(g); } Firstly we have the complete list of symbols, together with the @@ -1883,6 +1943,43 @@ but handled internally. return cnt; } + +### Reporting non-final left-recursive symbols. + +Left recursive symbols are a problem for parses that honour indentation +when they appear other than at the end of the production. So we need to +report these when asked. + +###### functions + + static int report_left_recursive(struct grammar *g) + { + int p; + int bad_left_recursive = 0; + + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int sn; + + for (sn = 0; sn < pr->body_size - 1; sn++) { + struct symbol *s = pr->body[sn]; + + if (s->left_recursive == 1 && + s != pr->head) { + if (!bad_left_recursive) { + bad_left_recursive = 1; + printf("Misplaced left recursive symbols.\n"); + } + printf(" "); + prtxt(s->name); + printf(" in production [%d]\n", p); + s->left_recursive = 2; + } + } + } + return bad_left_recursive; + } + ## Generating the parser The exported part of the parser is the `parse_XX` function, where the name @@ -1953,7 +2050,7 @@ numbers right. for (i = TK_reserved; i < g->num_syms; i++) - if (g->symtab[i]->type != Terminal) + if (g->symtab[i]->type == Nonterminal) fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, g->symtab[i]->name.txt); fprintf(f, "};\n\n"); @@ -2716,8 +2813,12 @@ accepted the input and can finish. We return whatever `asn` was returned by reducing production zero. If we can neither shift nor reduce we have an error to handle. We pop -single entries off the stack until we can shift the `TK_error` symbol, then -drop input tokens until we find one we can shift into the new error state. +single entries off the stack until we can shift the `TK_error` symbol, +then drop input tokens until we find one we can shift into the new error +state. We need to ensure that something is dropped or shifted after an +error, or we could get into an infinite loop, only shifting in +`TK_error`, then reducing. So we track if there has been a shift since +the last error, and if not the next error always discards one token. When we find `TK_in` and `TK_out` tokens which report indents we need to handle them directly as the grammar cannot express what we want to @@ -2761,11 +2862,11 @@ in the thing that preceed: Here the NEWLINE will be shifted because nothing can be reduced until the `if` is seen. -When, during error handling, we discard token read in, we want to keep +When during error handling we discard tokens read in, we want to keep discarding until we see one that is recognised. If we had a full set -of LR(1) grammar states, this will mean looking in the look-ahead set, +of LR(1) grammar states, this would mean looking in the look-ahead set, but we don't keep a full look-ahead set. We only record the subset -that leads to SHIFT. We can, however, deduce the look-ahead set but +that leads to SHIFT. We can, however, deduce the look-ahead set by looking at the SHIFT subsets for all states that we can get to by reducing zero or more times. So we need a little function which checks if a given token is in any of these look-ahead sets. @@ -2797,6 +2898,7 @@ checks if a given token is in any of these look-ahead sets. struct parser p = { 0 }; struct token *tk = NULL; int accepted = 0; + int shift_since_err = 1; void *ret = NULL; shift(&p, TK_eof, 0, 1, NULL, states); @@ -2868,6 +2970,7 @@ checks if a given token is in any of these look-ahead sets. goto force_reduce; } if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) { + shift_since_err = 1; tk = NULL; parser_trace_action(trace, "Shift"); continue; @@ -2932,11 +3035,22 @@ checks if a given token is in any of these look-ahead sets. // no state accepted TK_error break; } + if (!shift_since_err) { + /* must discard at least one token to avoid + * infinite loop. + */ + if (tk->num == TK_eof) + break; + free(tk); + tk = tok_copy(token_next(tokens)); + } + shift_since_err = 0; tos = &p.stack[p.tos-1]; while (!in_lookahead(tk, states, tos->state) && tk->num != TK_eof) { free(tk); tk = tok_copy(token_next(tokens)); + shift_since_err = 1; if (tk->num == TK_in) indents += 1; if (tk->num == TK_out) {