X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;ds=sidebyside;f=csrc%2Fparsergen.mdc;h=13b946fb0e992580c8c7b2d410a003415bb3eea3;hb=2594d150e123c29949493c37c8b9c14aa6abe5a4;hp=82d09b3ed1b497362b77ff7a264dff837499c39a;hpb=1bfa4fc8cfc5fafa5b2b3ae6bdd9b77a4242e74a;p=ocean diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 82d09b3..13b946f 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -400,6 +400,21 @@ The `<` may be included for symbols declared as storing a reference (not a structure) and means that the reference is being moved out, so it will not automatically be freed. +Symbols that are left-recursive are a little special. These are symbols +that both the head of a production and the first body symbol of the same +production. They are problematic when they appear in other productions +elsewhere than at the end, and when indenting is used to describe +structure. In this case there is no way for a smaller indent to ensure +the left-recursive symbol cannot be extended. When it appears at the +end of a production, that production can be reduced to ensure the symbol +isn't extended. So we record left-recursive symbols while reading the +grammar, and produce a warning when reporting the grammar if they are +found in an unsuitable place. + +A symbol that is only left recursive in a production where it is +followed by newline does not cause these problems because the newline +will effectively terminate it. + While building productions we will need to add to an array which needs to grow dynamically. @@ -460,6 +475,7 @@ Now we have all the bits we need to parse a full production. ###### symbol fields int first_production; + int left_recursive; ###### functions static char *parse_production(struct grammar *g, @@ -499,8 +515,10 @@ Now we have all the bits we need to parse a full production. vs = sym_find(g, tk.txt); if (vs->num == TK_newline) p.line_like = 1; - else if (vs->type != Virtual) { - err = "symbol after $$ must be virtual"; + else if (vs->num == TK_out) + p.line_like = 2; + else if (vs->precedence == 0) { + err = "symbol after $$ must have precedence"; goto abort; } else { p.precedence = vs->precedence; @@ -517,6 +535,11 @@ Now we have all the bits we need to parse a full production. } tk = token_next(state); } + if (p.body_size >= 2 && + p.body[0] == p.head && + p.body[1]->num != TK_newline) + p.head->left_recursive = 1; + if (tk.num != TK_newline && tk.num != TK_eof) { err = "stray tokens at end of line"; goto abort; @@ -634,6 +657,11 @@ to produce errors that the parser is better positioned to handle. } else if (tk.num == TK_mark && text_is(tk.txt, "$*")) { err = dollar_line(state, g, 1); + } else if (tk.num == TK_mark + && text_is(tk.txt, "//")) { + while (tk.num != TK_newline && + tk.num != TK_eof) + tk = token_next(state); } else { err = "Unrecognised token at start of line."; } @@ -1309,6 +1337,8 @@ into the go to set, so the item is ineffective. struct symbol *s; struct symset LA = INIT_SYMSET; unsigned short sn = 0; + struct symset LAnl = INIT_SYMSET; + unsigned short snnl = 0; if (is->min_prefix == 0 || (bs > 0 && bs < is->min_prefix)) @@ -1330,26 +1360,26 @@ into the go to set, so the item is ineffective. continue; if (symset_find(&done, s->num) < 0) { symset_add(&done, s->num, 0); - if (s->line_like) - is->starts_line = 1; } if (s->type != Nonterminal) continue; + if (s->line_like) + is->starts_line = 1; again = 1; if (type >= LALR) { // Need the LA set. int to_end; add_first(pr, bs+1, &LA, g, &to_end); if (to_end) { - if (pr->line_like) - symset_add(&LA, TK_newline, 0); - else { - struct symset ss = set_find(g, is->items.data[i]); - symset_union(&LA, &ss); - } + struct symset ss = set_find(g, is->items.data[i]); + symset_union(&LA, &ss); } sn = save_set(g, LA); LA = set_find(g, sn); + if (symset_find(&LA, TK_newline)) + symset_add(&LAnl, TK_newline, 0); + snnl = save_set(g, LAnl); + LAnl = set_find(g, snnl); } /* Add productions for this symbol */ @@ -1360,19 +1390,25 @@ into the go to set, so the item is ineffective. int itm = item_num(p2, 0); int pos = symset_find(&is->items, itm); if (pos < 0) { - symset_add(&is->items, itm, sn); + if (g->productions[p2]->line_like) + symset_add(&is->items, itm, snnl); + else + symset_add(&is->items, itm, sn); /* Will have re-ordered, so start * from beginning again */ i = -1; } else if (type >= LALR) { struct symset ss = set_find(g, is->items.data[pos]); struct symset tmp = INIT_SYMSET; + struct symset *la = &LA; + if (g->productions[p2]->line_like) + la = &LAnl; symset_union(&tmp, &ss); - if (symset_union(&tmp, &LA)) { + if (symset_union(&tmp, la)) { is->items.data[pos] = save_set(g, tmp); i = -1; - }else + } else symset_free(tmp); } } @@ -1509,6 +1545,11 @@ changeover point in `first_nonterm`. snum++; } g->first_nonterm = snum; + for (s = g->syms; s; s = s->next) + if (s->num < 0 && s->type != Virtual) { + s->num = snum; + snum++; + } for (s = g->syms; s; s = s->next) if (s->num < 0) { s->num = snum; @@ -1547,7 +1588,7 @@ all the tables that have been generated, plus a description of any conflicts. if (g->follow) report_follow(g); report_itemsets(g); - return report_conflicts(g, type); + return report_conflicts(g, type) + report_left_recursive(g); } Firstly we have the complete list of symbols, together with the @@ -1643,8 +1684,10 @@ it up a bit. First the items, with production number and associativity. printf(" [%d%s]", s->precedence, assoc_names[s->assoc]); } - if (pr->line_like) + if (pr->line_like == 1) printf(" $$NEWLINE"); + else if (pr->line_like) + printf(" $$OUT"); printf("\n"); } @@ -1783,14 +1826,10 @@ terminals to items where that terminal could be shifted and another which maps terminals to items that could be reduced when the terminal is in look-ahead. We report when we get conflicts between the two. -As a special case, if we find a SHIFT/REDUCE conflict, where a -terminal that could be shifted is in the lookahead set of some -reducable item, then set check if the reducable item also have -`TK_newline` in its lookahead set. If it does, then a newline will -force the reduction, but anything else can reasonably be shifted, so -that isn't really a conflict. Such apparent conflicts do not get -counted, and are reported as non-critical. This will not affect a -"traditional" grammar that does not include newlines as token. +As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE +terminal, we ignore it. NEWLINES are handled specially with its own +rules for when to shift and when to reduce. Conflicts are expected, +but handled internally. static int conflicts_slr(struct grammar *g, enum grammar_type type) { @@ -1844,12 +1883,9 @@ counted, and are reported as non-critical. This will not affect a for (k = 0; k < la.cnt; k++) { int pos = symset_find(&shifts, la.syms[k]); if (pos >= 0 && la.syms[k] != TK_newline) { - if (symset_find(&la, TK_newline) < 0) { - printf(" State %d has SHIFT/REDUCE conflict on ", i); - cnt++; - } else - printf(" State %d has non-critical SHIFT/REDUCE conflict on ", i); - prtxt(g->symtab[la.syms[k]]->name); + printf(" State %d has SHIFT/REDUCE conflict on ", i); + cnt++; + prtxt(g->symtab[la.syms[k]]->name); printf(":\n"); report_item(g, shifts.data[pos]); report_item(g, itm); @@ -1873,6 +1909,43 @@ counted, and are reported as non-critical. This will not affect a return cnt; } + +### Reporting non-final left-recursive symbols. + +Left recursive symbols are a problem for parses that honour indentation +when they appear other than at the end of the production. So we need to +report these when asked. + +###### functions + + static int report_left_recursive(struct grammar *g) + { + int p; + int bad_left_recursive = 0; + + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int sn; + + for (sn = 0; sn < pr->body_size - 1; sn++) { + struct symbol *s = pr->body[sn]; + + if (s->left_recursive == 1 && + s != pr->head) { + if (!bad_left_recursive) { + bad_left_recursive = 1; + printf("Misplaced left recursive symbols.\n"); + } + printf(" "); + prtxt(s->name); + printf(" in production [%d]\n", p); + s->left_recursive = 2; + } + } + } + return bad_left_recursive; + } + ## Generating the parser The exported part of the parser is the `parse_XX` function, where the name @@ -1906,7 +1979,6 @@ pieces of code provided in the grammar file, so they are generated first. fprintf(f, "\tstruct token_state *tokens;\n"); fprintf(f, "\tconfig->words_marks = known;\n"); fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n"); - fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n"); fprintf(f, "\ttokens = token_open(code, config);\n"); fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n"); fprintf(f, "\ttoken_close(tokens);\n"); @@ -1944,7 +2016,7 @@ numbers right. for (i = TK_reserved; i < g->num_syms; i++) - if (g->symtab[i]->type != Terminal) + if (g->symtab[i]->type == Nonterminal) fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, g->symtab[i]->name.txt); fprintf(f, "};\n\n"); @@ -2866,9 +2938,13 @@ checks if a given token is in any of these look-ahead sets. force_reduce: if (states[tos->state].reduce_prod >= 0 && states[tos->state].newline_only && - tk->num != TK_newline && tk->num != TK_eof && tk->num != TK_out) { - /* Anything other than newline in an error as this - * production must end at EOL + !(tk->num == TK_newline || + tk->num == TK_eof || + tk->num == TK_out || + (tos->indents == 0 && tos->since_newline == 0))) { + /* Anything other than newline or out or eof + * in an error unless we are already at start + * of line, as this production must end at EOL. */ } else if (states[tos->state].reduce_prod >= 0) { void **body; @@ -3020,7 +3096,7 @@ end inside square brackets. fputs(reserved_words[tk->num], trace); else text_dump(trace, tk->txt, 20); - fputs("]", trace); + fprintf(trace, ":%d:%d]", tk->line, tk->col); } void parser_trace_action(FILE *trace, char *action) @@ -3051,7 +3127,7 @@ an error. # calc: header - #include "number.h" + #include "parse_number.h" // what do we use for a demo-grammar? A calculator of course. struct number { mpq_t val; @@ -3071,7 +3147,6 @@ an error. #include #include "mdcode.h" #include "scanner.h" - #include "number.h" #include "parser.h" #include "calc.h" @@ -3097,7 +3172,6 @@ an error. struct section *s; struct token_config config = { .ignored = (1 << TK_line_comment) - | (1 << TK_block_comment) | (1 << TK_in) | (1 << TK_out), .number_chars = ".,_+-", @@ -3119,7 +3193,7 @@ an error. # calc: grammar $LEFT + - - $LEFT * / + $LEFT * / // Session -> Session Line | Line @@ -3147,6 +3221,16 @@ an error. | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ + | Expression // Expression ${ { + mpz_t z0, z1, z2; + mpq_init($0.val); + mpz_init(z0); mpz_init(z1); mpz_init(z2); + mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val)); + mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val)); + mpz_tdiv_q(z0, z1, z2); + mpq_set_z($0.val, z0); + mpz_clear(z0); mpz_clear(z1); mpz_clear(z2); + } }$ | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ @@ -3159,4 +3243,6 @@ an error. 10 * 9 / 2 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1 + 355//113 + error