X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=3f36df9a6873c3c94f2f127d8a0ca6419e50603c;hb=850a39a0a761e0af89c15253f075ecd9e9ecc6ee;hp=478482e0f6b0512b9fc09946b4b1b5ead04fe7fd;hpb=51ba21ffd913b8eb700699567567560f99a2aa9c;p=ocean diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 478482e..3f36df9 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -100,6 +100,7 @@ symbol. struct production { unsigned short precedence; enum assoc assoc; + char line_like; ## production fields }; struct grammar { @@ -277,6 +278,9 @@ declares how it associates. This level is stored in each symbol listed and may be inherited by any production which uses the symbol. A production inherits from the last symbol which has a precedence. +The symbols on the first precedence line have the lowest precedence. +Subsequent lines introduce symbols with higher precedence. + ###### grammar fields struct text current_type; int type_isref; @@ -493,12 +497,17 @@ Now we have all the bits we need to parse a full production. goto abort; } vs = sym_find(g, tk.txt); - if (vs->type != Virtual) { - err = "symbol after $$ must be virtual"; + if (vs->num == TK_newline) + p.line_like = 1; + else if (vs->num == TK_out) + p.line_like = 2; + else if (vs->precedence == 0) { + err = "symbol after $$ must have precedence"; goto abort; + } else { + p.precedence = vs->precedence; + p.assoc = vs->assoc; } - p.precedence = vs->precedence; - p.assoc = vs->assoc; tk = token_next(state); } if (tk.num == TK_open) { @@ -627,6 +636,11 @@ to produce errors that the parser is better positioned to handle. } else if (tk.num == TK_mark && text_is(tk.txt, "$*")) { err = dollar_line(state, g, 1); + } else if (tk.num == TK_mark + && text_is(tk.txt, "//")) { + while (tk.num != TK_newline && + tk.num != TK_eof) + tk = token_next(state); } else { err = "Unrecognised token at start of line."; } @@ -1287,12 +1301,11 @@ When itemsets are created we assign a precedence to the itemset from the complete item, if there is one. We ignore the possibility of there being two and don't (currently) handle precedence in such grammars. When completing a grammar we ignore any item where DOT is -followed by a terminal with a precedence lower (numerically higher) -than that for the itemset. Unless the terminal has right -associativity, we also ignore items where the terminal has the same -precedence. The result is that unwanted items are still in the -itemset, but the terminal doesn't get into the go to set, so the item -is ineffective. +followed by a terminal with a precedence lower than that for the +itemset. Unless the terminal has right associativity, we also ignore +items where the terminal has the same precedence. The result is that +unwanted items are still in the itemset, but the terminal doesn't get +into the go to set, so the item is ineffective. ###### complete itemset for (i = 0; i < is->items.cnt; i++) { @@ -1303,6 +1316,8 @@ is ineffective. struct symbol *s; struct symset LA = INIT_SYMSET; unsigned short sn = 0; + struct symset LAnl = INIT_SYMSET; + unsigned short snnl = 0; if (is->min_prefix == 0 || (bs > 0 && bs < is->min_prefix)) @@ -1311,7 +1326,7 @@ is ineffective. continue; s = pr->body[bs]; if (s->precedence && is->precedence && - is->precedence < s->precedence) + is->precedence > s->precedence) /* This terminal has a low precedence and * shouldn't be shifted */ @@ -1324,11 +1339,11 @@ is ineffective. continue; if (symset_find(&done, s->num) < 0) { symset_add(&done, s->num, 0); - if (s->line_like) - is->starts_line = 1; } if (s->type != Nonterminal) continue; + if (s->line_like) + is->starts_line = 1; again = 1; if (type >= LALR) { // Need the LA set. @@ -1340,6 +1355,10 @@ is ineffective. } sn = save_set(g, LA); LA = set_find(g, sn); + if (symset_find(&LA, TK_newline)) + symset_add(&LAnl, TK_newline, 0); + snnl = save_set(g, LAnl); + LAnl = set_find(g, snnl); } /* Add productions for this symbol */ @@ -1350,19 +1369,25 @@ is ineffective. int itm = item_num(p2, 0); int pos = symset_find(&is->items, itm); if (pos < 0) { - symset_add(&is->items, itm, sn); + if (g->productions[p2]->line_like) + symset_add(&is->items, itm, snnl); + else + symset_add(&is->items, itm, sn); /* Will have re-ordered, so start * from beginning again */ i = -1; } else if (type >= LALR) { struct symset ss = set_find(g, is->items.data[pos]); struct symset tmp = INIT_SYMSET; + struct symset *la = &LA; + if (g->productions[p2]->line_like) + la = &LAnl; symset_union(&tmp, &ss); - if (symset_union(&tmp, &LA)) { + if (symset_union(&tmp, la)) { is->items.data[pos] = save_set(g, tmp); i = -1; - }else + } else symset_free(tmp); } } @@ -1404,8 +1429,7 @@ with a pre-existing itemset). pos = symset_find(&newitemset, pr->head->num); if (bp + 1 == pr->body_size && pr->precedence > 0 && - (precedence == 0 || - pr->precedence < precedence)) { + pr->precedence > precedence) { // new itemset is reducible and has a precedence. precedence = pr->precedence; assoc = pr->assoc; @@ -1634,6 +1658,10 @@ it up a bit. First the items, with production number and associativity. printf(" [%d%s]", s->precedence, assoc_names[s->assoc]); } + if (pr->line_like == 1) + printf(" $$NEWLINE"); + else if (pr->line_like) + printf(" $$OUT"); printf("\n"); } @@ -1772,14 +1800,10 @@ terminals to items where that terminal could be shifted and another which maps terminals to items that could be reduced when the terminal is in look-ahead. We report when we get conflicts between the two. -As a special case, if we find a SHIFT/REDUCE conflict, where a -terminal that could be shifted is in the lookahead set of some -reducable item, then set check if the reducable item also have -`TK_newline` in its lookahead set. If it does, then a newline will -force the reduction, but anything else can reasonably be shifted, so -that isn't really a conflict. Such apparent conflicts do not get -counted, and are reported as non-critical. This will not affect a -"traditional" grammar that does not include newlines as token. +As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE +terminal, we ignore it. NEWLINES are handled specially with its own +rules for when to shift and when to reduce. Conflicts are expected, +but handled internally. static int conflicts_slr(struct grammar *g, enum grammar_type type) { @@ -1799,14 +1823,20 @@ counted, and are reported as non-critical. This will not affect a int p = item_prod(itm); int bp = item_index(itm); struct production *pr = g->productions[p]; + struct symbol *s; - if (bp < pr->body_size && - pr->body[bp]->type == Terminal) { - /* shiftable */ - int sym = pr->body[bp]->num; - if (symset_find(&shifts, sym) < 0) - symset_add(&shifts, sym, itm); - } + if (bp >= pr->body_size || + pr->body[bp]->type != Terminal) + /* not shiftable */ + continue; + + s = pr->body[bp]; + if (s->precedence && is->precedence) + /* Precedence resolves this, so no conflict */ + continue; + + if (symset_find(&shifts, s->num) < 0) + symset_add(&shifts, s->num, itm); } /* Now look for reductions and conflicts */ for (j = 0; j < is->items.cnt; j++) { @@ -1826,13 +1856,10 @@ counted, and are reported as non-critical. This will not affect a int k; for (k = 0; k < la.cnt; k++) { int pos = symset_find(&shifts, la.syms[k]); - if (pos >= 0) { - if (symset_find(&la, TK_newline) < 0) { - printf(" State %d has SHIFT/REDUCE conflict on ", i); - cnt++; - } else - printf(" State %d has non-critical SHIFT/REDUCE conflict on ", i); - prtxt(g->symtab[la.syms[k]]->name); + if (pos >= 0 && la.syms[k] != TK_newline) { + printf(" State %d has SHIFT/REDUCE conflict on ", i); + cnt++; + prtxt(g->symtab[la.syms[k]]->name); printf(":\n"); report_item(g, shifts.data[pos]); report_item(g, itm); @@ -1889,7 +1916,6 @@ pieces of code provided in the grammar file, so they are generated first. fprintf(f, "\tstruct token_state *tokens;\n"); fprintf(f, "\tconfig->words_marks = known;\n"); fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n"); - fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n"); fprintf(f, "\ttokens = token_open(code, config);\n"); fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n"); fprintf(f, "\ttoken_close(tokens);\n"); @@ -1900,7 +1926,9 @@ pieces of code provided in the grammar file, so they are generated first. ### Known words table The known words table is simply an array of terminal symbols. -The table of nonterminals used for tracing is a similar array. +The table of nonterminals used for tracing is a similar array. We +include virtual symbols in the table of non_terminals to keep the +numbers right. ###### functions @@ -1925,7 +1953,7 @@ The table of nonterminals used for tracing is a similar array. for (i = TK_reserved; i < g->num_syms; i++) - if (g->symtab[i]->type == Nonterminal) + if (g->symtab[i]->type != Terminal) fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, g->symtab[i]->name.txt); fprintf(f, "};\n\n"); @@ -1955,7 +1983,8 @@ The go to table is stored in a simple array of `sym` and corresponding short reduce_prod; short reduce_size; short reduce_sym; - short starts_line; + char starts_line; + char newline_only; short min_prefix; }; @@ -2004,13 +2033,15 @@ The go to table is stored in a simple array of `sym` and corresponding } if (prod >= 0) - fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n", + fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n", i, is->go_to.cnt, i, prod, g->productions[prod]->body_size, g->productions[prod]->head->num, - is->starts_line, is->min_prefix); + is->starts_line, + g->productions[prod]->line_like, + is->min_prefix); else - fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n", + fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n", i, is->go_to.cnt, i, is->starts_line, is->min_prefix); } @@ -2103,10 +2134,13 @@ automatically freed. This is equivalent to assigning `NULL` to the pointer. fputs("\n", f); for (i = 0; i < p->body_size; i++) { if (p->body[i]->struct_name.txt && - p->body[i]->isref && - used[i]) + used[i]) { // assume this has been copied out - fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i); + if (p->body[i]->isref) + fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i); + else + fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt); + } } free(used); } @@ -2839,7 +2873,17 @@ checks if a given token is in any of these look-ahead sets. continue; } force_reduce: - if (states[tos->state].reduce_prod >= 0) { + if (states[tos->state].reduce_prod >= 0 && + states[tos->state].newline_only && + !(tk->num == TK_newline || + tk->num == TK_eof || + tk->num == TK_out || + (tos->indents == 0 && tos->since_newline == 0))) { + /* Anything other than newline or out or eof + * in an error unless we are already at start + * of line, as this production must end at EOL. + */ + } else if (states[tos->state].reduce_prod >= 0) { void **body; void *res; const struct state *nextstate = &states[tos->state]; @@ -2989,7 +3033,7 @@ end inside square brackets. fputs(reserved_words[tk->num], trace); else text_dump(trace, tk->txt, 20); - fputs("]", trace); + fprintf(trace, ":%d:%d]", tk->line, tk->col); } void parser_trace_action(FILE *trace, char *action) @@ -3020,7 +3064,7 @@ an error. # calc: header - #include "number.h" + #include "parse_number.h" // what do we use for a demo-grammar? A calculator of course. struct number { mpq_t val; @@ -3040,7 +3084,6 @@ an error. #include #include "mdcode.h" #include "scanner.h" - #include "number.h" #include "parser.h" #include "calc.h" @@ -3066,7 +3109,6 @@ an error. struct section *s; struct token_config config = { .ignored = (1 << TK_line_comment) - | (1 << TK_block_comment) | (1 << TK_in) | (1 << TK_out), .number_chars = ".,_+-", @@ -3087,8 +3129,8 @@ an error. # calc: grammar - $LEFT * / $LEFT + - + $LEFT * / // Session -> Session Line | Line @@ -3116,6 +3158,16 @@ an error. | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ + | Expression // Expression ${ { + mpz_t z0, z1, z2; + mpq_init($0.val); + mpz_init(z0); mpz_init(z1); mpz_init(z2); + mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val)); + mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val)); + mpz_tdiv_q(z0, z1, z2); + mpq_set_z($0.val, z0); + mpz_clear(z0); mpz_clear(z1); mpz_clear(z2); + } }$ | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ @@ -3128,4 +3180,6 @@ an error. 10 * 9 / 2 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1 + 355//113 + error