X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=78ff5435b8086fcd720674cfcb72c13b3a99d919;hb=97b1522ec76e072e927ef3e5f1f917f2a92236db;hp=c25a87e1f621fd3ffa95d4a809c2276a235b2476;hpb=d28c54dc30f0b4322ffb2ecbbfcee9a566052d4b;p=ocean diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index c25a87e..78ff543 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -877,7 +877,7 @@ changes happen. } } -### Setting `can_eol` and `line_like` +### Setting `line_like` In order to be able to ignore newline tokens when not relevant, but still include them in the parse when needed, we will need to know @@ -885,30 +885,26 @@ which states can start a "line-like" section of code. We ignore newlines when there is an indent since the most recent start of a line-like symbol. -To know which symbols are line-like, we first need to know which -symbols start with a NEWLINE token. Any symbol which is followed by a -NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol. -Certainly when trying to parse one of these we must take note of NEWLINEs. +A "line_like" symbol is simply any symbol that can derive a NEWLINE. +If a symbol cannot derive a NEWLINE, then it is only part of a line - +so is word-like. If it can derive a NEWLINE, then we consider it to +be like a line. -Clearly the `TK_newline` token can start with a NEWLINE. Any symbol -which is the head of a production that contains a starts-with-NEWLINE -symbol preceeded only by nullable symbols is also a -starts-with-NEWLINE symbol. We use a new field `can_eol` to record -this attribute of symbols, and compute it in a repetitive manner -similar to `set_nullable`. -Once we have that, we can determine which symbols are `line_like` by -seeing which are followed by a `can_eol` symbol in any production. +Clearly the `TK_newline` token can derive a NEWLINE. Any symbol which +is the head of a production that contains a line_like symbol is also a +line-like symbol. We use a new field `line_like` to record this +attribute of symbols, and compute it in a repetitive manner similar to +`set_nullable`. ###### symbol fields - int can_eol; int line_like; ###### functions - static void set_can_eol(struct grammar *g) + static void set_line_like(struct grammar *g) { int check_again = 1; - g->symtab[TK_newline]->can_eol = 1; + g->symtab[TK_newline]->line_like = 1; while (check_again) { int p; check_again = 0; @@ -916,35 +912,20 @@ seeing which are followed by a `can_eol` symbol in any production. struct production *pr = g->productions[p]; int s; - if (pr->head->can_eol) + if (pr->head->line_like) continue; for (s = 0 ; s < pr->body_size; s++) { - if (pr->body[s]->can_eol) { - pr->head->can_eol = 1; + if (pr->body[s]->line_like) { + pr->head->line_like = 1; check_again = 1; break; } - if (!pr->body[s]->nullable) - break; } } } } - static void set_line_like(struct grammar *g) - { - int p; - for (p = 0; p < g->production_count; p++) { - struct production *pr = g->productions[p]; - int s; - - for (s = 1; s < pr->body_size; s++) - if (pr->body[s]->can_eol) - pr->body[s-1]->line_like = 1; - } - } - ### Building the `first` sets When calculating what can follow a particular non-terminal, we will need to @@ -1180,9 +1161,10 @@ need to be consider for completion again. So a `completed` flag is needed. For correct handling of `TK_newline` when parsing, we will need to know which states (itemsets) can occur at the start of a line, so we -will record a `starts_line` flag too. +will record a `starts_line` flag too whenever DOT is at the start of a +`line_like` symbol. -Finally, for handling `TK_out` we need to know where production in the +Finally, for handling `TK_out` we need to know whether productions in the current state started *before* the most recent indent. A state doesn't usually keep details of individual productions, so we need to add one extra detail. `min_prefix` is the smallest non-zero number of @@ -1301,7 +1283,7 @@ be supplemented by the LA set for the item which produce the new item. We also collect a set of all symbols which follow "DOT" (in `done`) as this is used in the next stage. -If any of these symbols are flagged as starting a line, then this +If any of these symbols are flagged as `line_like`, then this state must be a `starts_line` state so now is a good time to record that. When itemsets are created we assign a precedence to the itemset from @@ -1532,7 +1514,6 @@ changeover point in `first_nonterm`. g->symtab[s->num] = s; set_nullable(g); - set_can_eol(g); set_line_like(g); if (type >= SLR) build_first(g); @@ -1583,9 +1564,8 @@ show if it can end in a newline (`>`), if it is considered to be if (!s) continue; - printf(" %c%c%c%3d%c: ", + printf(" %c%c%3d%c: ", s->nullable ? '.':' ', - s->can_eol ? '>':' ', s->line_like ? '<':' ', s->num, symtypes[s->type]); prtxt(s->name); @@ -1796,6 +1776,15 @@ terminals to items where that terminal could be shifted and another which maps terminals to items that could be reduced when the terminal is in look-ahead. We report when we get conflicts between the two. +As a special case, if we find a SHIFT/REDUCE conflict, where a +terminal that could be shifted is in the lookahead set of some +reducable item, then set check if the reducable item also have +`TK_newline` in its lookahead set. If it does, then a newline will +force and reduction, but anything else can reasonably be shifts, so +that isn't really a conflict. Such apparent conflicts do not get +reported. This will not affect a "tradtional" grammar that does not +include newlines as token. + static int conflicts_slr(struct grammar *g, enum grammar_type type) { int i; @@ -1823,7 +1812,7 @@ is in look-ahead. We report when we get conflicts between the two. symset_add(&shifts, sym, itm); } } - /* Now look for reduction and conflicts */ + /* Now look for reductions and conflicts */ for (j = 0; j < is->items.cnt; j++) { unsigned short itm = is->items.syms[j]; int p = item_prod(itm); @@ -1841,7 +1830,7 @@ is in look-ahead. We report when we get conflicts between the two. int k; for (k = 0; k < la.cnt; k++) { int pos = symset_find(&shifts, la.syms[k]); - if (pos >= 0) { + if (pos >= 0 && symset_find(&la, TK_newline) < 0) { printf(" State %d has SHIFT/REDUCE conflict on ", i); prtxt(g->symtab[la.syms[k]]->name); printf(":\n"); @@ -2713,9 +2702,32 @@ the most recent start of line. This is how a newline forcible terminates any line-like structure - we try to reduce down to at most one symbol for each line where newlines are allowed. +When, during error handling, we discard token read in, we want to keep +discarding until we see one that is recognised. If we had a full set +of LR(1) grammar states, this will mean looking in the look-ahead set, +but we don't keep a full look-ahead set. We only record the subset +that leads to SHIFT. We can, however, deduce the look-ahead set but +looking at the SHIFT subsets for all states that we can get to by +reducing zero or more times. So we need a little function which +checks if a given token is in any of these look-ahead sets. + ###### parser includes #include "parser.h" + ###### parser_run + + static int in_lookahead(struct token *tk, const struct state *states, int state) + { + while (state >= 0) { + if (search(&states[state], tk->num) >= 0) + return 1; + if (states[state].reduce_prod < 0) + return 0; + state = search(&states[state], states[state].reduce_sym); + } + return 0; + } + void *parser_run(struct token_state *tokens, const struct state states[], int (*do_reduce)(int, void**, struct token_config*, void*), @@ -2852,7 +2864,7 @@ one symbol for each line where newlines are allowed. break; } tos = &p.stack[p.tos-1]; - while (search(&states[tos->state], tk->num) < 0 && + while (!in_lookahead(tk, states, tos->state) && tk->num != TK_eof) { free(tk); tk = tok_copy(token_next(tokens)); @@ -2865,11 +2877,7 @@ one symbol for each line where newlines are allowed. // FIXME update since_indent here } } - if (p.tos == 0 && tk->num == TK_eof) - break; - tos = &p.stack[p.tos-1]; tos->indents += indents; - exit(1); } free(tk); pop(&p, p.tos, NULL, do_free); @@ -2981,6 +2989,9 @@ an error. ./parsergen --tag calc -o calc parsergen.mdc calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp + calctest : calc + ./calc parsergen.mdc + tests :: calctest # calc: header @@ -3001,6 +3012,7 @@ an error. #include #include #include + #include #include "mdcode.h" #include "scanner.h" #include "number.h" @@ -3014,12 +3026,19 @@ an error. free(n); } + static int text_is(struct text t, char *s) + { + return (strlen(s) == t.len && + strncmp(s, t.txt, t.len) == 0); + } + int main(int argc, char *argv[]) { int fd = open(argv[1], O_RDONLY); int len = lseek(fd, 0, 2); char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); - struct section *s = code_extract(file, file+len, NULL); + struct section *table = code_extract(file, file+len, NULL); + struct section *s; struct token_config config = { .ignored = (1 << TK_line_comment) | (1 << TK_block_comment) @@ -3029,12 +3048,14 @@ an error. .word_start = "", .word_cont = "", }; - parse_calc(s->code, &config, argc > 2 ? stderr : NULL); - while (s) { - struct section *t = s->next; - code_free(s->code); - free(s); - s = t; + for (s = table; s; s = s->next) + if (text_is(s->section, "example: input")) + parse_calc(s->code, &config, argc > 2 ? stderr : NULL); + while (table) { + struct section *t = table->next; + code_free(table->code); + free(table); + table = t; } exit(0); } @@ -3072,3 +3093,14 @@ an error. | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ + +# example: input + + 355/113 + 3.1415926535 - 355/113 + 2 + 4 * 5 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 * 9 / 2 + 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1 + + error