X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=d2ff89844d2e616e0f10e470c0a7dbffde9492a7;hp=72425feb660956c4c94db76f74ed373fc8850429;hb=5281589bf4d9a66ce106b160e1272f1c51d7ac15;hpb=3e0341dcfa9cf6d757bb3e8a4a0cebb9b74bc59e diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 72425fe..d2ff898 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -21,7 +21,6 @@ There are several distinct sections. `parsergen` program built from the C code in this file can extract that grammar directly from this file and process it. - ###### File: parsergen.c #include #include @@ -63,7 +62,8 @@ If the `--tag` option is given, then any top level header that doesn't start with the tag is ignored, and the tag is striped from the rest. So `--tag Foo` means that the three needed sections must be `Foo: header`, `Foo: code`, -and `Foo: grammar`. +and `Foo: grammar`. The tag `calc` is used to extract the same calculator +from this file. [mdcode]: mdcode.html [scanner]: scanner.html @@ -350,6 +350,7 @@ production inherits from the last symbol which has a precedence. s->precedence = g->prec_levels; s->assoc = assoc; found += 1; + t = token_next(ts); } if (found == 0) err = "No symbols given on precedence line"; @@ -437,7 +438,7 @@ the end. return code; } -Now we have all the bit we need to parse a full production. +Now we have all the bits we need to parse a full production. ###### includes #include @@ -451,6 +452,7 @@ Now we have all the bit we need to parse a full production. struct symbol **body; int body_size; struct text code; + int code_line; ###### symbol fields int first_production; @@ -500,6 +502,7 @@ Now we have all the bit we need to parse a full production. tk = token_next(state); } if (tk.num == TK_open) { + p.code_line = tk.line; p.code = collect_code(state, tk); if (p.code.txt == NULL) { err = "code fragment not closed properly"; @@ -552,7 +555,11 @@ where `START` is the first non-terminal given. p->head->first_production = g->production_count; array_add(&g->productions, &g->production_count, p); -Now we are ready to read in the grammar. +Now we are ready to read in the grammar. We ignore comments +and strings so that the marks which introduce them can be +used as terminals in the grammar. We don't ignore numbers +even though we don't allow them as that causes the scanner +to produce errors that the parser is better positioned to handle. ###### grammar_read static struct grammar *grammar_read(struct code_node *code) @@ -725,10 +732,10 @@ or `-1` to indicate failure. } We will often want to form the union of two symsets. When we do, we -will often want to know if anything changed (as they might mean there +will often want to know if anything changed (as that might mean there is more work to do). So `symset_union` reports whether anything was added to the first set. We use a slow quadratic approach as these -sets don't really get very big. If profiles shows this to be a problem is +sets don't really get very big. If profiles shows this to be a problem it can be optimised later. static int symset_union(struct symset *a, struct symset *b) @@ -831,7 +838,6 @@ array like the productions. return sl->ss; } - ### Setting `nullable` We set `nullable` on the head symbol for any production for which all @@ -869,7 +875,7 @@ changes happen. } } -### Setting `can_eol` and `line_like` +### Setting `line_like` In order to be able to ignore newline tokens when not relevant, but still include them in the parse when needed, we will need to know @@ -877,30 +883,25 @@ which states can start a "line-like" section of code. We ignore newlines when there is an indent since the most recent start of a line-like symbol. -To know which symbols are line-like, we first need to know which -symbols start with a NEWLINE token. Any symbol which is followed by a -NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol. -Certainly when trying to parse one of these we must take not of NEWLINEs. +A "line_like" symbol is simply any symbol that can derive a NEWLINE. +If a symbol cannot derive a NEWLINE, then it is only part of a line - +so is word-like. If it can derive a NEWLINE, then we consider it to +be like a line. -Clearly the `TK_newline` token can start with a NEWLINE. Any symbol -which is the head of a production that contains a starts-with-NEWLINE -symbol preceeded only by nullable symbols is also a -starts-with-NEWLINE symbol. We use a new field `can_eol` to record -this attribute of symbols, and compute it in a repetitive manner -similar to `set_nullable`. - -Once we have that, we can determine which symbols are `line_like` be -seeing which are followed by a `can_eol` symbol in any production. +Clearly the `TK_newline` token can derive a NEWLINE. Any symbol which +is the head of a production that contains a line_like symbol is also a +line-like symbol. We use a new field `line_like` to record this +attribute of symbols, and compute it in a repetitive manner similar to +`set_nullable`. ###### symbol fields - int can_eol; int line_like; ###### functions - static void set_can_eol(struct grammar *g) + static void set_line_like(struct grammar *g) { int check_again = 1; - g->symtab[TK_newline]->can_eol = 1; + g->symtab[TK_newline]->line_like = 1; while (check_again) { int p; check_again = 0; @@ -908,35 +909,20 @@ seeing which are followed by a `can_eol` symbol in any production. struct production *pr = g->productions[p]; int s; - if (pr->head->can_eol) + if (pr->head->line_like) continue; for (s = 0 ; s < pr->body_size; s++) { - if (pr->body[s]->can_eol) { - pr->head->can_eol = 1; + if (pr->body[s]->line_like) { + pr->head->line_like = 1; check_again = 1; break; } - if (!pr->body[s]->nullable) - break; } } } } - static void set_line_like(struct grammar *g) - { - int p; - for (p = 0; p < g->production_count; p++) { - struct production *pr = g->productions[p]; - int s; - - for (s = 1; s < pr->body_size; s++) - if (pr->body[s]->can_eol) - pr->body[s-1]->line_like = 1; - } - } - ### Building the `first` sets When calculating what can follow a particular non-terminal, we will need to @@ -950,7 +936,7 @@ we repeat the calculations until no changes happen, just like with `set_nullable`. This makes use of the fact that `symset_union` reports if any change happens. -The core of this which finds the "first" of part of a production body +The core of this, which finds the "first" of part of a production body, will be reused for computing the "follow" sets, so we split it out into a separate function. @@ -1166,6 +1152,26 @@ can just compare the symset and the data values together. a.data[i] - b.data[i]; } +It will be helpful to know if an itemset has been "completed" or not, +particularly for LALR where itemsets get merged, at which point they +need to be consider for completion again. So a `completed` flag is needed. + +For correct handling of `TK_newline` when parsing, we will need to +know which states (itemsets) can occur at the start of a line, so we +will record a `starts_line` flag too whenever DOT is at the start of a +`line_like` symbol. + +Finally, for handling `TK_out` we need to know whether productions in the +current state started *before* the most recent indent. A state +doesn't usually keep details of individual productions, so we need to +add one extra detail. `min_prefix` is the smallest non-zero number of +symbols *before* DOT in any production in an itemset. This will allow +us to determine if the the most recent indent is sufficiently recent +to cancel it against a `TK_out`. If it was seen longer ago than the +`min_prefix`, and if the current state cannot be reduced, then the +indented section must have ended in the middle of a syntactic unit, so +an error must be signaled. + And now we can build the list of itemsets. The lookup routine returns both a success flag and a pointer to where in the list an insert should happen, so we don't need to search a second time. @@ -1176,6 +1182,8 @@ should happen, so we don't need to search a second time. short state; struct symset items; struct symset go_to; + enum assoc assoc; + unsigned short precedence; char completed; char starts_line; int min_prefix; @@ -1216,6 +1224,7 @@ recalculated and their LA sets updated. them to a data structure, of freeing them. static int add_itemset(struct grammar *g, struct symset ss, + enum assoc assoc, unsigned short precedence, enum grammar_type type) { struct itemset **where, *is; @@ -1226,6 +1235,8 @@ them to a data structure, of freeing them. is->state = g->states; g->states += 1; is->items = ss; + is->assoc = assoc; + is->precedence = precedence; is->next = *where; is->go_to = INIT_DATASET; *where = is; @@ -1269,11 +1280,19 @@ be supplemented by the LA set for the item which produce the new item. We also collect a set of all symbols which follow "DOT" (in `done`) as this is used in the next stage. -If any of these symbols are flagged as starting a line, then this +If any of these symbols are flagged as `line_like`, then this state must be a `starts_line` state so now is a good time to record that. -NOTE: precedence handling should happen here - I haven't written this yet -though. +When itemsets are created we assign a precedence to the itemset from +the complete item, if there is one. We ignore the possibility of +there being two and don't (currently) handle precedence in such +grammars. When completing a grammar we ignore any item where DOT is +followed by a terminal with a precedence lower (numerically higher) +than that for the itemset. Unless the terminal has right +associativity, we also ignore items where the terminal has the same +precedence. The result is that unwanted items are still in the +itemset, but the terminal doesn't get into the go to set, so the item +is ineffective. ###### complete itemset for (i = 0; i < is->items.cnt; i++) { @@ -1291,6 +1310,18 @@ though. if (bs == pr->body_size) continue; s = pr->body[bs]; + if (s->precedence && is->precedence && + is->precedence < s->precedence) + /* This terminal has a low precedence and + * shouldn't be shifted + */ + continue; + if (s->precedence && is->precedence && + is->precedence == s->precedence && s->assoc != Right) + /* This terminal has a matching precedence and is + * not Right-associative, so we mustn't shift it. + */ + continue; if (symset_find(&done, s->num) < 0) { symset_add(&done, s->num, 0); if (s->line_like) @@ -1350,6 +1381,8 @@ with a pre-existing itemset). int j; unsigned short state; struct symbol *sym = g->symtab[done.syms[i]]; + enum assoc assoc = Non; + unsigned short precedence = 0; struct symset newitemset = INIT_SYMSET; if (type >= LALR) newitemset = INIT_DATASET; @@ -1369,6 +1402,14 @@ with a pre-existing itemset). if (type >= LALR) la = is->items.data[j]; pos = symset_find(&newitemset, pr->head->num); + if (bp + 1 == pr->body_size && + pr->precedence > 0 && + (precedence == 0 || + pr->precedence < precedence)) { + // new itemset is reducible and has a precedence. + precedence = pr->precedence; + assoc = pr->assoc; + } if (pos < 0) symset_add(&newitemset, item_num(p, bp+1), la); else if (type >= LALR) { @@ -1386,12 +1427,12 @@ with a pre-existing itemset). } } } - state = add_itemset(g, newitemset, type); + state = add_itemset(g, newitemset, assoc, precedence, type); if (symset_find(&is->go_to, done.syms[i]) < 0) symset_add(&is->go_to, done.syms[i], state); } -All that is left is to crate the initial itemset from production zero, and +All that is left is to create the initial itemset from production zero, and with `TK_eof` as the LA set. ###### functions @@ -1410,7 +1451,7 @@ with `TK_eof` as the LA set. } // production 0, offset 0 (with no data) symset_add(&first, item_num(0, 0), la); - add_itemset(g, first, type); + add_itemset(g, first, Non, 0, type); for (again = 0, is = g->items; is; is = is->next ?: again ? (again = 0, g->items) : NULL) { @@ -1470,7 +1511,6 @@ changeover point in `first_nonterm`. g->symtab[s->num] = s; set_nullable(g); - set_can_eol(g); set_line_like(g); if (type >= SLR) build_first(g); @@ -1503,8 +1543,8 @@ all the tables that have been generated, plus a description of any conflicts. Firstly we have the complete list of symbols, together with the "FIRST" set if that was generated. We add a mark to each symbol to -show if it can end in a newline (`>`), if it implies the start of a -line (`<`), or if it is nullable (`.`). +show if it can end in a newline (`>`), if it is considered to be +"line-like" (`<`), or if it is nullable (`.`). ###### functions @@ -1521,9 +1561,8 @@ line (`<`), or if it is nullable (`.`). if (!s) continue; - printf(" %c%c%c%3d%c: ", + printf(" %c%c%3d%c: ", s->nullable ? '.':' ', - s->can_eol ? '>':' ', s->line_like ? '<':' ', s->num, symtypes[s->type]); prtxt(s->name); @@ -1586,9 +1625,15 @@ it up a bit. First the items, with production number and associativity. if (dot == pr->body_size) printf(" ."); printf(" [%d]", p); - if (pr->precedence) + if (pr->precedence && dot == pr->body_size) printf(" (%d%s)", pr->precedence, assoc_names[pr->assoc]); + if (dot < pr->body_size && + pr->body[dot]->precedence) { + struct symbol *s = pr->body[dot]; + printf(" [%d%s]", s->precedence, + assoc_names[s->assoc]); + } printf("\n"); } @@ -1611,7 +1656,6 @@ The LA sets which are (possibly) reported with each item: Then the go to sets: - static void report_goto(struct grammar *g, struct symset gt) { int i; @@ -1633,8 +1677,11 @@ Now we can report all the item sets complete with items, LA sets, and GO TO. for (s = 0; s < g->states; s++) { int j; struct itemset *is = g->statetab[s]; - printf(" Itemset %d:%s min prefix=%d\n", + printf(" Itemset %d:%s min prefix=%d", s, is->starts_line?" (startsline)":"", is->min_prefix); + if (is->precedence) + printf(" %d%s", is->precedence, assoc_names[is->assoc]); + printf("\n"); for (j = 0; j < is->items.cnt; j++) { report_item(g, is->items.syms[j]); if (is->items.data != NO_DATA) @@ -1670,7 +1717,7 @@ LR1 are also similar as they have FOLLOW or LA sets. LR0 conflicts are any state which have both a reducible item and a shiftable item, or two reducible items. -LR05 conflicts only occurs if two possibly reductions exist, +LR05 conflicts only occur if two possible reductions exist, as shifts always over-ride reductions. ###### conflict functions @@ -1725,6 +1772,15 @@ terminals to items where that terminal could be shifted and another which maps terminals to items that could be reduced when the terminal is in look-ahead. We report when we get conflicts between the two. +As a special case, if we find a SHIFT/REDUCE conflict, where a +terminal that could be shifted is in the lookahead set of some +reducable item, then set check if the reducable item also have +`TK_newline` in its lookahead set. If it does, then a newline will +force the reduction, but anything else can reasonably be shifted, so +that isn't really a conflict. Such apparent conflicts do not get +counted, and are reported as non-critical. This will not affect a +"traditional" grammar that does not include newlines as token. + static int conflicts_slr(struct grammar *g, enum grammar_type type) { int i; @@ -1752,7 +1808,7 @@ is in look-ahead. We report when we get conflicts between the two. symset_add(&shifts, sym, itm); } } - /* Now look for reduction and conflicts */ + /* Now look for reductions and conflicts */ for (j = 0; j < is->items.cnt; j++) { unsigned short itm = is->items.syms[j]; int p = item_prod(itm); @@ -1771,12 +1827,15 @@ is in look-ahead. We report when we get conflicts between the two. for (k = 0; k < la.cnt; k++) { int pos = symset_find(&shifts, la.syms[k]); if (pos >= 0) { - printf(" State %d has SHIFT/REDUCE conflict on ", i); + if (symset_find(&la, TK_newline) < 0) { + printf(" State %d has SHIFT/REDUCE conflict on ", i); + cnt++; + } else + printf(" State %d has non-critical SHIFT/REDUCE conflict on ", i); prtxt(g->symtab[la.syms[k]]->name); printf(":\n"); report_item(g, shifts.data[pos]); report_item(g, itm); - cnt++; } pos = symset_find(&reduce, la.syms[k]); if (pos < 0) { @@ -1797,7 +1856,6 @@ is in look-ahead. We report when we get conflicts between the two. return cnt; } - ## Generating the parser The exported part of the parser is the `parse_XX` function, where the name @@ -1814,13 +1872,14 @@ pieces of code provided in the grammar file, so they are generated first. ###### parser_generate - static void gen_parser(FILE *f, struct grammar *g, char *file, char *name) + static void gen_parser(FILE *f, struct grammar *g, char *file, char *name, + struct code_node *pre_reduce) { gen_known(f, g); gen_non_term(f, g); gen_goto(f, g); gen_states(f, g); - gen_reduce(f, g, file); + gen_reduce(f, g, file, pre_reduce); gen_free(f, g); fprintf(f, "#line 0 \"gen_parser\"\n"); @@ -1841,7 +1900,9 @@ pieces of code provided in the grammar file, so they are generated first. ### Known words table The known words table is simply an array of terminal symbols. -The table of nonterminals used for tracing is a similar array. +The table of nonterminals used for tracing is a similar array. We +include virtual symbols in the table of non_terminals to keep the +numbers right. ###### functions @@ -1866,7 +1927,7 @@ The table of nonterminals used for tracing is a similar array. for (i = TK_reserved; i < g->num_syms; i++) - if (g->symtab[i]->type == Nonterminal) + if (g->symtab[i]->type != Terminal) fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, g->symtab[i]->name.txt); fprintf(f, "};\n\n"); @@ -1896,12 +1957,10 @@ The go to table is stored in a simple array of `sym` and corresponding short reduce_prod; short reduce_size; short reduce_sym; - short shift_sym; short starts_line; short min_prefix; }; - ###### functions static void gen_goto(FILE *f, struct grammar *g) @@ -1930,23 +1989,15 @@ The go to table is stored in a simple array of `sym` and corresponding for (i = 0; i < g->states; i++) { struct itemset *is = g->statetab[i]; int j, prod = -1, prod_len; - int shift_sym = -1; - int shift_len = 0, shift_remain = 0; + for (j = 0; j < is->items.cnt; j++) { int itm = is->items.syms[j]; int p = item_prod(itm); int bp = item_index(itm); struct production *pr = g->productions[p]; - if (bp < pr->body_size) { - if (shift_sym < 0 || - (shift_len == bp && shift_remain > pr->body_size - bp)) { - shift_sym = pr->body[bp]->num; - shift_len = bp; - shift_remain = pr->body_size - bp; - } + if (bp < pr->body_size) continue; - } /* This is what we reduce */ if (prod < 0 || prod_len < pr->body_size) { prod = p; @@ -1955,14 +2006,14 @@ The go to table is stored in a simple array of `sym` and corresponding } if (prod >= 0) - fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n", + fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n", i, is->go_to.cnt, i, prod, g->productions[prod]->body_size, g->productions[prod]->head->num, is->starts_line, is->min_prefix); else - fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n", - i, is->go_to.cnt, i, shift_sym, + fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n", + i, is->go_to.cnt, i, is->starts_line, is->min_prefix); } fprintf(f, "};\n\n"); @@ -1989,7 +2040,7 @@ can access the larger structure using pointer manipulation. The code fragment requires translation when written out. Any `$N` needs to be converted to a reference either to that buffer (if `$0`) or to the structure returned by a previous reduction. These pointers need to be cast -to the appropriate type for each access. All this is handling in +to the appropriate type for each access. All this is handled in `gen_code`. `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'. @@ -2064,21 +2115,27 @@ automatically freed. This is equivalent to assigning `NULL` to the pointer. ###### functions - static void gen_reduce(FILE *f, struct grammar *g, char *file) + static void gen_reduce(FILE *f, struct grammar *g, char *file, + struct code_node *code) { int i; - fprintf(f, "#line 0 \"gen_reduce\"\n"); + fprintf(f, "#line 1 \"gen_reduce\"\n"); fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n"); fprintf(f, "{\n"); fprintf(f, "\tint ret_size = 0;\n"); + if (code) + code_node_print(f, code, file); + fprintf(f, "#line 4 \"gen_reduce\"\n"); fprintf(f, "\tswitch(prod) {\n"); for (i = 0; i < g->production_count; i++) { struct production *p = g->productions[i]; fprintf(f, "\tcase %d:\n", i); - if (p->code.txt) + if (p->code.txt) { + fprintf(f, "#line %d \"%s\"\n", p->code_line, file); gen_code(p, f, g); + } if (p->head->struct_name.txt) fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n", @@ -2103,7 +2160,7 @@ recovery where individual stack frames might need to be freed. For this, the grammar author is required to defined a `free_XX` function for each structure that is used by a non-terminal. `do_free` will call whichever is appropriate given a symbol number, and will call `free` (as is -appropriate for tokens on any terminal symbol. +appropriate for tokens) on any terminal symbol. ###### functions @@ -2228,7 +2285,7 @@ grammar file). To be able to run `mdcode` and `scanner` on the grammar we need to memory map it. -One we have extracted the code (with `mdcode`) we expect to find three +Once we have extracted the code (with `mdcode`) we expect to find three sections: header, code, and grammar. Anything else that is not excluded by the `--tag` option is an error. @@ -2266,6 +2323,7 @@ parser with neither. "grammar" must be provided. struct code_node *hdr = NULL; struct code_node *code = NULL; struct code_node *gram = NULL; + struct code_node *pre_reduce = NULL; for (s = table; s; s = s->next) { struct text sec = s->section; if (tag && !strip_tag(&sec, tag)) @@ -2276,6 +2334,8 @@ parser with neither. "grammar" must be provided. code = s->code; else if (text_is(sec, "grammar")) gram = s->code; + else if (text_is(sec, "reduce")) + pre_reduce = s->code; else { fprintf(stderr, "Unknown content section: %.*s\n", s->section.len, s->section.txt); @@ -2322,7 +2382,7 @@ the report finds conflicts we will exit with an error status. rv |= 1; } -If a headers section is defined, we write it out and include a declaration +If a "headers" section is defined, we write it out and include a declaration for the `parse_XX` function so it can be used from separate code. if (rv == 0 && hdr && outfile) { @@ -2347,7 +2407,7 @@ file with the code section (if any) and the parser tables and function. if (f) { if (code) code_node_print(f, code, infile); - gen_parser(f, g, infile, name); + gen_parser(f, g, infile, name, pre_reduce); fclose(f); } else { fprintf(stderr, "Cannot create %s.c\n", @@ -2437,20 +2497,23 @@ The `state` is the most important one and guides the parsing process. The freeing function. The symbol leads us to the right free function through `do_free`. -The `indents` count tracks the line indents in the symbol. These are -used to allow indent information to guide parsing and error recovery. +The `indents` count tracks the line indents with-in the symbol or +immediately follow it. These are used to allow indent information to +guide parsing and error recovery. `since_newline` tracks how many stack frames since the last start-of-line (whether indented or not). So if `since_newline` is -zero, then this symbol is at the start of a line. +zero, then this symbol is at the start of a line. Similarly +`since_indent` counts the number of states since an indent, it is zero +precisely when `indents` is not zero. `newline_permitted` keeps track of whether newlines should be ignored -or not, and `starts_line` records if this state stated on a newline. +or not. The stack is most properly seen as alternating states and symbols - states, like the 'DOT' in items, are between symbols. Each frame in our stack holds a state and the symbol that was before it. The -bottom of stack holds the start state, but no symbol, as nothing came +bottom of stack holds the start state but no symbol, as nothing came before the beginning. ###### parser functions @@ -2474,12 +2537,15 @@ before the beginning. Two operations are needed on the stack - shift (which is like push) and pop. -Shift applies not only to terminals but also to non-terminals. When we -reduce a production we will pop off entries corresponding to the body -symbols, then push on an item for the head of the production. This last is -exactly the same process as shifting in a terminal so we use the same -function for both. In both cases we provide a stack frame which -contains the symbol to shift and related indent information. +Shift applies not only to terminals but also to non-terminals. When +we reduce a production we will pop off entries corresponding to the +body symbols, then push on an item for the head of the production. +This last is exactly the same process as shifting in a terminal so we +use the same function for both. In both cases we provide the symbol, +the number of indents the symbol contains (which will be zero for a +terminal symbol) and a flag indicating the the symbol was at (or was +reduced from a symbol which was at) the start of a line. The state is +deduced from the current top-of-stack state and the new symbol. To simplify other code we arrange for `shift` to fail if there is no `goto` state for the symbol. This is useful in basic parsing due to our design @@ -2489,17 +2555,13 @@ function reports if it could. `shift` is also used to push state zero onto the stack, so if the stack is empty, it always chooses zero as the next state. -So `shift` finds the next state. If that succeed it extends the allocations -if needed and pushes all the information onto the stacks. - -Newlines are permitted after a starts_line state until an internal -indent. So we need to find the topmost state which `starts_line` and -see if there are any indents other than immediately after it. +So `shift` finds the next state. If that succeeds it extends the +allocations if needed and pushes all the information onto the stacks. -So we walk down: - -- if state starts_line, then newlines_permitted. -- if any non-initial indents, newlines not permitted +Newlines are permitted after a `starts_line` state until an internal +indent. If the new frame has neither a `starts_line` state nor an +indent, newlines are permitted if the previous stack frame permitted +them. ###### parser functions @@ -2557,11 +2619,9 @@ So we walk down: `pop` primarily moves the top of stack (`tos`) back down the required amount and frees any `asn` entries that need to be freed. It also -collects a summary of the indents in the symbols that are being -removed. It is called _after_ we reduce a production, just before we -`shift` the nonterminal in. - -`pop` is only called if there are entries to remove, so `num` is never zero. +collects a summary of the indents and line starts in the symbols that +are being removed. It is called _after_ we reduce a production, just +before we `shift` the nonterminal in. ###### parser functions @@ -2575,7 +2635,7 @@ removed. It is called _after_ we reduce a production, just before we p->tos -= num; for (i = 0; i < num; i++) { - sol |= !p->stack[p->tos+1].since_newline; + sol |= !p->stack[p->tos+i].since_newline; indents += p->stack[p->tos+i].indents; do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]); @@ -2616,9 +2676,9 @@ copying, hence `memdup` and `tokcopy`. ### The heart of the parser. -Now we have the parser. If we can shift, we do, though newlines and -reducing indenting may block that. If not and we can reduce we do. -If the production we reduced was production zero, then we have +Now we have the parser. If we can shift we do, though newlines and +reducing indenting may block that. If not and we can reduce we do +that. If the production we reduced was production zero, then we have accepted the input and can finish. We return whatever `asn` was returned by reducing production zero. @@ -2631,20 +2691,70 @@ When we find `TK_in` and `TK_out` tokens which report indents we need to handle them directly as the grammar cannot express what we want to do with them. -`TK_in` tokens are easy: we simply update the `next` stack frame to -record how many indents there are and that the next token started with -an indent. - -`TK_out` tokens must either be counted off against any pending indent, -or must force reductions until there is a pending indent which isn't -at the start of a production. - -`TK_newline` tokens are ignored precisely if there has been an indent -since the last state which could have been at the start of a line. +`TK_in` tokens are easy: we simply update indent count in the top stack frame to +record how many indents there are following the previous token. + +`TK_out` tokens must be canceled against an indent count +within the stack. If we can reduce some symbols that are all since +the most recent indent, then we do that first. If the minimum prefix +of the current state then extends back before the most recent indent, +that indent can be cancelled. If the minimum prefix is shorter then +the indent had ended prematurely and we must start error handling, which +is still a work-in-progress. + +`TK_newline` tokens are ignored unless the top stack frame records +that they are permitted. In that case they will not be considered for +shifting if it is possible to reduce some symbols that are all since +the most recent start of line. This is how a newline forcibly +terminates any line-like structure - we try to reduce down to at most +one symbol for each line where newlines are allowed. +A consequence of this is that a rule like + +###### Example: newlines - broken + + Newlines -> + | NEWLINE Newlines + IfStatement -> Newlines if .... + +cannot work, as the NEWLINE will never be shifted as the empty string +will be reduced first. Optional sets of newlines need to be include +in the thing that preceed: + +###### Example: newlines - works + + If -> if + | NEWLINE If + IfStatement -> If .... + +Here the NEWLINE will be shifted because nothing can be reduced until +the `if` is seen. + +When, during error handling, we discard token read in, we want to keep +discarding until we see one that is recognised. If we had a full set +of LR(1) grammar states, this will mean looking in the look-ahead set, +but we don't keep a full look-ahead set. We only record the subset +that leads to SHIFT. We can, however, deduce the look-ahead set but +looking at the SHIFT subsets for all states that we can get to by +reducing zero or more times. So we need a little function which +checks if a given token is in any of these look-ahead sets. ###### parser includes #include "parser.h" + ###### parser_run + + static int in_lookahead(struct token *tk, const struct state *states, int state) + { + while (state >= 0) { + if (search(&states[state], tk->num) >= 0) + return 1; + if (states[state].reduce_prod < 0) + return 0; + state = search(&states[state], states[state].reduce_sym); + } + return 0; + } + void *parser_run(struct token_state *tokens, const struct state states[], int (*do_reduce)(int, void**, struct token_config*, void*), @@ -2710,8 +2820,8 @@ since the last state which could have been at the start of a line. parser_trace_action(trace, "Cancel"); continue; } - // fall through and force a REDUCE (as 'shift' - // will fail). + // fall through to error handling as both SHIFT and REDUCE + // will fail. } if (tk->num == TK_newline) { if (!tos->newline_permitted) { @@ -2745,13 +2855,8 @@ since the last state which could have been at the start of a line. bufsize = do_reduce(prod, body, config, buf); - if (size) - indents = pop(&p, size, &start_of_line, - do_free); - else { - indents = 0; - start_of_line = 0; - } + indents = pop(&p, size, &start_of_line, + do_free); res = memdup(buf, bufsize); memset(buf, 0, bufsize); if (!shift(&p, nextstate->reduce_sym, @@ -2764,16 +2869,6 @@ since the last state which could have been at the start of a line. parser_trace_action(trace, "Reduce"); continue; } - if (tk->num == TK_out) { - // Indent problem - synthesise tokens to get us - // out of here. - fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym); - shift(&p, states[tos->state].shift_sym, - 0, 1, tok_copy(*tk), states); - // FIXME need to report this error somehow - parser_trace_action(trace, "Synthesize"); - continue; - } /* Error. We walk up the stack until we * find a state which will accept TK_error. * We then shift in TK_error and see what state @@ -2785,9 +2880,9 @@ since the last state which could have been at the start of a line. short indents = 0, start_of_line; err_tk = tok_copy(*tk); - while (shift(&p, TK_error, 0, 0, - err_tk, states) == 0 - && p.tos > 0) + while (p.tos > 0 && + shift(&p, TK_error, 0, 0, + err_tk, states) == 0) // discard this state indents += pop(&p, 1, &start_of_line, do_free); if (p.tos == 0) { @@ -2796,7 +2891,7 @@ since the last state which could have been at the start of a line. break; } tos = &p.stack[p.tos-1]; - while (search(&states[tos->state], tk->num) < 0 && + while (!in_lookahead(tk, states, tos->state) && tk->num != TK_eof) { free(tk); tk = tok_copy(token_next(tokens)); @@ -2809,14 +2904,10 @@ since the last state which could have been at the start of a line. // FIXME update since_indent here } } - if (p.tos == 0 && tk->num == TK_eof) - break; - tos = &p.stack[p.tos-1]; tos->indents += indents; } free(tk); - if (p.tos) - pop(&p, p.tos, NULL, do_free); + pop(&p, p.tos, NULL, do_free); free(p.asn_stack); free(p.stack); return ret; @@ -2916,7 +3007,7 @@ The obvious example for a parser is a calculator. As `scanner` provides number parsing function using `libgmp` is it not much work to perform arbitrary rational number calculations. -This calculator takes one expression, or an equality test per line. The +This calculator takes one expression, or an equality test, per line. The results are printed and if any equality test fails, the program exits with an error. @@ -2925,6 +3016,9 @@ an error. ./parsergen --tag calc -o calc parsergen.mdc calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp + calctest : calc + ./calc parsergen.mdc + demos :: calctest # calc: header @@ -2945,6 +3039,7 @@ an error. #include #include #include + #include #include "mdcode.h" #include "scanner.h" #include "number.h" @@ -2958,12 +3053,19 @@ an error. free(n); } + static int text_is(struct text t, char *s) + { + return (strlen(s) == t.len && + strncmp(s, t.txt, t.len) == 0); + } + int main(int argc, char *argv[]) { int fd = open(argv[1], O_RDONLY); int len = lseek(fd, 0, 2); char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); - struct section *s = code_extract(file, file+len, NULL); + struct section *table = code_extract(file, file+len, NULL); + struct section *s; struct token_config config = { .ignored = (1 << TK_line_comment) | (1 << TK_block_comment) @@ -2973,18 +3075,23 @@ an error. .word_start = "", .word_cont = "", }; - parse_calc(s->code, &config, argc > 2 ? stderr : NULL); - while (s) { - struct section *t = s->next; - code_free(s->code); - free(s); - s = t; + for (s = table; s; s = s->next) + if (text_is(s->section, "example: input")) + parse_calc(s->code, &config, argc > 2 ? stderr : NULL); + while (table) { + struct section *t = table->next; + code_free(table->code); + free(table); + table = t; } exit(0); } # calc: grammar + $LEFT * / + $LEFT + - + Session -> Session Line | Line @@ -3007,13 +3114,20 @@ an error. | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$ $number - Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$ - | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ - | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ + Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$ + | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ + | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ + | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ + | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ + | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ - Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ - | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ - | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ +# example: input - Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ - | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ + 355/113 + 3.1415926535 - 355/113 + 2 + 4 * 5 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 * 9 / 2 + 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1 + + error