X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=d9467442eeab1dfec4720f7a2b4f5c6666d43867;hp=19703bce82000b2ce642a29e9e6ad5c15bd91d3a;hb=10db06aed6af588a0ccd05e80a0f50286949d56c;hpb=b5039c5ba828498822903abe4b68cde2a8124aa5 diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc index 19703bc..d946744 100644 --- a/csrc/parsergen.mdc +++ b/csrc/parsergen.mdc @@ -1,9 +1,14 @@ -# LR(1) Parser Generator # +# LR(1) Parser Generator with 2D support # This parser generator takes a grammar description combined with code fragments, analyses it, and can report details about the analysis and write out C code files which can be compiled to make a parser. +"2D support" means that indentation and line breaks can be significant. +Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can +be used to describe the grammar and the parser can selectively ignore +these where they aren't relevant. + There are several distinct sections. 1. `grammar_read` will read a grammar from a literate-code file and @@ -21,7 +26,6 @@ There are several distinct sections. `parsergen` program built from the C code in this file can extract that grammar directly from this file and process it. - ###### File: parsergen.c #include #include @@ -53,17 +57,20 @@ There are several distinct sections. ## Reading the grammar -The grammar must be stored in a markdown literate code file as parsed -by "[mdcode][]". It must have three top level (i.e. unreferenced) -sections: `header`, `code`, and `grammar`. The first two will be -literally copied into the generated `.c` and `.h`. files. The last -contains the grammar. This is tokenised with "[scanner][]". +The grammar must be stored in a markdown literate code file as parsed by +"[mdcode][]". It may have up to four top level (i.e. unreferenced) +sections: `header`, `code`, `grammar` and `reduce`. The first two, if +present, will be literally copied into the generated `.c` and `.h` +files. The third is required and contains the grammar which is +tokenised with "[scanner][]". `reduce` can provide variable +declarations and common intialization code for all the reduction code +fragments in the grammar. If the `--tag` option is given, then any top level header that doesn't start with the tag is ignored, and the tag is striped from the rest. So -`--tag Foo` -means that the three needed sections must be `Foo: header`, `Foo: code`, -and `Foo: grammar`. +`--tag Foo` means that the four sections expected are `Foo: header`, +`Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used +to extract the sample calculator from this file. [mdcode]: mdcode.html [scanner]: scanner.html @@ -77,10 +84,11 @@ and `Foo: grammar`. #include "scanner.h" The grammar contains production sets, precedence/associativity -declarations, and data type declarations. These are all parsed with -_ad hoc_ parsing as we don't have a parser generator yet. +declarations, general terminal declarations, and data type declarations. +These are all parsed with _ad hoc_ parsing as we don't have a parser +generator yet. -The precedence and associativity can be set for each production, but +The precedence and associativity can be set for each production, and can be inherited from symbols. The data type (either structure or a reference to a structure) is potentially defined for each non-terminal and describes what C structure is used to store information about each @@ -106,11 +114,11 @@ symbol. ## grammar fields }; -The strings reported by `mdcode` and `scanner` are `struct text` which have -length rather than being null terminated. To help with printing and +The strings reported by `mdcode` and `scanner` are `struct text` which +have length rather than being nul terminated. To help with printing and comparing we define `text_is` and `prtxt`, which should possibly go in -`mdcode`. `scanner` does provide `text_dump` which is useful for strings -which might contain control characters. +`mdcode`. `scanner` does provide `text_dump` which is useful for +strings which might contain control characters. `strip_tag` is a bit like `strncmp`, but adds a test for a colon, because that is what we need to detect tags. @@ -144,38 +152,53 @@ because that is what we need to detect tags. Productions are comprised primarily of symbols - terminal and non-terminal. We do not make a syntactic distinction between the two, -though non-terminals must be identifiers. Non-terminal symbols are -those which appear in the head of a production, terminal symbols are -those which don't. There are also "virtual" symbols used for precedence -marking discussed later, and sometimes we won't know what type a symbol -is yet. +though non-terminals must be identifiers while terminals can also be +marks. Non-terminal symbols are those which appear in the head of a +production, terminal symbols are those which don't. There are also +"virtual" symbols used for precedence marking discussed later, and +sometimes we won't know what type a symbol is yet. + +To help with code safety it is possible to declare the terminal symbols. +If this is done, then any symbol used in a production that does not +appear in the head of any production and is not declared as a terminal +is treated as an error. ###### forward declarations enum symtype { Unknown, Virtual, Terminal, Nonterminal }; char *symtypes = "UVTN"; ###### symbol fields enum symtype type; +###### grammar fields + int terminals_declared; Symbols can be either `TK_ident` or `TK_mark`. They are saved in a table of known symbols and the resulting parser will report them as `TK_reserved + N`. A small set of identifiers are reserved for the -different token types that `scanner` can report. +different token types that `scanner` can report, and an even smaller set +are reserved for a special token that the parser can generate (`EOL`) as +will be described later. This latter set cannot use predefined numbers, +so they are marked as `isspecial` for now and will get assigned a number +with the non-terminals later. ###### declarations - static char *reserved_words[] = { - [TK_error] = "ERROR", - [TK_number] = "NUMBER", - [TK_ident] = "IDENTIFIER", - [TK_mark] = "MARK", - [TK_string] = "STRING", - [TK_multi_string] = "MULTI_STRING", - [TK_in] = "IN", - [TK_out] = "OUT", - [TK_newline] = "NEWLINE", - [TK_eof] = "$eof", + + static struct { int num; char *name; } reserved_words[] = { + { TK_error, "ERROR" }, + { TK_number, "NUMBER" }, + { TK_ident, "IDENTIFIER" }, + { TK_mark, "MARK" }, + { TK_string, "STRING" }, + { TK_multi_string, "MULTI_STRING" }, + { TK_in, "IN" }, + { TK_out, "OUT" }, + { TK_newline, "NEWLINE" }, + { TK_eof, "$eof" }, + { -1, "EOL" }, }; + ###### symbol fields short num; + unsigned int isspecial:1; Note that `TK_eof` and the two `TK_*_comment` tokens cannot be recognised. The former is automatically expected at the end of the text @@ -228,21 +251,21 @@ symbol, but its type might be `Unknown`. for (i = 0; i < entries; i++) { struct text t; struct symbol *s; - t.txt = reserved_words[i]; - if (!t.txt) - continue; + t.txt = reserved_words[i].name; t.len = strlen(t.txt); s = sym_find(g, t); s->type = Terminal; - s->num = i; + s->num = reserved_words[i].num; + s->isspecial = 1; } } ### Data types and precedence. -Data type specification and precedence specification are both -introduced by a dollar sign at the start of the line. If the next -word is `LEFT`, `RIGHT` or `NON`, then the line specifies a +Data type specification, precedence specification, and declaration of +terminals are all introduced by a dollar sign at the start of the line. +If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a +precedence, if it is `TERM` then the line declares terminals without precedence, otherwise it specifies a data type. The data type name is simply stored and applied to the head of all @@ -255,20 +278,20 @@ Any productions given before the first data type declaration will have no data type associated with them and can carry no information. In order to allow other non-terminals to have no type, the data type `$void` can be given. This does *not* mean that `struct void` will be -used, but rather than no type will be associated with future +used, but rather than no type will be associated with subsequent non-terminals. -The precedence line must contain a list of symbols - typically -terminal symbols, but not necessarily. It can only contain symbols -that have not been seen yet, so precedence declaration must precede -the productions that they affect. +The precedence line must contain a list of symbols, either terminal +symbols or virtual symbols. It can only contain symbols that have not +been seen yet, so precedence declaration must precede the productions +that they affect. -A precedence line may also contain a "virtual" symbol which is an -identifier preceded by `$$`. Virtual terminals carry precedence -information but are not included in the grammar. A production can -declare that it inherits the precedence of a given virtual symbol. +A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols +carry precedence information but are not included in the grammar. A +production can declare that it inherits the precedence of a given +virtual symbol. -This use for `$$` precludes it from being used as a symbol in the +This use of `$$` precludes it from being used as a symbol in the described language. Two other symbols: `${` and `}$` are also unavailable. @@ -277,6 +300,14 @@ declares how it associates. This level is stored in each symbol listed and may be inherited by any production which uses the symbol. A production inherits from the last symbol which has a precedence. +The symbols on the first precedence line have the lowest precedence. +Subsequent lines introduce symbols with higher precedence and so bind +more tightly. + +Note that a declaration line (or "dollar line") can start with either of +two different marks: "$" or "$*". The `dollar_line()` function defined +here is told which was found in the `isref` argument. + ###### grammar fields struct text current_type; int type_isref; @@ -292,6 +323,7 @@ production inherits from the last symbol which has a precedence. struct token t = token_next(ts); char *err; enum assoc assoc; + int term = 0; int found; if (t.num != TK_ident) { @@ -304,7 +336,10 @@ production inherits from the last symbol which has a precedence. assoc = Right; else if (text_is(t.txt, "NON")) assoc = Non; - else { + else if (text_is(t.txt, "TERM")) { + term = 1; + g->terminals_declared = 1; + } else { g->current_type = t.txt; g->type_isref = isref; if (text_is(t.txt, "void")) @@ -322,7 +357,7 @@ production inherits from the last symbol which has a precedence. goto abort; } - // This is a precedence line, need some symbols. + // This is a precedence or TERM line, need some symbols. found = 0; g->prec_levels += 1; t = token_next(ts); @@ -336,6 +371,10 @@ production inherits from the last symbol which has a precedence. err = "$$ must be followed by a word"; goto abort; } + if (term) { + err = "Virtual symbols not permitted on $TERM line"; + goto abort; + } } else if (t.num != TK_ident && t.num != TK_mark) { err = "Illegal token in precedence line"; @@ -343,17 +382,21 @@ production inherits from the last symbol which has a precedence. } s = sym_find(g, t.txt); if (s->type != Unknown) { - err = "Symbols in precedence line must not already be known."; + err = "Symbols in precedence/TERM line must not already be known."; goto abort; } s->type = type; - s->precedence = g->prec_levels; - s->assoc = assoc; + if (!term) { + s->precedence = g->prec_levels; + s->assoc = assoc; + } found += 1; + t = token_next(ts); } - if (found == 0) - err = "No symbols given on precedence line"; + if (found == 0) { + err = "No symbols given on precedence/TERM line"; goto abort; + } return NULL; abort: while (t.num != TK_newline && t.num != TK_eof) @@ -386,14 +429,15 @@ be in one `code_node` of the literate code. The `}$` must be at the end of a line. Text in the code fragment will undergo substitutions where `$N` or -`$ @@ -451,6 +495,7 @@ Now we have all the bit we need to parse a full production. struct symbol **body; int body_size; struct text code; + int code_line; ###### symbol fields int first_production; @@ -470,8 +515,6 @@ Now we have all the bit we need to parse a full production. tk = token_next(state); while (tk.num == TK_ident || tk.num == TK_mark) { struct symbol *bs = sym_find(g, tk.txt); - if (bs->type == Unknown) - bs->type = Terminal; if (bs->type == Virtual) { err = "Virtual symbol not permitted in production"; goto abort; @@ -491,15 +534,17 @@ Now we have all the bit we need to parse a full production. goto abort; } vs = sym_find(g, tk.txt); - if (vs->type != Virtual) { - err = "symbol after $$ must be virtual"; + if (vs->precedence == 0) { + err = "symbol after $$ must have precedence"; goto abort; + } else { + p.precedence = vs->precedence; + p.assoc = vs->assoc; } - p.precedence = vs->precedence; - p.assoc = vs->assoc; tk = token_next(state); } if (tk.num == TK_open) { + p.code_line = tk.line; p.code = collect_code(state, tk); if (p.code.txt == NULL) { err = "code fragment not closed properly"; @@ -507,6 +552,7 @@ Now we have all the bit we need to parse a full production. } tk = token_next(state); } + if (tk.num != TK_newline && tk.num != TK_eof) { err = "stray tokens at end of line"; goto abort; @@ -518,18 +564,19 @@ Now we have all the bit we need to parse a full production. abort: while (tk.num != TK_newline && tk.num != TK_eof) tk = token_next(state); + free(p.body); return err; } -With the ability to parse production and dollar-lines, we have nearly all +With the ability to parse productions and dollar-lines, we have nearly all that we need to parse a grammar from a `code_node`. -The head of the first production will effectively be the `start` symbol of -the grammar. However it won't _actually_ be so. Processing the grammar is -greatly simplified if the real start symbol only has a single production, -and expects `$eof` as the final terminal. So when we find the first -explicit production we insert an extra production as production zero which -looks like +The head of the first production will effectively be the `start` symbol +of the grammar. However it won't _actually_ be so. Processing the +grammar is greatly simplified if the real start symbol only has a single +production, and expects `$eof` as the final terminal. So when we find +the first explicit production we insert an extra implicit production as +production zero which looks like ###### Example: production 0 $start -> START $eof @@ -552,7 +599,16 @@ where `START` is the first non-terminal given. p->head->first_production = g->production_count; array_add(&g->productions, &g->production_count, p); -Now we are ready to read in the grammar. +Now we are ready to read in the grammar. We ignore comments +and strings so that the marks which introduce them can be +used as terminals in the grammar. We don't ignore numbers +even though we don't allow them as that causes the scanner +to produce errors that the parser is better positioned to handle. +We also ignore indentation, but we expect and use newlines. + +To allow comments in the grammar, we explicitly check for "//" as the +first token on the line and those lines are skipped. "//" can still be +used as a terminal anywhere that a terminal is expected. ###### grammar_read static struct grammar *grammar_read(struct code_node *code) @@ -620,6 +676,11 @@ Now we are ready to read in the grammar. } else if (tk.num == TK_mark && text_is(tk.txt, "$*")) { err = dollar_line(state, g, 1); + } else if (tk.num == TK_mark + && text_is(tk.txt, "//")) { + while (tk.num != TK_newline && + tk.num != TK_eof) + tk = token_next(state); } else { err = "Unrecognised token at start of line."; } @@ -627,30 +688,46 @@ Now we are ready to read in the grammar. goto abort; } token_close(state); + + struct symbol *s; + for (s = g->syms; s; s = s->next) { + if (s->type != Unknown) + continue; + if (!g->terminals_declared) { + s->type = Terminal; + continue; + } + err = "not declared"; + fprintf(stderr, "Token %.*s not declared\n", + s->name.len, s->name.txt); + } + if (err) { + free(g); // FIXME free content + g = NULL; + } return g; abort: fprintf(stderr, "Error at line %d: %s\n", tk.line, err); token_close(state); - free(g); + free(g); // FIXME free content return NULL; } ## Analysing the grammar The central task in analysing the grammar is to determine a set of -states to drive the parsing process. This will require finding -various sets of symbols and of "items". Some of these sets will need -to attach information to each element in the set, so they are more -like maps. - -Each "item" may have a 'look-ahead' or `LA` set associated with -it. Many of these will be the same as each other. To avoid memory -wastage, and to simplify some comparisons of sets, these sets will be -stored in a list of unique sets, each assigned a number. - -Once we have the data structures in hand to manage these sets and -lists, we can start setting the 'nullable' flag, build the 'FIRST' +states to drive the parsing process. This will require finding various +sets of symbols and of "items". Some of these sets will need to attach +information to each element in the set, so they are more like maps. + +Each "item" may have a 'look-ahead' or `LA` set associated with it. +Many of these will be the same as each other. To avoid memory wastage, +and to simplify some comparisons of sets, these sets will be stored in a +list of unique sets, each assigned a number. + +Once we have the data structures in hand to manage these sets and lists, +we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW' sets, and then create the item sets which define the various states. ### Symbol sets. @@ -659,8 +736,8 @@ Though we don't only store symbols in these sets, they are the main things we store, so they are called symbol sets or "symsets". A symset has a size, an array of shorts, and an optional array of data -values, which are also shorts. If the array of data is not present, -we store an impossible pointer, as `NULL` is used to indicate that no +values, which are also shorts. If the array of data is not present, we +store an impossible pointer, as `NULL` is used to indicate that no memory has been allocated yet; ###### declarations @@ -672,10 +749,10 @@ memory has been allocated yet; const struct symset INIT_SYMSET = { 0, NULL, NO_DATA }; const struct symset INIT_DATASET = { 0, NULL, NULL }; -The arrays of shorts are allocated in blocks of 8 and are kept sorted -by using an insertion sort. We don't explicitly record the amount of -allocated space as it can be derived directly from the current `cnt` using -`((cnt - 1) | 7) + 1`. +The arrays of shorts are allocated in blocks of 8 and are kept sorted by +using an insertion sort. We don't explicitly record the amount of +allocated space as it can be derived directly from the current `cnt` +using `((cnt - 1) | 7) + 1`. ###### functions static void symset_add(struct symset *s, unsigned short key, unsigned short val) @@ -725,11 +802,11 @@ or `-1` to indicate failure. } We will often want to form the union of two symsets. When we do, we -will often want to know if anything changed (as they might mean there -is more work to do). So `symset_union` reports whether anything was -added to the first set. We use a slow quadratic approach as these -sets don't really get very big. If profiles shows this to be a problem is -can be optimised later. +will often want to know if anything changed (as that might mean there is +more work to do). So `symset_union` reports whether anything was added +to the first set. We use a slow quadratic approach as these sets are +rarely large. If profiling shows this to be a problem it can be +optimised later. static int symset_union(struct symset *a, struct symset *b) { @@ -760,9 +837,9 @@ And of course we must be able to free a symset. Some symsets are simply stored somewhere appropriate in the `grammar` data structure, others needs a bit of indirection. This applies particularly to "LA" sets. These will be paired with "items" in an -"itemset". As itemsets will be stored in a symset, the "LA" set needs to be -stored in the `data` field, so we need a mapping from a small (short) -number to an LA `symset`. +"itemset". As itemsets will be stored in a symset, the "LA" set needs +to be stored in the `data` field, so we need a mapping from a small +(short) number to an LA `symset`. As mentioned earlier this involves creating a list of unique symsets. @@ -819,9 +896,9 @@ would be more efficient and may be added later. return s->num; } -Finding a set by number is currently performed by a simple linear search. -If this turns out to hurt performance, we can store the sets in a dynamic -array like the productions. +Finding a set by number is currently performed by a simple linear +search. If this turns out to hurt performance, we can store the sets in +a dynamic array like the productions. static struct symset set_find(struct grammar *g, int num) { @@ -831,7 +908,6 @@ array like the productions. return sl->ss; } - ### Setting `nullable` We set `nullable` on the head symbol for any production for which all @@ -869,73 +945,24 @@ changes happen. } } -### Setting `can_eol` - -In order to be able to ignore newline tokens when not relevant, but -still include them in the parse when needed, we will need to know -which states can start a "line-like" section of code. We ignore -newlines when there is an indent since the most recent start of a -line-like section. - -To know what is line-like, we first need to know which symbols can end -a line-like section, which is precisely those which can end with a -newline token. These symbols don't necessarily alway end with a -newline, but they can. Hence they are not described as "lines" but -only "line-like". - -Clearly the `TK_newline` token can end with a newline. Any symbol -which is the head of a production that contains a line-ending symbol -followed only by nullable symbols is also a line-ending symbol. We -use a new field `can_eol` to record this attribute of symbols, and -compute it in a repetitive manner similar to `set_nullable`. - -###### symbol fields - int can_eol; - -###### functions - static void set_can_eol(struct grammar *g) - { - int check_again = 1; - g->symtab[TK_newline]->can_eol = 1; - while (check_again) { - int p; - check_again = 0; - for (p = 0; p < g->production_count; p++) { - struct production *pr = g->productions[p]; - int s; - - if (pr->head->can_eol) - continue; - - for (s = pr->body_size - 1; s >= 0; s--) { - if (pr->body[s]->can_eol) { - pr->head->can_eol = 1; - check_again = 1; - break; - } - if (!pr->body[s]->nullable) - break; - } - } - } - } - ### Building the `first` sets -When calculating what can follow a particular non-terminal, we will need to -know what the "first" terminal in any subsequent non-terminal might be. So -we calculate the `first` set for every non-terminal and store them in an -array. We don't bother recording the "first" set for terminals as they are -trivial. - -As the "first" for one symbol might depend on the "first" of another, -we repeat the calculations until no changes happen, just like with -`set_nullable`. This makes use of the fact that `symset_union` -reports if any change happens. - -The core of this which finds the "first" of part of a production body -will be reused for computing the "follow" sets, so we split it out -into a separate function. +When calculating what can follow a particular non-terminal, we will need +to know what the "first" terminal in any subsequent non-terminal might +be. So we calculate the `first` set for every non-terminal and store +them in an array. We don't bother recording the "first" set for +terminals as they are trivial. + +As the "first" for one symbol might depend on the "first" of another, we +repeat the calculations until no changes happen, just like with +`set_nullable`. This makes use of the fact that `symset_union` reports +if any change happens. + +The core of this, which finds the "first" of part of a production body, +will be reused for computing the "follow" sets, so we split that out +into a separate function, `add_first`, which also reports if it got all +the way to the end of the production without finding a non-nullable +symbol. ###### grammar fields struct symset *first; @@ -989,18 +1016,21 @@ into a separate function. ### Building the `follow` sets. -There are two different situations when we will want to generate "follow" -sets. If we are doing an SLR analysis, we want to generate a single -"follow" set for each non-terminal in the grammar. That is what is -happening here. If we are doing an LALR or LR analysis we will want -to generate a separate "LA" set for each item. We do that later -in state generation. +There are two different situations when we will want to generate +"follow" sets. If we are doing an SLR analysis, we want to generate a +single "follow" set for each non-terminal in the grammar. That is what +is happening here. If we are doing an LALR or LR analysis we will want +to generate a separate "LA" set for each item. We do that later in +state generation. There are two parts to generating a "follow" set. Firstly we look at -every place that any non-terminal appears in the body of any -production, and we find the set of possible "first" symbols after -there. This is done using `add_first` above and only needs to be done -once as the "first" sets are now stable and will not change. +every place that any non-terminal appears in the body of any production, +and we find the set of possible "first" symbols after there. This is +done using `add_first` above and only needs to be done once as the +"first" sets are now stable and will not change. + +###### grammar fields + struct symset *follow; ###### follow code @@ -1022,12 +1052,16 @@ other symbol which is followed only by `nullable` symbols. As this depends on "follow" itself we need to repeatedly perform the process until no further changes happen. +Rather than 2 nested loops, one that repeats the whole process until +there is no change, and one that iterates of the set of productions, we +combine these two functions into a single loop. + ###### follow code - for (again = 0, p = 0; + for (check_again = 0, p = 0; p < g->production_count; p < g->production_count-1 - ? p++ : again ? (again = 0, p = 0) + ? p++ : check_again ? (check_again = 0, p = 0) : p++) { struct production *pr = g->productions[p]; int b; @@ -1038,7 +1072,7 @@ until no further changes happen. break; if (symset_union(&g->follow[bs->num], &g->follow[pr->head->num])) - again = 1; + check_again = 1; if (!bs->nullable) break; } @@ -1047,13 +1081,10 @@ until no further changes happen. We now just need to create and initialise the `follow` list to get a complete function. -###### grammar fields - struct symset *follow; - ###### functions static void build_follow(struct grammar *g) { - int s, again, p; + int s, check_again, p; g->follow = calloc(g->num_syms, sizeof(g->follow[0])); for (s = 0; s < g->num_syms; s++) g->follow[s] = INIT_SYMSET; @@ -1065,11 +1096,13 @@ complete function. There are three different levels of detail that can be chosen for building the itemsets and states for the LR grammar. They are: -1. LR(0) or SLR(1), where no look-ahead is considered. +1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the + itemsets - look-ahead, if used at all, is simply the 'follow' sets + already computed, 2. LALR(1) where we build look-ahead sets with each item and merge the LA sets when we find two paths to the same "kernel" set of items. 3. LR(1) where different look-ahead for any item in the set means - a different state must be created. + a different item set must be created. ###### forward declarations enum grammar_type { LR0, LR05, SLR, LALR, LR1 }; @@ -1079,10 +1112,10 @@ generated state already exists. For now we use a simple sorted linked list. An item is a pair of numbers: the production number and the position of -"DOT", which is an index into the body of the production. -As the numbers are not enormous we can combine them into a single "short" -and store them in a `symset` - 4 bits for "DOT" and 12 bits for the -production number (so 4000 productions with maximum of 15 symbols in the +"DOT", which is an index into the body of the production. As the +numbers are not enormous we can combine them into a single "short" and +store them in a `symset` - 5 bits for "DOT" and 11 bits for the +production number (so 2000 productions with maximum of 31 symbols in the body). Comparing the itemsets will be a little different to comparing symsets @@ -1095,15 +1128,15 @@ the end of the list in the symset, and then only compare items before the first "0". ###### declarations - static inline unsigned short item_num(int production, int index) + static inline unsigned short item_num(int production, int dot) { - return production | ((31-index) << 11); + return production | ((31-dot) << 11); } static inline int item_prod(unsigned short item) { return item & 0x7ff; } - static inline int item_index(unsigned short item) + static inline int item_dot(unsigned short item) { return (31-(item >> 11)) & 0x1f; } @@ -1121,8 +1154,8 @@ can just compare the symset and the data values together. for (i = 0; i < a.cnt && i < b.cnt && - item_index(a.syms[i]) > 0 && - item_index(b.syms[i]) > 0; + item_dot(a.syms[i]) > 0 && + item_dot(b.syms[i]) > 0; i++) { int diff = a.syms[i] - b.syms[i]; if (diff) @@ -1133,11 +1166,11 @@ can just compare the symset and the data values together. return diff; } } - if (i == a.cnt || item_index(a.syms[i]) == 0) + if (i == a.cnt || item_dot(a.syms[i]) == 0) av = -1; else av = a.syms[i]; - if (i == b.cnt || item_index(b.syms[i]) == 0) + if (i == b.cnt || item_dot(b.syms[i]) == 0) bv = -1; else bv = b.syms[i]; @@ -1149,9 +1182,14 @@ can just compare the symset and the data values together. a.data[i] - b.data[i]; } +It will be helpful to know if an itemset has been "completed" or not, +particularly for LALR where itemsets get merged, at which point they +need to be consider for completion again. So a `completed` flag is +needed. + And now we can build the list of itemsets. The lookup routine returns -both a success flag and a pointer to where in the list an insert -should happen, so we don't need to search a second time. +both a success flag and a pointer to where in the list an insert should +happen, so we don't need to search a second time. ###### declarations struct itemset { @@ -1159,8 +1197,9 @@ should happen, so we don't need to search a second time. short state; struct symset items; struct symset go_to; + enum assoc assoc; + unsigned short precedence; char completed; - char starts_line; }; ###### grammar fields @@ -1190,15 +1229,16 @@ should happen, so we don't need to search a second time. } Adding an itemset may require merging the LA sets if LALR analysis is -happening. If any new LA set adds any symbols that weren't in the old LA set, we -clear the `completed` flag so that the dependants of this itemset will be -recalculated and their LA sets updated. +happening. If any new LA set adds any symbols that weren't in the old +LA set, we clear the `completed` flag so that the dependants of this +itemset will be recalculated and their LA sets updated. `add_itemset` must consume the symsets it is passed, either by adding them to a data structure, of freeing them. static int add_itemset(struct grammar *g, struct symset ss, - enum grammar_type type, int starts_line) + enum assoc assoc, unsigned short precedence, + enum grammar_type type) { struct itemset **where, *is; int i; @@ -1208,9 +1248,10 @@ them to a data structure, of freeing them. is->state = g->states; g->states += 1; is->items = ss; + is->assoc = assoc; + is->precedence = precedence; is->next = *where; is->go_to = INIT_DATASET; - is->starts_line = starts_line; *where = is; return is->state; } @@ -1238,28 +1279,39 @@ them to a data structure, of freeing them. #### The build -To build all the itemsets, we first insert the initial itemset made -from production zero, complete each itemset, and then generate new -itemsets from old until no new ones can be made. +To build all the itemsets, we first insert the initial itemset made from +production zero, complete each itemset, and then generate new itemsets +from old until no new ones can be made. -Completing an itemset means finding all the items where "DOT" is followed by -a nonterminal and adding "DOT=0" items for every production from that -non-terminal - providing each item hasn't already been added. +Completing an itemset means finding all the items where "DOT" is +followed by a nonterminal and adding "DOT=0" items for every production +from that non-terminal - providing each item hasn't already been added. +When we add such an item it might get inserted before the current +one, so to make sure we process it we reset the loop counter to the +start. If LA sets are needed, the LA set for each new item is found using -`add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may -be supplemented by the LA set for the item which produce the new item. - -We also collect a set of all symbols which follow "DOT" (in `done`) as this -is used in the next stage. - -NOTE: precedence handling should happen here - I haven't written this yet -though. +`add_first` which was developed earlier for `FIRST` and `FOLLOW`. This +may be supplemented by the LA set for the item which produced the new +item. + +We also collect a set of all symbols which follow "DOT" (in `done`) as +this is used in the next stage. + +When itemsets are created we assign a precedence to the itemset from the +complete item, if there is one. We ignore the possibility of there +being two and don't (currently) handle precedence in such grammars. +When completing a grammar we ignore any item where DOT is followed by a +terminal with a precedence lower than that for the itemset. Unless the +terminal has right associativity, we also ignore items where the +terminal has the same precedence. The result is that unwanted items are +still in the itemset, but the terminal doesn't get into the go to set, +so the item is ineffective. ###### complete itemset for (i = 0; i < is->items.cnt; i++) { int p = item_prod(is->items.syms[i]); - int bs = item_index(is->items.syms[i]); + int bs = item_dot(is->items.syms[i]); struct production *pr = g->productions[p]; int p2; struct symbol *s; @@ -1269,11 +1321,24 @@ though. if (bs == pr->body_size) continue; s = pr->body[bs]; + if (s->precedence && is->precedence && + is->precedence > s->precedence) + /* This terminal has a low precedence and + * shouldn't be shifted + */ + continue; + if (s->precedence && is->precedence && + is->precedence == s->precedence && s->assoc != Right) + /* This terminal has a matching precedence and is + * not Right-associative, so we mustn't shift it. + */ + continue; if (symset_find(&done, s->num) < 0) symset_add(&done, s->num, 0); + if (s->type != Nonterminal) continue; - again = 1; + check_again = 1; if (type >= LALR) { // Need the LA set. int to_end; @@ -1301,21 +1366,22 @@ though. } else if (type >= LALR) { struct symset ss = set_find(g, is->items.data[pos]); struct symset tmp = INIT_SYMSET; + struct symset *la = &LA; symset_union(&tmp, &ss); - if (symset_union(&tmp, &LA)) { + if (symset_union(&tmp, la)) { is->items.data[pos] = save_set(g, tmp); i = -1; - }else + } else symset_free(tmp); } } } -For each symbol we found (and placed in `done`) we collect all the items for -which this symbol is next, and create a new itemset with "DOT" advanced over -the symbol. This is then added to the collection of itemsets (or merged -with a pre-existing itemset). +For each symbol we found (and placed in `done`) we collect all the items +for which this symbol is next, and create a new itemset with "DOT" +advanced over the symbol. This is then added to the collection of +itemsets (or merged with a pre-existing itemset). ###### derive itemsets // Now we have a completed itemset, so we need to @@ -1324,19 +1390,17 @@ with a pre-existing itemset). for (i = 0; i < done.cnt; i++) { int j; unsigned short state; - int starts_line = 0; struct symbol *sym = g->symtab[done.syms[i]]; + enum assoc assoc = Non; + unsigned short precedence = 0; struct symset newitemset = INIT_SYMSET; if (type >= LALR) newitemset = INIT_DATASET; - if (sym->can_eol || - (sym->nullable && is->starts_line)) - starts_line = 1; for (j = 0; j < is->items.cnt; j++) { int itm = is->items.syms[j]; int p = item_prod(itm); - int bp = item_index(itm); + int bp = item_dot(itm); struct production *pr = g->productions[p]; unsigned short la = 0; int pos; @@ -1345,11 +1409,21 @@ with a pre-existing itemset). continue; if (pr->body[bp] != sym) continue; + + bp += 1; if (type >= LALR) la = is->items.data[j]; - pos = symset_find(&newitemset, pr->head->num); + if (bp == pr->body_size && + pr->precedence > 0 && + pr->precedence > precedence) { + // new itemset is reducible and has a precedence. + precedence = pr->precedence; + assoc = pr->assoc; + } + pos = symset_find(&newitemset, item_num(p, bp)); + if (pos < 0) - symset_add(&newitemset, item_num(p, bp+1), la); + symset_add(&newitemset, item_num(p, bp), la); else if (type >= LALR) { // Need to merge la set. int la2 = newitemset.data[pos]; @@ -1365,12 +1439,12 @@ with a pre-existing itemset). } } } - state = add_itemset(g, newitemset, type, starts_line); + state = add_itemset(g, newitemset, assoc, precedence, type); if (symset_find(&is->go_to, done.syms[i]) < 0) symset_add(&is->go_to, done.syms[i], state); } -All that is left is to crate the initial itemset from production zero, and +All that is left is to create the initial itemset from production zero, and with `TK_eof` as the LA set. ###### functions @@ -1378,7 +1452,7 @@ with `TK_eof` as the LA set. { struct symset first = INIT_SYMSET; struct itemset *is; - int again; + int check_again; unsigned short la = 0; if (type >= LALR) { // LA set just has eof @@ -1389,16 +1463,16 @@ with `TK_eof` as the LA set. } // production 0, offset 0 (with no data) symset_add(&first, item_num(0, 0), la); - add_itemset(g, first, type, g->productions[0]->body[0]->can_eol); - for (again = 0, is = g->items; + add_itemset(g, first, Non, 0, type); + for (check_again = 0, is = g->items; is; - is = is->next ?: again ? (again = 0, g->items) : NULL) { + is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) { int i; struct symset done = INIT_SYMSET; if (is->completed) continue; is->completed = 1; - again = 1; + check_again = 1; ## complete itemset ## derive itemsets symset_free(done); @@ -1416,9 +1490,11 @@ doesn't use them. They will be calculated to some extent if needed for a report. Once we have built everything we allocate arrays for the two lists: -symbols and itemsets. This allows more efficient access during reporting. -The symbols are grouped as terminals and non-terminals and we record the -changeover point in `first_nonterm`. +symbols and itemsets. This allows more efficient access during +reporting. The symbols are grouped as terminals, then non-terminals, +then virtual, with the start of non-terminals recorded as `first_nonterm`. +Special terminals -- meaning just EOL -- are included with the +non-terminals so that they are not expected by the scanner. ###### grammar fields struct symbol **symtab; @@ -1433,11 +1509,16 @@ changeover point in `first_nonterm`. struct itemset *is; int snum = TK_reserved; for (s = g->syms; s; s = s->next) - if (s->num < 0 && s->type == Terminal) { + if (s->num < 0 && s->type == Terminal && !s->isspecial) { s->num = snum; snum++; } g->first_nonterm = snum; + for (s = g->syms; s; s = s->next) + if (s->num < 0 && s->type != Virtual) { + s->num = snum; + snum++; + } for (s = g->syms; s; s = s->next) if (s->num < 0) { s->num = snum; @@ -1449,7 +1530,6 @@ changeover point in `first_nonterm`. g->symtab[s->num] = s; set_nullable(g); - set_can_eol(g); if (type >= SLR) build_first(g); @@ -1481,7 +1561,7 @@ all the tables that have been generated, plus a description of any conflicts. Firstly we have the complete list of symbols, together with the "FIRST" set if that was generated. We add a mark to each symbol to -show if it can end in a newline (`>`), or if it is nullable (`.`). +show if it is nullable (`.`). ###### functions @@ -1498,9 +1578,8 @@ show if it can end in a newline (`>`), or if it is nullable (`.`). if (!s) continue; - printf(" %c%c%3d%c: ", + printf(" %c%3d%c: ", s->nullable ? '.':' ', - s->can_eol ? '>':' ', s->num, symtypes[s->type]); prtxt(s->name); if (s->precedence) @@ -1548,7 +1627,7 @@ it up a bit. First the items, with production number and associativity. static void report_item(struct grammar *g, int itm) { int p = item_prod(itm); - int dot = item_index(itm); + int dot = item_dot(itm); struct production *pr = g->productions[p]; int i; @@ -1562,9 +1641,15 @@ it up a bit. First the items, with production number and associativity. if (dot == pr->body_size) printf(" ."); printf(" [%d]", p); - if (pr->precedence) + if (pr->precedence && dot == pr->body_size) printf(" (%d%s)", pr->precedence, assoc_names[pr->assoc]); + if (dot < pr->body_size && + pr->body[dot]->precedence) { + struct symbol *s = pr->body[dot]; + printf(" [%d%s]", s->precedence, + assoc_names[s->assoc]); + } printf("\n"); } @@ -1587,7 +1672,6 @@ The LA sets which are (possibly) reported with each item: Then the go to sets: - static void report_goto(struct grammar *g, struct symset gt) { int i; @@ -1609,7 +1693,10 @@ Now we can report all the item sets complete with items, LA sets, and GO TO. for (s = 0; s < g->states; s++) { int j; struct itemset *is = g->statetab[s]; - printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":""); + printf(" Itemset %d:", s); + if (is->precedence) + printf(" %d%s", is->precedence, assoc_names[is->assoc]); + printf("\n"); for (j = 0; j < is->items.cnt; j++) { report_item(g, is->items.syms[j]); if (is->items.data != NO_DATA) @@ -1622,7 +1709,7 @@ Now we can report all the item sets complete with items, LA sets, and GO TO. ### Reporting conflicts Conflict detection varies a lot among different analysis levels. However -LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and +LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and LR1 are also similar as they have FOLLOW or LA sets. ###### functions @@ -1645,7 +1732,7 @@ LR1 are also similar as they have FOLLOW or LA sets. LR0 conflicts are any state which have both a reducible item and a shiftable item, or two reducible items. -LR05 conflicts only occurs if two possibly reductions exist, +LR05 conflicts only occur if two possible reductions exist, as shifts always over-ride reductions. ###### conflict functions @@ -1664,7 +1751,7 @@ as shifts always over-ride reductions. for (j = 0; j < is->items.cnt; j++) { int itm = is->items.syms[j]; int p = item_prod(itm); - int bp = item_index(itm); + int bp = item_dot(itm); struct production *pr = g->productions[p]; if (bp == pr->body_size) { @@ -1716,22 +1803,28 @@ is in look-ahead. We report when we get conflicts between the two. for (j = 0; j < is->items.cnt; j++) { unsigned short itm = is->items.syms[j]; int p = item_prod(itm); - int bp = item_index(itm); + int bp = item_dot(itm); struct production *pr = g->productions[p]; + struct symbol *s; - if (bp < pr->body_size && - pr->body[bp]->type == Terminal) { - /* shiftable */ - int sym = pr->body[bp]->num; - if (symset_find(&shifts, sym) < 0) - symset_add(&shifts, sym, itm); - } + if (bp >= pr->body_size || + pr->body[bp]->type != Terminal) + /* not shiftable */ + continue; + + s = pr->body[bp]; + if (s->precedence && is->precedence) + /* Precedence resolves this, so no conflict */ + continue; + + if (symset_find(&shifts, s->num) < 0) + symset_add(&shifts, s->num, itm); } - /* Now look for reduction and conflicts */ + /* Now look for reductions and conflicts */ for (j = 0; j < is->items.cnt; j++) { unsigned short itm = is->items.syms[j]; int p = item_prod(itm); - int bp = item_index(itm); + int bp = item_dot(itm); struct production *pr = g->productions[p]; if (bp < pr->body_size) @@ -1747,11 +1840,11 @@ is in look-ahead. We report when we get conflicts between the two. int pos = symset_find(&shifts, la.syms[k]); if (pos >= 0) { printf(" State %d has SHIFT/REDUCE conflict on ", i); - prtxt(g->symtab[la.syms[k]]->name); + cnt++; + prtxt(g->symtab[la.syms[k]]->name); printf(":\n"); report_item(g, shifts.data[pos]); report_item(g, itm); - cnt++; } pos = symset_find(&reduce, la.syms[k]); if (pos < 0) { @@ -1783,19 +1876,22 @@ optional `FILE` to send tracing to. The `token_config` gets the list of known words added and then is used with the `code_node` to initialize the scanner. -`parse_XX` then calls the library function `parser_run` to actually complete -the parse. This needs the `states` table and function to call the various -pieces of code provided in the grammar file, so they are generated first. +`parse_XX` then calls the library function `parser_run` to actually +complete the parse. This needs the `states` table, the `reductions` +table and functions to call the various pieces of code provided in the +grammar file, so they are generated first. ###### parser_generate - static void gen_parser(FILE *f, struct grammar *g, char *file, char *name) + static void gen_parser(FILE *f, struct grammar *g, char *file, char *name, + struct code_node *pre_reduce) { gen_known(f, g); gen_non_term(f, g); gen_goto(f, g); gen_states(f, g); - gen_reduce(f, g, file); + gen_reductions(f, g); + gen_reduce(f, g, file, pre_reduce); gen_free(f, g); fprintf(f, "#line 0 \"gen_parser\"\n"); @@ -1805,9 +1901,8 @@ pieces of code provided in the grammar file, so they are generated first. fprintf(f, "\tstruct token_state *tokens;\n"); fprintf(f, "\tconfig->words_marks = known;\n"); fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n"); - fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n"); fprintf(f, "\ttokens = token_open(code, config);\n"); - fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n"); + fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n"); fprintf(f, "\ttoken_close(tokens);\n"); fprintf(f, "\treturn rv;\n"); fprintf(f, "}\n\n"); @@ -1841,20 +1936,23 @@ The table of nonterminals used for tracing is a similar array. for (i = TK_reserved; i < g->num_syms; i++) - if (g->symtab[i]->type == Nonterminal) + if (g->symtab[i]->type == Nonterminal || + g->symtab[i]->isspecial) fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, g->symtab[i]->name.txt); fprintf(f, "};\n\n"); } -### States and the goto tables. +### States, reductions, and the go to tables. -For each state we record the goto table, the reducible production if -there is one, or a symbol to shift for error recovery. -Some of the details of the reducible production are stored in the -`do_reduce` function to come later. Here we store the production number, -the body size (useful for stack management) and the resulting symbol (useful -for knowing how to free data later). +For each state we record the go to table and the reducible production if +there is one, the details of which are in a separate table of +reductions. Some of the details of the reducible production are stored +in the `do_reduce` function to come later. In the go to table we store +the production number and in the reductions table: the body size (useful +for stack management), the resulting symbol (useful for knowing how to +free data later), and the size of the resulting asn object (useful for +preallocation space. The go to table is stored in a simple array of `sym` and corresponding `state`. @@ -1865,17 +1963,17 @@ The go to table is stored in a simple array of `sym` and corresponding short sym; short state; }; + struct reduction { + short size; + short sym; + short result_size; + }; struct state { + short reduce_prod; short go_to_cnt; const struct lookup * go_to; - short reduce_prod; - short reduce_size; - short reduce_sym; - short shift_sym; - short starts_line; }; - ###### functions static void gen_goto(FILE *f, struct grammar *g) @@ -1883,10 +1981,13 @@ The go to table is stored in a simple array of `sym` and corresponding int i; fprintf(f, "#line 0 \"gen_goto\"\n"); for (i = 0; i < g->states; i++) { + struct symset gt = g->statetab[i]->go_to; int j; + + if (gt.cnt == 0) + continue; fprintf(f, "static const struct lookup goto_%d[] = {\n", i); - struct symset gt = g->statetab[i]->go_to; for (j = 0; j < gt.cnt; j++) fprintf(f, "\t{ %d, %d },\n", gt.syms[j], gt.data[j]); @@ -1894,7 +1995,25 @@ The go to table is stored in a simple array of `sym` and corresponding } } -###### functions + static void gen_reductions(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_reductions\"\n"); + fprintf(f, "static const struct reduction reductions[] = {\n"); + for (i = 0; i < g->production_count; i++) { + struct production *pr = g->productions[i]; + struct symbol *hd = pr->head; + fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num); + if (hd->struct_name.txt == NULL) + fprintf(f, "0 },\n"); + else + fprintf(f, "sizeof(struct %.*s%s) },\n", + hd->struct_name.len, + hd->struct_name.txt, + hd->isref ? "*" : ""); + } + fprintf(f, "};\n\n"); + } static void gen_states(FILE *f, struct grammar *g) { @@ -1904,75 +2023,155 @@ The go to table is stored in a simple array of `sym` and corresponding for (i = 0; i < g->states; i++) { struct itemset *is = g->statetab[i]; int j, prod = -1, prod_len; - int shift_sym = -1; - int shift_len = 0, shift_remain = 0; + for (j = 0; j < is->items.cnt; j++) { int itm = is->items.syms[j]; int p = item_prod(itm); - int bp = item_index(itm); + int bp = item_dot(itm); struct production *pr = g->productions[p]; - if (bp < pr->body_size) { - if (shift_sym < 0 || - (shift_len == bp && shift_remain > pr->body_size - bp)) { - shift_sym = pr->body[bp]->num; - shift_len = bp; - shift_remain = pr->body_size - bp; - } + if (bp < pr->body_size) continue; - } - /* This is what we reduce */ + /* This is what we reduce - choose longest */ if (prod < 0 || prod_len < pr->body_size) { prod = p; prod_len = pr->body_size; } } - - if (prod >= 0) - fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n", - i, is->go_to.cnt, i, prod, - g->productions[prod]->body_size, - g->productions[prod]->head->num, - is->starts_line); + if (is->go_to.cnt) + fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n", + i, prod, is->go_to.cnt, i); else - fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n", - i, is->go_to.cnt, i, shift_sym, - is->starts_line); + fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod); } fprintf(f, "};\n\n"); } ### The `do_reduce` function and the code -When the parser engine decides to reduce a production, it calls `do_reduce`. -This has two functions. +When the parser engine decides to reduce a production, it calls +`do_reduce` which runs the code that was included with the production, +if any. -Firstly, if a non-NULL `trace` file is passed, it prints out details of the -production being reduced. Secondly it runs the code that was included with -the production if any. - -This code needs to be able to store data somewhere. Rather than requiring -`do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have -`do_reduce` return the size to be saved. +This code needs to be able to store data somewhere so we record the size +of the data expected with each state so it can be allocated before +`do_reduce` is called. In order for the code to access "global" context, we pass in the -"config" pointer that was passed to parser function. If the `struct +"config" pointer that was passed to the parser function. If the `struct token_config` is embedded in some larger structure, the reducing code -can access the larger structure using pointer manipulation. - -The code fragment requires translation when written out. Any `$N` needs to -be converted to a reference either to that buffer (if `$0`) or to the -structure returned by a previous reduction. These pointers need to be cast -to the appropriate type for each access. All this is handling in +can access the larger structure using pointer manipulation. Performing +that pointer manipulation, and any other common code or variables for +all reduce actions, can be provided in code node called "reduce" which +is passed around in `pre_reduce` which you might have already noticed. + +The code fragments require translation when written out. Any `$N` needs +to be converted to a reference either to that buffer (if `$0`) or to the +structure returned by a previous reduction. These pointers need to be +cast to the appropriate type for each access. All this is handled in `gen_code`. -`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'. -This applied only to symbols with references (or pointers), not those with structures. -The `<` implies that the reference it being moved out, so the object will not be -automatically freed. This is equivalent to assigning `NULL` to the pointer. +`gen_code` also allows symbol references to contain a '`<`' as in +'`$<2`'. This is particularly useful for references (or pointers), but +can be used with structures too. The `<` implies that the value is +being moved out, so the object will not be automatically freed. It is +equivalent to assigning `NULL` to the pointer or filling a structure +with zeros. + +Instead of a number `N`, the `$` or `$<` can be followed by some letters +and possibly a number. A number by itself (other than zero) selects a +symbol from the body of the production. A sequence of letters selects +the shortest symbol in the body which contains those letters in the +given order. If a number follows the letters, then a later occurrence +of that symbol is chosen. So "`$AB2`" will refer to the structure +attached to the second occurrence of the shortest symbol which contains +an `A` followed by a `B`. If there is no unique shortest system, or if +the number given is too large, then the symbol reference is not +transformed, and will cause an error when the code is compiled. ###### functions + static int textchr(struct text t, char c, int s) + { + int i; + for (i = s; i < t.len; i++) + if (t.txt[i] == c) + return i; + return -1; + } + + static int subseq_match(char *seq, int slen, struct text name) + { + int st = 0; + while (slen > 0) { + st = textchr(name, *seq, st); + if (st < 0) + return 0; + slen -= 1; + seq += 1; + st += 1; + } + return 1; + } + + static int choose_sym(char **namep, int len, struct production *p) + { + char *name = *namep; + char *nam = name; + int namlen; + int n = 0; + int i, s, slen; + char c; + + c = *name; + while (len > 0 && + ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) { + name += 1; + len -= 1; + c = *name; + } + namlen = name-nam; + while (len > 0 && (c >= '0' && c <= '9')) { + name += 1; + len -= 1; + n = n * 10 + (c - '0'); + c = *name; + } + if (namlen == 0) { + if (name == *namep || n > p->body_size) + return -1; + *namep = name; + return n; + } + slen = 0; s = -1; + for (i = 0; i < p->body_size; i++) { + if (!subseq_match(nam, namlen, p->body[i]->name)) + continue; + if (slen == 0 || p->body[i]->name.len < slen) { + s = i; + slen = p->body[i]->name.len; + } + if (s >= 0 && p->body[i] != p->body[s] && + p->body[i]->name.len == p->body[s]->name.len) + /* not unique, so s cannot be used */ + s = -1; + } + if (s < 0) + return -1; + if (n == 0) + n = 1; + for (i = 0; i < p->body_size; i++) + if (p->body[i] == p->body[s]) { + n -= 1; + if (n == 0) + break; + } + if (n > 0 || i == p->body_size) + return -1; + *namep = name; + return i + 1; + } + static void gen_code(struct production *p, FILE *f, struct grammar *g) { char *c; @@ -1994,24 +2193,19 @@ automatically freed. This is equivalent to assigning `NULL` to the pointer. use = 1; c++; } - if (*c < '0' || *c > '9') { + n = choose_sym(&c, p->code.txt + p->code.len - c, p); + if (n < 0) { + fputc('$', f); if (use) fputc('<', f); fputc(*c, f); continue; } - n = *c - '0'; - while (c[1] >= '0' && c[1] <= '9') { - c += 1; - n = n * 10 + *c - '0'; - } if (n == 0) fprintf(f, "(*(struct %.*s*%s)ret)", p->head->struct_name.len, p->head->struct_name.txt, p->head->isref ? "*":""); - else if (n > p->body_size) - fprintf(f, "$%d", n); else if (p->body[n-1]->type == Terminal) fprintf(f, "(*(struct token *)body[%d])", n-1); @@ -2024,35 +2218,45 @@ automatically freed. This is equivalent to assigning `NULL` to the pointer. p->body[n-1]->isref ? "*":"", n-1); used[n-1] = use; } + c -= 1; } fputs("\n", f); for (i = 0; i < p->body_size; i++) { if (p->body[i]->struct_name.txt && - p->body[i]->isref && - used[i]) + used[i]) { // assume this has been copied out - fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i); + if (p->body[i]->isref) + fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i); + else + fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt); + } } free(used); } ###### functions - static void gen_reduce(FILE *f, struct grammar *g, char *file) + static void gen_reduce(FILE *f, struct grammar *g, char *file, + struct code_node *pre_reduce) { int i; - fprintf(f, "#line 0 \"gen_reduce\"\n"); + fprintf(f, "#line 1 \"gen_reduce\"\n"); fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n"); fprintf(f, "{\n"); fprintf(f, "\tint ret_size = 0;\n"); + if (pre_reduce) + code_node_print(f, pre_reduce, file); + fprintf(f, "#line 4 \"gen_reduce\"\n"); fprintf(f, "\tswitch(prod) {\n"); for (i = 0; i < g->production_count; i++) { struct production *p = g->productions[i]; fprintf(f, "\tcase %d:\n", i); - if (p->code.txt) + if (p->code.txt) { + fprintf(f, "#line %d \"%s\"\n", p->code_line, file); gen_code(p, f, g); + } if (p->head->struct_name.txt) fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n", @@ -2077,7 +2281,7 @@ recovery where individual stack frames might need to be freed. For this, the grammar author is required to defined a `free_XX` function for each structure that is used by a non-terminal. `do_free` will call whichever is appropriate given a symbol number, and will call `free` (as is -appropriate for tokens on any terminal symbol. +appropriate for tokens) on any terminal symbol. ###### functions @@ -2089,7 +2293,15 @@ appropriate for tokens on any terminal symbol. fprintf(f, "static void do_free(short sym, void *asn)\n"); fprintf(f, "{\n"); fprintf(f, "\tif (!asn) return;\n"); - fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm); + fprintf(f, "\tif (sym < %d", g->first_nonterm); + /* Need to handle special terminals too */ + for (i = 0; i < g->num_syms; i++) { + struct symbol *s = g->symtab[i]; + if (i >= g->first_nonterm && s->type == Terminal && + s->isspecial) + fprintf(f, " || sym == %d", s->num); + } + fprintf(f, ") {\n"); fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n"); fprintf(f, "\tswitch(sym) {\n"); @@ -2176,7 +2388,7 @@ grammar file). case 't': tag = optarg; break; default: - fprintf(stderr, "Usage: parsergen ...\n"); + fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n"); exit(1); } } @@ -2202,12 +2414,12 @@ grammar file). To be able to run `mdcode` and `scanner` on the grammar we need to memory map it. -One we have extracted the code (with `mdcode`) we expect to find three -sections: header, code, and grammar. Anything else that is not -excluded by the `--tag` option is an error. +Once we have extracted the code (with `mdcode`) we expect to find three +or four sections: header, code, grammar, reduce. +Anything else that is not excluded by the `--tag` option is an error. -"header" and "code" are optional, though it is hard to build a working -parser with neither. "grammar" must be provided. +"header", "code", and "reduce" are optional, though it is hard to build +a working parser without the first two. "grammar" must be provided. ###### includes #include @@ -2240,6 +2452,7 @@ parser with neither. "grammar" must be provided. struct code_node *hdr = NULL; struct code_node *code = NULL; struct code_node *gram = NULL; + struct code_node *pre_reduce = NULL; for (s = table; s; s = s->next) { struct text sec = s->section; if (tag && !strip_tag(&sec, tag)) @@ -2250,6 +2463,8 @@ parser with neither. "grammar" must be provided. code = s->code; else if (text_is(sec, "grammar")) gram = s->code; + else if (text_is(sec, "reduce")) + pre_reduce = s->code; else { fprintf(stderr, "Unknown content section: %.*s\n", s->section.len, s->section.txt); @@ -2296,7 +2511,7 @@ the report finds conflicts we will exit with an error status. rv |= 1; } -If a headers section is defined, we write it out and include a declaration +If a "headers" section is defined, we write it out and include a declaration for the `parse_XX` function so it can be used from separate code. if (rv == 0 && hdr && outfile) { @@ -2321,7 +2536,7 @@ file with the code section (if any) and the parser tables and function. if (f) { if (code) code_node_print(f, code, infile); - gen_parser(f, g, infile, name); + gen_parser(f, g, infile, name, pre_reduce); fclose(f); } else { fprintf(stderr, "Cannot create %s.c\n", @@ -2358,12 +2573,12 @@ recognised properly, and link with `libicuuc` as `libmdcode` requires that. ## The SHIFT/REDUCE parser -Having analysed the grammar and generated all the tables, we only need the -shift/reduce engine to bring it all together. +Having analysed the grammar and generated all the tables, we only need +the shift/reduce engine to bring it all together. -### Goto table lookup +### Go to table lookup -The parser generator has nicely provided us with goto tables sorted by +The parser generator has nicely provided us with go to tables sorted by symbol number. We need a binary search function to find a symbol in the table. @@ -2389,55 +2604,66 @@ table. return -1; } -### The state stack. +### Memory allocation -The core data structure for the parser is the stack. This tracks all the -symbols that have been recognised or partially recognised. +The `scanner` returns tokens in a local variable - we want them in allocated +memory so they can live in the `asn_stack`. So we provide `tok_copy` to +make an allocated copy as required. -The stack usually won't grow very large - maybe a few tens of entries. So -we dynamically resize and array as required but never bother to shrink it -down again. +###### parser includes + #include -We keep the stack as two separate allocations. One, `asn_stack` stores the -"abstract syntax nodes" which are created by each reduction. When we call -`do_reduce` we need to pass an array of the `asn`s of the body of the -production, and by keeping a separate `asn` stack, we can just pass a -pointer into this stack. +###### parser functions -The other allocation stores all other stack fields of which there are six. -The `state` is the most important one and guides the parsing process. The -`sym` is nearly unnecessary. However when we want to free entries from the -`asn_stack`, it helps to know what type they are so we can call the right -freeing function. The symbol leads us to the right free function through -`do_free`. + static struct token *tok_copy(struct token tk) + { + struct token *new = malloc(sizeof(*new)); + *new = tk; + return new; + } -The `indents` count and the `starts_indented` flag track the line -indents in the symbol. These are used to allow indent information to -guide parsing and error recovery. +### The state stack. -`newline_permitted` keeps track of whether newlines should be ignored -or not, and `starts_line` records if this state stated on a newline. +The core data structure for the parser is the stack. This tracks all +the symbols that have been recognised or partially recognised. + +The stack usually won't grow very large - maybe a few tens of entries. +So we dynamically grow an array as required but never bother to +shrink it down again. + +We keep the stack as two separate allocations. One, `asn_stack`, stores +the "abstract syntax nodes" which are created by each reduction. When +we call `do_reduce` we need to pass an array of the `asn`s of the body +of the production, and by keeping a separate `asn` stack, we can just +pass a pointer into this stack. + +The other allocation stores all other stack fields of which there are +two. The `state` is the most important one and guides the parsing +process. The `sym` is nearly unnecessary as it is implicit in the +state. However when we want to free entries from the `asn_stack`, it +helps to know what type they are so we can call the right freeing +function. The symbol leads us to the right free function through +`do_free`. -The stack is more properly seen as alternating states and symbols - +The stack is most properly seen as alternating states and symbols - states, like the 'DOT' in items, are between symbols. Each frame in our stack holds a state and the symbol that was before it. The -bottom of stack holds the start state, but no symbol, as nothing came -before the beginning. +bottom of stack holds the start state but no symbol, as nothing came +before the beginning. As we need to store some value, `TK_eof` is used +to mark the beginning of the file as well as the end. ###### parser functions struct parser { struct frame { short state; - short newline_permitted; - short sym; - short starts_indented; - short indents; } *stack; void **asn_stack; int stack_size; int tos; + + ## parser state }; #### Shift and pop @@ -2445,13 +2671,13 @@ before the beginning. Two operations are needed on the stack - shift (which is like push) and pop. Shift applies not only to terminals but also to non-terminals. When we -reduce a production we will pop off entries corresponding to the body -symbols, then push on an item for the head of the production. This last is -exactly the same process as shifting in a terminal so we use the same -function for both. In both cases we provide a stack frame which -contains the symbol to shift and related indent information. +reduce a production we will pop off frames corresponding to the body +symbols, then push on a frame for the head of the production. This last +is exactly the same process as shifting in a terminal so we use the same +function for both. In both cases we provide the symbol. The state is +deduced from the current top-of-stack state and the new symbol. -To simplify other code we arrange for `shift` to fail if there is no `goto` +To simplify other code we arrange for `shift` to fail if there is no `go to` state for the symbol. This is useful in basic parsing due to our design that we shift when we can, and reduce when we cannot. So the `shift` function reports if it could. @@ -2459,22 +2685,23 @@ function reports if it could. `shift` is also used to push state zero onto the stack, so if the stack is empty, it always chooses zero as the next state. -So `shift` finds the next state. If that succeed it extends the allocations -if needed and pushes all the information onto the stacks. +So `shift` finds the next state. If that succeeds it extends the +allocations if needed and pushes all the information onto the stacks. ###### parser functions - static int shift(struct parser *p, struct frame *next, - void *asn, + static int shift(struct parser *p, + short sym, void *asn, const struct state states[]) { - // Push an entry onto the stack + struct frame next = {0}; int newstate = p->tos ? search(&states[p->stack[p->tos-1].state], - next->sym) + sym) : 0; if (newstate < 0) return 0; + if (p->tos >= p->stack_size) { p->stack_size += 10; p->stack = realloc(p->stack, p->stack_size @@ -2482,247 +2709,238 @@ if needed and pushes all the information onto the stacks. p->asn_stack = realloc(p->asn_stack, p->stack_size * sizeof(p->asn_stack[0])); } - next->state = newstate; - next->newline_permitted = 0; - if (p->tos) - next->newline_permitted = - p->stack[p->tos-1].newline_permitted; - if (next->indents) - next->newline_permitted = 0; - if (states[newstate].starts_line) - next->newline_permitted = 1; - p->stack[p->tos] = *next; + next.sym = sym; + next.state = newstate; + + p->stack[p->tos] = next; p->asn_stack[p->tos] = asn; p->tos++; return 1; } `pop` primarily moves the top of stack (`tos`) back down the required -amount and frees any `asn` entries that need to be freed. It also -collects a summary of the indents in the symbols that are being -removed. It is called _after_ we reduce a production, just before we -`shift` the nonterminal in. - -`pop` is only called if there are entries to remove, so `num` is never zero. +amount and frees any `asn` entries that need to be freed. It is called +_after_ we reduce a production, just before we `shift` the nonterminal +in. ###### parser functions - static void pop(struct parser *p, int num, struct frame *next, + static void pop(struct parser *p, int num, void(*do_free)(short sym, void *asn)) { int i; + p->tos -= num; - next->starts_indented = p->stack[p->tos].starts_indented; - next->indents = 0; - for (i = 0; i < num; i++) { - next->indents += p->stack[p->tos+i].indents; - do_free(p->stack[p->tos+i].sym, - p->asn_stack[p->tos+i]); - } + for (i = 0; i < num; i++) + do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]); } -### Memory allocation +### The heart of the parser. -The `scanner` returns tokens in a local variable - we want them in allocated -memory so they can live in the `asn_stack`. Similarly the `asn` produced by -a reduce is in a large buffer. Both of these require some allocation and -copying, hence `memdup` and `tokcopy`. +Now we have the parser. For each token we might shift it, trigger a +reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL) +might also be ignored. Ignoring tokens is combined with shifting. -###### parser includes - #include +###### parser vars -###### parser functions + struct parser p = { 0 }; + struct token *tk = NULL; + int accepted = 0; - void *memdup(void *m, int len) - { - void *ret; - if (len == 0) - return NULL; - ret = malloc(len); - memcpy(ret, m, len); - return ret; +###### heart of parser + + shift(&p, TK_eof, NULL, states); + while (!accepted && p.tos > 0) { + struct frame *tos = &p.stack[p.tos-1]; + if (!tk) + tk = tok_copy(token_next(tokens)); + parser_trace(trace, &p, + tk, states, non_term, config->known_count); + + ## try shift or ignore + ## try reduce + ## handle error } - static struct token *tok_copy(struct token tk) - { - struct token *new = malloc(sizeof(*new)); - *new = tk; - return new; +Indents are ignored unless they can be shifted onto the stack +immediately or nothing can be shifted (in which case we reduce, and try +again). The end of an indented section - the OUT token - is ignored +precisely when the indent was ignored. To keep track of this we need a +small stack of flags, which is easily stored as bits in an `unsigned +long`. This will never overflow and the scanner only allows 20 levels +of indentation. + +###### parser state + unsigned long ignored_indents; + +NEWLINE is ignored when in an indented section of text which was not +explicitly expected by the grammar. So if the most recent indent is +ignored, so is any NEWLINE token. + +If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL +token instead. If that succeeds, we make a new copy of the NEWLINE +token and continue. This allows a NEWLINE to appear to be preceded by +an indefinite number of EOL tokens. + +The token number for `EOL` cannot be statically declared, so when the +parser starts we need to look through the array of non-terminals to find +the EOL. + +###### parser state + int tk_eol; + +###### find eol + p.tk_eol = 0; + while (strcmp(non_term[p.tk_eol], "EOL") != 0) + p.tk_eol += 1; + p.tk_eol += TK_reserved + config->known_count; + +For other tokens, we shift the next token if that is possible, otherwise +we try to reduce a production. + +###### try shift or ignore + + if ((tk->num == TK_newline || tk->num == TK_out) && + (p.ignored_indents & 1)) { + /* indented, so ignore OUT and NEWLINE */ + if (tk->num == TK_out) + p.ignored_indents >>= 1; + free(tk); + tk = NULL; + parser_trace_action(trace, "Ignore"); + continue; + } + + if (shift(&p, tk->num, tk, states)) { + if (tk->num == TK_out) + p.ignored_indents >>= 1; + if (tk->num == TK_in) + p.ignored_indents <<= 1; + + parser_trace_action(trace, "Shift"); + tk = NULL; + ## did shift + continue; + } else if (tk->num == TK_newline && + shift(&p, p.tk_eol, tk, states)) { + tk = tok_copy(*tk); + parser_trace_action(trace, "ShiftEOL"); + continue; + } + + if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) { + /* No indent expected here and reduce is not mandatory, so ignore IN */ + free(tk); + tk = NULL; + p.ignored_indents <<= 1; + p.ignored_indents |= 1; + parser_trace_action(trace, "Ignore"); + continue; } -### The heart of the parser. +We have already discussed the bulk of the handling of a "reduce" action, +with the `pop()` and `shift()` functions doing much of the work. There +is a little more complexity needed to manage storage for the asn (Abstract +Syntax Node), and also a test of whether the reduction is permitted. + +When we try to shift the result of reducing production-zero, it will +fail because there is no next state. In this case the asn will not have +been stored on the stack, so it get stored in the `ret` variable, and we +report that that input has been accepted. -Now we have the parser. If we can shift, we do, though newlines and -reducing indenting may block that. If not and we can reduce we do. -If the production we reduced was production zero, then we have -accepted the input and can finish. +###### parser vars -We return whatever `asn` was returned by reducing production zero. + void *ret = NULL; -If we can neither shift nor reduce we have an error to handle. We pop -single entries off the stack until we can shift the `TK_error` symbol, then -drop input tokens until we find one we can shift into the new error state. +###### try reduce -When we find `TK_in` and `TK_out` tokens which report indents we need -to handle them directly as the grammar cannot express what we want to -do with them. + if (states[tos->state].reduce_prod >= 0) { + void **body; + void *res; + const struct state *nextstate = &states[tos->state]; + int prod = nextstate->reduce_prod; + int size = reductions[prod].size; + int res_size = reductions[prod].result_size; + + body = p.asn_stack + (p.tos - size); + res = res_size ? calloc(1, res_size) : NULL; + res_size = do_reduce(prod, body, config, res); + if (res_size != reductions[prod].result_size) + abort(); + pop(&p, size, do_free); + if (!shift(&p, reductions[prod].sym, res, states)) { + accepted = 1; + ret = res; + parser_trace_action(trace, "Accept"); + } else + parser_trace_action(trace, "Reduce"); + continue; + } -`TK_in` tokens are easy: we simply update the `next` stack frame to -record how many indents there are and that the next token started with -an indent. +If we can neither shift nor reduce we have an error to handle. There +are two possible responses to an error: we can pop single frames off the +stack until we find a frame that can shift `TK_error`, or we can discard +the current look-ahead symbol. -`TK_out` tokens must either be counted off against any pending indent, -or must force reductions until there is a pending indent which isn't -at the start of a production. +When we first see an error we do the first of these and set a flag to +record that we are processing an error. If the normal shift/reduce +tests fail to find that anything can be done from that state, we start +dropping tokens until either we manage to shift one, or reach end-of-file. -`TK_newline` tokens are ignored precisely if there has been an indent -since the last state which could have been at the start of a line. +###### parser vars + + int in_err = 0; + +###### did shift + + in_err = 0; + +###### handle error + + if (in_err) { + parser_trace_action(trace, "DISCARD"); + if (tk->num == TK_eof) + break; + free(tk); + tk = NULL; + } else { + struct token *err_tk; + + parser_trace_action(trace, "ERROR"); + + err_tk = tok_copy(*tk); + while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0) + // discard this state + pop(&p, 1, do_free); + if (p.tos == 0) { + free(err_tk); + // no state accepted TK_error + break; + } + in_err = 1; + } ###### parser includes #include "parser.h" + ###### parser_run + void *parser_run(struct token_state *tokens, const struct state states[], + const struct reduction reductions[], int (*do_reduce)(int, void**, struct token_config*, void*), void (*do_free)(short, void*), FILE *trace, const char *non_term[], struct token_config *config) { - struct parser p = { 0 }; - struct frame next = { 0 }; - struct token *tk = NULL; - int accepted = 0; - void *ret = NULL; - - shift(&p, &next, NULL, states); - while (!accepted) { - struct token *err_tk; - struct frame *tos = &p.stack[p.tos-1]; - if (!tk) - tk = tok_copy(token_next(tokens)); - next.sym = tk->num; - if (trace) - parser_trace(trace, &p, &next, tk, states, non_term, config->known_count); - - if (next.sym == TK_in) { - next.starts_indented = 1; - next.indents = 1; - free(tk); - tk = NULL; - continue; - } - if (next.sym == TK_out) { - if (tos->indents > tos->starts_indented || - (tos->indents == 1 && - states[tos->state].reduce_size != 1)) { - tos->indents -= 1; - if (tos->indents <= tos->starts_indented) { - // no internal indent any more, reassess 'newline_permitted' - if (states[tos->state].starts_line) - tos->newline_permitted = 1; - else if (p.tos > 1) - tos->newline_permitted = p.stack[p.tos-2].newline_permitted; - } - free(tk); - tk = NULL; - continue; - } - // fall through and force a REDUCE (as 'shift' - // will fail). - } - if (next.sym == TK_newline) { - if (! tos->newline_permitted) { - free(tk); - tk = NULL; - continue; - } - } - if (shift(&p, &next, tk, states)) { - tk = NULL; - next.starts_indented = 0; - next.indents = 0; - continue; - } - if (states[tos->state].reduce_prod >= 0) { - void **body; - void *res; - const struct state *nextstate = &states[tos->state]; - int prod = nextstate->reduce_prod; - int size = nextstate->reduce_size; - int bufsize; - static char buf[16*1024]; - struct frame frame; - frame.sym = nextstate->reduce_sym; - - body = p.asn_stack + (p.tos - size); - - bufsize = do_reduce(prod, body, config, buf); - - if (size) - pop(&p, size, &frame, do_free); - else { - frame.indents = next.indents; - frame.starts_indented = frame.indents; - next.indents = 0; - next.starts_indented = 0; - } - res = memdup(buf, bufsize); - memset(buf, 0, bufsize); - if (!shift(&p, &frame, res, states)) { - if (prod != 0) abort(); - accepted = 1; - ret = res; - } - continue; - } - if (tk->num == TK_out) { - // Indent problem - synthesise tokens to get us - // out of here. - struct frame frame = { 0 }; - fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym); - frame.sym = states[tos->state].shift_sym; - shift(&p, &frame, tok_copy(*tk), states); - // FIXME need to report this error somehow - continue; - } - /* Error. We walk up the stack until we - * find a state which will accept TK_error. - * We then shift in TK_error and see what state - * that takes us too. - * Then we discard input tokens until - * we find one that is acceptable. - */ + ## parser vars + + ## find eol + + ## heart of parser - err_tk = tok_copy(*tk); - next.sym = TK_error; - while (shift(&p, &next, err_tk, states) == 0 - && p.tos > 0) - // discard this state - pop(&p, 1, &next, do_free); - if (p.tos == 0) { - free(err_tk); - // no state accepted TK_error - break; - } - tos = &p.stack[p.tos-1]; - while (search(&states[tos->state], tk->num) < 0 && - tk->num != TK_eof) { - free(tk); - tk = tok_copy(token_next(tokens)); - if (tk->num == TK_in) - next.indents += 1; - if (tk->num == TK_out) { - if (next.indents == 0) - break; - next.indents -= 1; - } - } - if (p.tos == 0 && tk->num == TK_eof) - break; - } free(tk); - if (p.tos) - pop(&p, p.tos, &next, do_free); + pop(&p, p.tos, do_free); free(p.asn_stack); free(p.stack); return ret; @@ -2731,6 +2949,7 @@ since the last state which could have been at the start of a line. ###### exported functions void *parser_run(struct token_state *tokens, const struct state states[], + const struct reduction reductions[], int (*do_reduce)(int, void**, struct token_config*, void*), void (*do_free)(short, void*), FILE *trace, const char *non_term[], @@ -2762,21 +2981,14 @@ end inside square brackets. [TK_newline] = "NEWLINE", [TK_eof] = "$eof", }; - static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[]) - { - fprintf(trace, "(%d", f->state); - if (states[f->state].starts_line) - fprintf(trace, "s"); - if (f->newline_permitted) - fprintf(trace, "n"); - fprintf(trace, ") "); - } - void parser_trace(FILE *trace, struct parser *p, struct frame *n, + void parser_trace(FILE *trace, struct parser *p, struct token *tk, const struct state states[], const char *non_term[], int knowns) { int i; + if (!trace) + return; for (i = 0; i < p->tos; i++) { struct frame *f = &p->stack[i]; if (i) { @@ -2790,12 +3002,9 @@ end inside square brackets. } else fputs(non_term[sym - TK_reserved - knowns], trace); - if (f->indents) - fprintf(trace, "%c%d", f->starts_indented?':':'.', - f->indents); fputs(" ", trace); } - parser_trace_state(trace, f, states); + fprintf(trace, "(%d) ", f->state); } fprintf(trace, "["); if (tk->num < TK_reserved && @@ -2803,32 +3012,38 @@ end inside square brackets. fputs(reserved_words[tk->num], trace); else text_dump(trace, tk->txt, 20); - if (n->indents) - fprintf(trace, "%c%d", n->starts_indented?':':'.', - n->indents); - fputs("]\n", trace); + fprintf(trace, ":%d:%d]", tk->line, tk->col); + } + + void parser_trace_action(FILE *trace, char *action) + { + if (trace) + fprintf(trace, " - %s\n", action); } # A Worked Example The obvious example for a parser is a calculator. -As `scanner` provides number parsing function using `libgmp` is it not much +As `scanner` provides number parsing function using `libgmp` it is not much work to perform arbitrary rational number calculations. -This calculator takes one expression, or an equality test per line. The -results are printed and if any equality test fails, the program exits with -an error. +This calculator takes one expression, or an equality test, per line. +The results are printed and if any equality test fails, the program +exits with an error. ###### File: parsergen.mk calc.c calc.h : parsergen parsergen.mdc ./parsergen --tag calc -o calc parsergen.mdc calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp + calctest : calc + ./calc parsergen.mdc + demos :: calctest # calc: header - #include "number.h" + #include "parse_number.h" // what do we use for a demo-grammar? A calculator of course. struct number { mpq_t val; @@ -2845,9 +3060,9 @@ an error. #include #include #include + #include #include "mdcode.h" #include "scanner.h" - #include "number.h" #include "parser.h" #include "calc.h" @@ -2858,33 +3073,44 @@ an error. free(n); } + static int text_is(struct text t, char *s) + { + return (strlen(s) == t.len && + strncmp(s, t.txt, t.len) == 0); + } + int main(int argc, char *argv[]) { int fd = open(argv[1], O_RDONLY); int len = lseek(fd, 0, 2); char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); - struct section *s = code_extract(file, file+len, NULL); + struct section *table = code_extract(file, file+len, NULL); + struct section *s; struct token_config config = { .ignored = (1 << TK_line_comment) - | (1 << TK_block_comment) | (1 << TK_in) | (1 << TK_out), .number_chars = ".,_+-", .word_start = "", .word_cont = "", }; - parse_calc(s->code, &config, argc > 2 ? stderr : NULL); - while (s) { - struct section *t = s->next; - code_free(s->code); - free(s); - s = t; + for (s = table; s; s = s->next) + if (text_is(s->section, "example: input")) + parse_calc(s->code, &config, argc > 2 ? stderr : NULL); + while (table) { + struct section *t = table->next; + code_free(table->code); + free(table); + table = t; } exit(0); } # calc: grammar + $LEFT + - + $LEFT * / // + Session -> Session Line | Line @@ -2907,13 +3133,32 @@ an error. | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$ $number - Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$ - | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ - | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ + Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$ + | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ + | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ + | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ + | Expression // Expression ${ { + mpz_t z0, z1, z2; + mpq_init($0.val); + mpz_init(z0); mpz_init(z1); mpz_init(z2); + mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val)); + mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val)); + mpz_tdiv_q(z0, z1, z2); + mpq_set_z($0.val, z0); + mpz_clear(z0); mpz_clear(z1); mpz_clear(z2); + } }$ + | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ + | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ - Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ - | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ - | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ +# example: input - Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$ - | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ + 355/113 + 3.1415926535 - 355/113 + 2 + 4 * 5 + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 + 10 * 9 / 2 + 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1 + + 355//113 + + error