From 5e8cc61fe8626814a855a6347b3df3c24e7b7bb0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 13 Jul 2013 07:21:37 +1000 Subject: [PATCH] New file: parsergen This reads and analyses a grammar and generates a parser. It include a simple calculator Signed-off-by: NeilBrown --- csrc/parsergen.mdc | 2554 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2554 insertions(+) create mode 100644 csrc/parsergen.mdc diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc new file mode 100644 index 0000000..ddf03ab --- /dev/null +++ b/csrc/parsergen.mdc @@ -0,0 +1,2554 @@ +# LR(1) Parser Generator # + +This parser generator takes a grammar description combined with code +fragments, analyses it, and can report details about the analysis and +write out C code files which can be compiled to make a parser. + +There are several distinct sections. + +1. `grammar_read` will read a grammar from a literate-code file and + store the productions, symbols, and code fragments. +2. `grammar_analyse` will take that grammar and build LR parsing + states and look-ahead tables. +3. `grammar_report` will present the details of the analysis + in a readable format and will report any conflicts. +4. `parser_generate` will write out C code files with various + tables and with the code fragments arranged in useful places. +5. `parser_run` is a library function intended to be linked together + with the generated parser tables to complete the implementation of + a parser. +6. `calc.cgm` is a test grammar for a simple calculator. 
+ +###### File: parsergen.c + #include + #include + #include + ## includes + ## forward declarations + ## declarations + ## functions + ## grammar_read + ## grammar_analyse + ## grammar_report + ## parser_generate + ## main +###### File: parser.h + ## exported types + ## exported functions +###### File: libparser.c + #include + #include + #include + ## parser includes + ## parser +###### File: calc.cgm + ## demo grammar +###### File: parsergen.mk + CFLAGS += -Wall -g + all :: parsergen calc + parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc + ./md2c parsergen.mdc + +## Reading the grammar + +The grammar must be stored in a markdown literate code file as parsed +by "[mdcode][]". It must have three top level (i.e. unreferenced) +sections: `header`, `code`, and `grammar`. The first two will be +literally copied into the generated `.c` and `.h`. files. The last +contains the grammar. This is tokenised with "[scanner][]". + +[mdcode]: mdcode.html +[scanner]: scanner.html + +###### includes + #include "mdcode.h" + #include "scanner.h" + +###### parser includes + #include "mdcode.h" + #include "scanner.h" + +The grammar contains production sets, precedence/associativity +declarations, and data type declarations. These are all parsed with +_ad hoc_ parsing as we don't have a parser generator yet. + +The precedence and associativity can be set for each production, but +can be inherited from symbols. The data type is potentially defined +for each non-terminal and describes what C structure is used to store +information about each symbol. 
+ +###### declarations + enum assoc {Left, Right, Non}; + char *assoc_names[] = {"Left","Right","Non"}; + + struct symbol { + struct text struct_name; + enum assoc assoc; + unsigned short precedence; + ## symbol fields + }; + struct production { + unsigned short precedence; + enum assoc assoc; + ## production fields + }; + struct grammar { + ## grammar fields + }; + +The strings reported by `mdcode` and `scanner` are `struct text` which have +length rather than being null terminated. To help with printing an +comparing we define `text_is` and `prtxt`, which should possibly go in +`mdcode`. `scanner` does provide `text_dump` which is useful for strings +which might contain control characters. + +###### functions + static int text_is(struct text t, char *s) + { + return (strlen(s) == t.len && + strncmp(s, t.txt, t.len) == 0); + } + static void prtxt(struct text t) + { + printf("%.*s", t.len, t.txt); + } + +### Symbols + +Productions are comprised primarily of symbols - terminal and +non-terminal. We do not make a syntactic distinction between the two, +though non-terminals must be identifiers. Non-terminal symbols are +those which appear in the head of a production, terminal symbols are +those which don't. There are also "virtual" symbols used for precedence +marking discussed later, and sometimes we won't know what type a symbol +is yet. + +###### forward declarations + enum symtype { Unknown, Virtual, Terminal, Nonterminal }; + char *symtypes = "UVTN"; +###### symbol fields + enum symtype type; + +Symbols can be either `TK_ident` or `TK_mark`. They are saved in a +table of known symbols and the resulting parser will report them as +`TK_reserved + N`. A small set of identifiers are reserved for the +different token types that `scanner` can report. 
+ +###### declarations + static char *reserved_words[] = { + [TK_error] = "ERROR", + [TK_number] = "NUMBER", + [TK_ident] = "IDENTIFIER", + [TK_mark] = "MARK", + [TK_string] = "STRING", + [TK_multi_string] = "MULTI_STRING", + [TK_indent] = "INDENT", + [TK_undent] = "UNDENT", + [TK_newline] = "NEWLINE", + [TK_eof] = "$eof", + }; +###### symbol fields + short num; + +Note that `TK_eof` and the two `TK_*_comment` tokens cannot be +recognised. The former is automatically expected at the end of the text +being parsed. The latter are always ignored by the parser. + +All of these symbols are stored in a simple symbol table. We use the +`struct text` from `mdcode` to store the name and link them together +into a sorted list using an insertion sort. + +We don't have separate `find` and `insert` functions as any symbol we +find needs to be remembered. We simply expect `find` to always return a +symbol, but its type might be `Unknown`. + +###### includes + #include + +###### symbol fields + struct text name; + struct symbol *next; + +###### grammar fields + struct symbol *syms; + int num_syms; + +###### functions + static int text_cmp(struct text a, struct text b) + { + int len = a.len; + if (a.len > b.len) + len = b.len; + int cmp = strncmp(a.txt, b.txt, len); + if (cmp) + return cmp; + else + return a.len - b.len; + } + + static struct symbol *sym_find(struct grammar *g, struct text s) + { + struct symbol **l = &g->syms; + struct symbol *n; + int cmp = 1; + + while (*l && + (cmp = text_cmp((*l)->name, s)) < 0) + l = & (*l)->next; + if (cmp == 0) + return *l; + n = calloc(1, sizeof(*n)); + n->name = s; + n->type = Unknown; + n->next = *l; + n->num = -1; + *l = n; + return n; + } + + static void symbols_init(struct grammar *g) + { + int entries = sizeof(reserved_words)/sizeof(reserved_words[0]); + int i; + for (i = 0; i < entries; i++) { + struct text t; + struct symbol *s; + t.txt = reserved_words[i]; + if (!t.txt) + continue; + t.len = strlen(t.txt); + s = sym_find(g, t); + 
s->type = Terminal; + s->num = i; + } + } + +### Data types and precedence. + +Data type specification and precedence specification are both +introduced by a dollar sign at the start of the line. If the next +word is `LEFT`, `RIGHT` or `NON`, then the line specifies a +precedence, otherwise it specifies a data type. + +The data type name is simply stored and applied to the head of all +subsequent productions. It must be the name of a structure, so `$expr` +maps to `struct expr`. + +The precedence line must contain a list of symbols - typically +terminal symbols, but not necessarily. It can only contain symbols +that have not been seen yet, so precedence declaration must precede +the productions that they affect. + +A precedence line may also contain a "virtual" symbol which is an +identifier preceded by `$$`. Virtual terminals carry precedence +information but are not included in the grammar. A production can +declare that it inherits the precedence of a given virtual symbol. + +This use for `$$` precludes it from being used as a symbol in the +described language. Two other symbols: `${` and `}$` are also +unavailable. + +Each new precedence line introduces a new precedence level and +declares how it associates. This level is stored in each symbol +listed and may be inherited by any production which uses the symbol. A +production inherits from the last symbol which has a precedence. 
+ +###### grammar fields + struct text current_type; + int prec_levels; + +###### declarations + enum symbols { TK_virtual = TK_reserved, TK_open, TK_close }; + static const char *known[] = { "$$", "${", "}$" }; + +###### functions + static char *dollar_line(struct token_state *ts, struct grammar *g) + { + struct token t = token_next(ts); + char *err; + enum assoc assoc; + int found; + + if (t.num != TK_ident) { + err = "type or assoc expected after '$'"; + goto abort; + } + if (text_is(t.txt, "LEFT")) + assoc = Left; + else if (text_is(t.txt, "RIGHT")) + assoc = Right; + else if (text_is(t.txt, "NON")) + assoc = Non; + else { + g->current_type = t.txt; + t = token_next(ts); + if (t.num != TK_newline) { + err = "Extra tokens after type name"; + goto abort; + } + return NULL; + } + + // This is a precedence line, need some symbols. + found = 0; + g->prec_levels += 1; + t = token_next(ts); + while (t.num != TK_newline) { + enum symtype type = Terminal; + struct symbol *s; + if (t.num == TK_virtual) { + type = Virtual; + t = token_next(ts); + if (t.num != TK_ident) { + err = "$$ must be followed by a word"; + goto abort; + } + } else if (t.num != TK_ident && + t.num != TK_mark) { + err = "Illegal token in precedence line"; + goto abort; + } + s = sym_find(g, t.txt); + if (s->type != Unknown) { + err = "Symbols in precedence line must not already be known."; + goto abort; + } + s->type = type; + s->precedence = g->prec_levels; + s->assoc = assoc; + found += 1; + } + if (found == 0) + err = "No symbols given on precedence line"; + goto abort; + return NULL; + abort: + while (t.num != TK_newline && t.num != TK_eof) + t = token_next(ts); + return err; + } + +### Productions + +A production either starts with an identifier which is the head +non-terminal, or a vertical bar (`|`) in which case this production +uses the same head as the previous one. The identifier must be +followed by a `->` mark. 
All productions for a given non-terminal must +be grouped together with the `nonterminal ->` given only once. + +After this a (possibly empty) sequence of identifiers and marks form +the body of the production. A virtual symbol may be given after the +body by preceding it with `$$`. If a virtual symbol is given, the +precedence of the production is that for the virtual symbol. If none +is given, the precedence is inherited from the last symbol in the +production which has a precedence specified. + +After the optional precedence may come the `${` mark. This indicates +the start of a code fragment. If present, this must be on the same +line as the start of the production. + +All of the text from the `${` through to the matching `}$` is +collected and forms the code-fragment for the production. It must all +be in one `code_node` of the literate code. The `}$` must be +at the end of a line. + +Text in the code fragment will undergo substitutions where `$N` for +some numeric `N` will be replaced with a variable holding the parse +information for the particular symbol in the production. `$0` is the +head of the production, `$1` is the first symbol of the body, etc. +The type of `$N` for a terminal symbol is `struct token`. For +non-terminal, it is whatever has been declared for that symbol. + +While building productions we will need to add to an array which needs to +grow dynamically. + +###### functions + static void array_add(void *varray, int *cnt, void *new) + { + void ***array = varray; + int current = 0; + const int step = 8; + current = ((*cnt-1) | (step-1))+1; + if (*cnt == current) { + /* must grow */ + current += step; + *array = realloc(*array, current * sizeof(void*)); + } + (*array)[*cnt] = new; + (*cnt) += 1; + } + +Collecting the code fragment simply involves reading tokens until we +hit the end token `}$`, and noting the character position of the start and +the end. 
+ +###### functions + static struct text collect_code(struct token_state *state, + struct token start) + { + struct text code; + struct token t; + code.txt = start.txt.txt + start.txt.len; + do + t = token_next(state); + while (t.node == start.node && + t.num != TK_close && t.num != TK_error && + t.num != TK_eof); + if (t.num == TK_close && t.node == start.node) + code.len = t.txt.txt - code.txt; + else + code.txt = NULL; + return code; + } + +Now we have all the bit we need to parse a full production. + +###### includes + #include + +###### grammar fields + struct production **productions; + int production_count; + +###### production fields + struct symbol *head; + struct symbol **body; + int body_size; + struct text code; + +###### symbol fields + int first_production; + +###### functions + static char *parse_production(struct grammar *g, + struct symbol *head, + struct token_state *state) + { + /* Head has already been parsed. */ + struct token tk; + char *err; + struct production p, *pp; + + memset(&p, 0, sizeof(p)); + p.head = head; + tk = token_next(state); + while (tk.num == TK_ident || tk.num == TK_mark) { + struct symbol *bs = sym_find(g, tk.txt); + if (bs->type == Unknown) + bs->type = Terminal; + if (bs->type == Virtual) { + err = "Virtual symbol not permitted in production"; + goto abort; + } + if (bs->precedence) { + p.precedence = bs->precedence; + p.assoc = bs->assoc; + } + array_add(&p.body, &p.body_size, bs); + tk = token_next(state); + } + if (tk.num == TK_virtual) { + struct symbol *vs; + tk = token_next(state); + if (tk.num != TK_ident) { + err = "word required after $$"; + goto abort; + } + vs = sym_find(g, tk.txt); + if (vs->type != Virtual) { + err = "symbol after $$ must be virtual"; + goto abort; + } + p.precedence = vs->precedence; + p.assoc = vs->assoc; + tk = token_next(state); + } + if (tk.num == TK_open) { + p.code = collect_code(state, tk); + if (p.code.txt == NULL) { + err = "code fragment not closed properly"; + goto abort; + } + tk 
= token_next(state);
+		}
+		if (tk.num != TK_newline && tk.num != TK_eof) {
+			err = "stray tokens at end of line";
+			goto abort;
+		}
+		pp = malloc(sizeof(*pp));
+		*pp = p;
+		array_add(&g->productions, &g->production_count, pp);
+		return NULL;
+	abort:
+		while (tk.num != TK_newline && tk.num != TK_eof)
+			tk = token_next(state);
+		return err;
+	}
+
+With the ability to parse productions and dollar-lines, we have nearly all
+that we need to parse a grammar from a `code_node`.
+
+The head of the first production will effectively be the `start` symbol of
+the grammar.  However it won't _actually_ be so.  Processing the grammar is
+greatly simplified if the real start symbol only has a single production,
+and expects `$eof` as the final terminal.  So when we find the first explicit
+production we insert an extra production as production zero which looks like
+
+###### Example: production 0
+	$start -> START $eof
+
+where `START` is the first non-terminal given.
+
+###### create production zero
+	struct production *p = calloc(1,sizeof(*p));
+	struct text start = {"$start",6};
+	struct text eof = {"$eof",4};
+	p->head = sym_find(g, start);
+	p->head->type = Nonterminal;
+	array_add(&p->body, &p->body_size, head);
+	array_add(&p->body, &p->body_size, sym_find(g, eof));
+	g->start = p->head->num;
+	p->head->first_production = g->production_count;
+	array_add(&g->productions, &g->production_count, p);
+
+Now we are ready to read in the grammar.
+
+###### grammar fields
+	int start;	// the 'start' symbol.
+ +###### grammar_read + static struct grammar *grammar_read(struct code_node *code) + { + struct token_config conf = { + .word_start = "", + .word_cont = "", + .words_marks = known, + .known_count = sizeof(known)/sizeof(known[0]), + .number_chars = "", + .ignored = (1 << TK_line_comment) + | (1 << TK_block_comment) + | (0 << TK_number) + | (0 << TK_string) + | (1 << TK_multi_string) + | (1 << TK_indent) + | (1 << TK_undent), + }; + + struct token_state *state = token_open(code, &conf); + struct token tk = token_next(state); + struct symbol *head = NULL; + struct grammar *g; + char *err = NULL; + + g = calloc(1, sizeof(*g)); + symbols_init(g); + + for (tk = token_next(state); tk.num != TK_eof; + tk = token_next(state)) { + if (tk.num == TK_newline) + continue; + if (tk.num == TK_ident) { + // new non-terminal + head = sym_find(g, tk.txt); + if (head->type == Nonterminal) + err = "This non-terminal has already be used."; + else if (head->type == Virtual) + err = "Virtual symbol not permitted in head of production"; + else { + head->type = Nonterminal; + head->struct_name = g->current_type; + if (g->start == 0) { + ## create production zero + } + head->first_production = g->production_count; + tk = token_next(state); + if (tk.num == TK_mark && + text_is(tk.txt, "->")) + err = parse_production(g, head, state); + else + err = "'->' missing in production"; + } + } else if (tk.num == TK_mark + && text_is(tk.txt, "|")) { + // another production for same non-term + if (head) + err = parse_production(g, head, state); + else + err = "First production must have a head"; + } else if (tk.num == TK_mark + && text_is(tk.txt, "$")) { + err = dollar_line(state, g); + } else { + err = "Unrecognised token at start of line."; + } + if (err) + goto abort; + } + token_close(state); + return g; + abort: + fprintf(stderr, "Error at line %d: %s\n", + tk.line, err); + token_close(state); + free(g); + return NULL; + } + +## Analysing the grammar + +The central task in analysing the grammar 
is to determine a set of +states to drive the parsing process. This will require finding +various sets of symbols and of "items". Some of these sets will need +to attach information to each element in the set, so they are more +like maps. + +Each "item" may have a 'look-ahead' or `LA` set associated with +it. Many of these will be the same as each other. To avoid memory +wastage, and to simplify some comparisons of sets, these sets will be +stored in a list of unique sets, each assigned a number. + +Once we have the data structures in hand to manage these sets and +lists, we can start setting the 'nullable' flag, build the 'FIRST' +sets, and then create the item sets which define the various states. + +### Symbol sets. + +Though we don't only store symbols in these sets, they are the main +things we store, so they are called symbol sets or "symsets". + +A symset has a size, an array of shorts, and an optional array of data +values, which are also shorts. If the array of data is not present, +we store an impossible pointer, as `NULL` is used to indicate that no +memory has been allocated yet; + +###### declarations + struct symset { + short cnt; + unsigned short *syms, *data; + }; + #define NO_DATA ((unsigned short *)1) + const struct symset INIT_SYMSET = { 0, NULL, NO_DATA }; + const struct symset INIT_DATASET = { 0, NULL, NULL }; + +The arrays of shorts are allocated in blocks of 8 and are kept sorted +by using an insertion sort. We don't explicitly record the amount of +allocated space as it can be derived directly from the current `cnt` using +`((cnt - 1) | 7) + 1`. 
+ +###### functions + static void symset_add(struct symset *s, int key, int val) + { + int i; + int current = ((s->cnt-1) | 7) + 1; + if (current == s->cnt) { + current += 8; + s->syms = realloc(s->syms, sizeof(*s->syms) * current); + if (s->data != NO_DATA) + s->data = realloc(s->data, sizeof(*s->data) * current); + } + i = s->cnt; + while (i > 0 && s->syms[i-1] > key) { + s->syms[i] = s->syms[i-1]; + if (s->data != NO_DATA) + s->data[i] = s->data[i-1]; + i--; + } + s->syms[i] = key; + if (s->data != NO_DATA) + s->data[i] = val; + s->cnt += 1; + } + +Finding a symbol (or item) in a `symset` uses a simple binary search. +We return the index where the value was found (so data can be accessed), +or `-1` to indicate failure. + + static int symset_find(struct symset *ss, int key) + { + int lo = 0; + int hi = ss->cnt; + + if (hi == 0) + return -1; + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + if (ss->syms[mid] <= key) + lo = mid; + else + hi = mid; + } + if (ss->syms[lo] == key) + return lo; + return -1; + } + +We will often want to form the union of two symsets. When we do, we +will often want to know if anything changed (as they might mean there +is more work to do). So `symset_union` reports whether anything was +added to the first set. We use a slow quadratic approach as these +sets don't really get very big. If profiles shows this to be a problem is +can be optimised later. + + static int symset_union(struct symset *a, struct symset *b) + { + int i; + int added = 0; + for (i = 0; i < b->cnt; i++) + if (symset_find(a, b->syms[i]) < 0) { + int data = 0; + if (b->data != NO_DATA) + data = b->data[i]; + symset_add(a, b->syms[i], data); + added++; + } + return added; + } + +And of course we must be able to free a symset. 
+ + static void symset_free(struct symset ss) + { + free(ss.syms); + if (ss.data != NO_DATA) + free(ss.data); + } + +### Symset Storage + +Some symsets are simply stored somewhere appropriate in the `grammar` +data structure, others needs a bit of indirection. This applies +particularly to "LA" sets. These will be paired with "items" in an +"itemset". As itemsets will be stored in a symset, the "LA" set needs to be +stored in the `data` field, so we need a mapping from a small (short) +number to an LA `symset`. + +As mentioned earlier this involves creating a list of unique symsets. + +For now, we just use a linear list sorted by insertion. A skiplist +would be more efficient and may be added later. + +###### declarations + + struct setlist { + struct symset ss; + int num; + struct setlist *next; + }; + +###### grammar fields + struct setlist *sets; + int nextset; + +###### functions + + static int ss_cmp(struct symset a, struct symset b) + { + int i; + int diff = a.cnt - b.cnt; + if (diff) + return diff; + for (i = 0; i < a.cnt; i++) { + diff = (int)a.syms[i] - (int)b.syms[i]; + if (diff) + return diff; + } + return 0; + } + + static int save_set(struct grammar *g, struct symset ss) + { + struct setlist **sl = &g->sets; + int cmp = 1; + struct setlist *s; + + while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0) + sl = & (*sl)->next; + if (cmp == 0) { + symset_free(ss); + return (*sl)->num; + } + + s = malloc(sizeof(*s)); + s->ss = ss; + s->num = g->nextset; + g->nextset += 1; + s->next = *sl; + *sl = s; + return s->num; + } + +Finding a set by number is currently performed by a simple linear search. +If this turns out to hurt performance, we can store the sets in a dynamic +array like the productions. 
+ + static struct symset set_find(struct grammar *g, int num) + { + struct setlist *sl = g->sets; + while (sl && sl->num != num) + sl = sl->next; + return sl->ss; + } + + +### Setting `nullable` + +We set `nullable` on the head symbol for any production for which all +the body symbols (if any) are nullable. As this is a recursive +definition, any change in the `nullable` setting means that we need to +re-evaluate where it needs to be set. + +We simply loop around performing the same calculations until no more +changes happen. + +###### symbol fields + int nullable; + +###### functions + static void set_nullable(struct grammar *g) + { + int check_again = 1; + while (check_again) { + int p; + check_again = 0; + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int s; + + if (pr->head->nullable) + continue; + for (s = 0; s < pr->body_size; s++) + if (! pr->body[s]->nullable) + break; + if (s == pr->body_size) { + pr->head->nullable = 1; + check_again = 1; + } + } + } + } + +### Building the `first` sets + +When calculating what can follow a particular non-terminal, we will need to +know what the "first" terminal in an subsequent non-terminal might be. So +we calculate the `first` set for every non-terminal and store them in an +array. We don't bother recording the "first" set for terminals as they are +trivial. + +As the "first" for one symbol might depend on the "first" of another, +we repeat the calculations until no changes happen, just like with +`set_nullable`. This makes use of the fact that `symset_union` +reports if any change happens. + +The core of this which finds the "first" of part of a production body +will be reused for computing the "follow" sets, so we split it out +into a separate function. 
+ +###### grammar fields + struct symset *first; + +###### functions + + static int add_first(struct production *pr, int start, + struct symset *target, struct grammar *g, + int *to_end) + { + int s; + int changed = 0; + for (s = start; s < pr->body_size; s++) { + struct symbol *bs = pr->body[s]; + if (bs->type == Terminal) { + if (symset_find(target, bs->num) < 0) { + symset_add(target, bs->num, 0); + changed = 1; + } + break; + } else if (symset_union(target, &g->first[bs->num])) + changed = 1; + if (!bs->nullable) + break; + } + if (to_end) + *to_end = (s == pr->body_size); + return changed; + } + + static void build_first(struct grammar *g) + { + int check_again = 1; + int s; + g->first = calloc(g->num_syms, sizeof(g->first[0])); + for (s = 0; s < g->num_syms; s++) + g->first[s] = INIT_SYMSET; + + while (check_again) { + int p; + check_again = 0; + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + struct symset *head = &g->first[pr->head->num]; + + if (add_first(pr, 0, head, g, NULL)) + check_again = 1; + } + } + } + +### Building the `follow` sets. + +There are two different situations when we will want to generate "follow" +sets. If we are doing an SLR analysis, we want to generate a single +"follow" set for each non-terminal in the grammar. That is what is +happening here. If we are doing an LALR or LR analysis we will want +to generate a separate "LA" set for each item. We do that later +in state generation. + +There are two parts to generating a "follow" set. Firstly we look at +every place that any non-terminal appears in the body of any +production, and we find the set of possible "first" symbols after +there. This is done using `add_first` above and only needs to be done +once as the "first" sets are now stable and will not change. 
+ +###### follow code + + for (p = 0; p < g->production_count; p++) { + struct production *pr = g->productions[p]; + int b; + + for (b = 0; b < pr->body_size - 1; b++) { + struct symbol *bs = pr->body[b]; + if (bs->type == Terminal) + continue; + add_first(pr, b+1, &g->follow[bs->num], g, NULL); + } + } + +The second part is to add the "follow" set of the head of a production +to the "follow" sets of the final symbol in the production, and any +other symbol which is followed only by `nullable` symbols. As this +depends on "follow" itself we need to repeatedly perform the process +until no further changes happen. + +###### follow code + + for (again = 0, p = 0; + p < g->production_count; + p < g->production_count-1 + ? p++ : again ? (again = 0, p = 0) + : p++) { + struct production *pr = g->productions[p]; + int b; + + for (b = pr->body_size - 1; b >= 0; b--) { + struct symbol *bs = pr->body[b]; + if (bs->type == Terminal) + break; + if (symset_union(&g->follow[bs->num], + &g->follow[pr->head->num])) + again = 1; + if (!bs->nullable) + break; + } + } + +We now just need to create and initialise the `follow` list to get a +complete function. + +###### grammar fields + struct symset *follow; + +###### functions + static void build_follow(struct grammar *g) + { + int s, again, p; + g->follow = calloc(g->num_syms, sizeof(g->follow[0])); + for (s = 0; s < g->num_syms; s++) + g->follow[s] = INIT_SYMSET; + ## follow code + } + +### Building itemsets and states + +There are three different levels of detail that can be chosen for +building the itemsets and states for the LR grammar. They are: + +1. LR(0) or SLR(1), where no look-ahead is considered. +2. LALR(1) where we build look-ahead sets with each item and merge + the LA sets when we find two paths to the same "kernel" set of items. +3. LR(1) where different look-ahead for any item in the code means + a different state must be created. 
+
+###### forward declarations
+	enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
+
+We need to be able to look through existing states to see if a newly
+generated state already exists.  For now we use a simple sorted linked
+list.
+
+An item is a pair of numbers: the production number and the position of
+"DOT", which is an index into the body of the production.
+As the numbers are not enormous we can combine them into a single "short"
+and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
+production number (so 2000 productions with a maximum of 31 symbols in the
+body).
+
+Comparing the itemsets will be a little different to comparing symsets
+as we want to do the lookup after generating the "kernel" of an
+itemset, so we need to ignore the offset=zero items which are added during
+completion.
+
+To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
+the list in the symset, and then only compare items before the first "0".
+
+###### declarations
+	static inline unsigned short item_num(int production, int index)
+	{
+		return production | (((index-1)&0x1f) << 11);
+	}
+	static inline int item_prod(unsigned short item)
+	{
+		return item & 0x7ff;
+	}
+	static inline int item_index(unsigned short item)
+	{
+		return ((item >> 11)+1) & 0x1f;
+	}
+
+For LR(1) analysis we need to compare not just the itemset in a state
+but also the LA sets.  As we assign each unique LA set a number, we
+can just compare the symset and the data values together.
+ +###### functions + static int itemset_cmp(struct symset a, struct symset b, + enum grammar_type type) + { + int i; + int av, bv; + + for (i = 0; + i < a.cnt && i < b.cnt && + item_index(a.syms[i]) > 0 && + item_index(b.syms[i]) > 0; + i++) { + int diff = a.syms[i] - b.syms[i]; + if (diff) + return diff; + if (type == LR1) { + diff = a.data[i] - b.data[i]; + if (diff) + return diff; + } + } + if (i == a.cnt || item_index(a.syms[i]) == 0) + av = -1; + else + av = a.syms[i]; + if (i == b.cnt || item_index(b.syms[i]) == 0) + bv = -1; + else + bv = b.syms[i]; + if (av - bv) + return av - bv; + if (type < LR1 || av == -1) + return 0; + return + a.data[i] - b.data[i]; + } + +And now we can build the list of itemsets. The lookup routine returns +both a success flag and a pointer to where in the list an insert +should happen, so we don't need to search a second time. + +###### declarations + struct itemset { + struct itemset *next; + short state; + struct symset items; + struct symset go_to; + char completed; + }; + +###### grammar fields + struct itemset *items; + int states; + +###### functions + static int itemset_find(struct grammar *g, struct itemset ***where, + struct symset kernel, enum grammar_type type) + { + struct itemset **ip; + + for (ip = &g->items; *ip ; ip = & (*ip)->next) { + struct itemset *i = *ip; + int diff; + diff = itemset_cmp(i->items, kernel, type); + if (diff < 0) + continue; + if (diff > 0) + break; + /* found */ + *where = ip; + return 1; + } + *where = ip; + return 0; + } + +Adding an itemset may require merging the LA sets if LALR analysis is +happening. If any new LA set add symbol that weren't in the old LA set, we +clear the `completed` flag so that the dependants of this itemset will be +recalculated and their LA sets updated. + +`add_itemset` must consume the symsets it is passed, either by adding +them to a data structure, of freeing them. 
+ + static int add_itemset(struct grammar *g, struct symset ss, + enum grammar_type type) + { + struct itemset **where, *is; + int i; + int found = itemset_find(g, &where, ss, type); + if (!found) { + is = calloc(1, sizeof(*is)); + is->state = g->states; + g->states += 1; + is->items = ss; + is->next = *where; + is->go_to = INIT_DATASET; + *where = is; + return is->state; + } + is = *where; + if (type != LALR) { + symset_free(ss); + return is->state; + } + for (i = 0; i < ss.cnt; i++) { + struct symset temp = INIT_SYMSET, s; + if (ss.data[i] == is->items.data[i]) + continue; + s = set_find(g, is->items.data[i]); + symset_union(&temp, &s); + s = set_find(g, ss.data[i]); + if (symset_union(&temp, &s)) { + is->items.data[i] = save_set(g, temp); + is->completed = 0; + } else + symset_free(temp); + } + symset_free(ss); + return is->state; + } + +#### The build + +To build all the itemsets, we first insert the initial itemset made from the +start symbol, complete each itemset, and then generate new itemsets from old +until no new ones can be made. + +Completing an itemset means finding all the items where "DOT" is followed by +a nonterminal and adding "DOT=0" items for every production from that +non-terminal - providing each item hasn't already been added. + +If LA sets are needed, the LA set for each new item is found using +`add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may +be supplemented by the LA set for the item which produce the new item. + +We also collect a set of all symbols which follow "DOT" (in `done`) as this +is used in the next stage. + +NOTE: precedence handling should happen here - I haven't written this yet +though. 
+ +###### complete itemset + for (i = 0; i < is->items.cnt; i++) { + int p = item_prod(is->items.syms[i]); + int bs = item_index(is->items.syms[i]); + struct production *pr = g->productions[p]; + int p2; + struct symbol *s; + struct symset LA = INIT_SYMSET; + int sn = 0; + + if (bs == pr->body_size) + continue; + s = pr->body[bs]; + if (symset_find(&done, s->num) < 0) + symset_add(&done, s->num, 0); + if (s->type != Nonterminal) + continue; + again = 1; + if (type >= LALR) { + // Need the LA set. + int to_end; + add_first(pr, bs+1, &LA, g, &to_end); + if (to_end) { + struct symset ss = set_find(g, is->items.data[i]); + symset_union(&LA, &ss); + } + sn = save_set(g, LA); + LA = set_find(g, sn); + } + /* Add productions for this symbol */ + for (p2 = s->first_production; + p2 < g->production_count && + g->productions[p2]->head == s; + p2++) { + int itm = item_num(p2, 0); + int pos = symset_find(&is->items, itm); + if (pos < 0) { + symset_add(&is->items, itm, sn); + /* Will have re-ordered, so start + * from beginning again */ + i = -1; + } else if (type >= LALR) { + struct symset ss = set_find(g, is->items.data[pos]); + struct symset tmp = INIT_SYMSET; + + symset_union(&tmp, &ss); + if (symset_union(&tmp, &LA)) { + is->items.data[pos] = save_set(g, tmp); + i = -1; + }else + symset_free(tmp); + } + } + } + +For each symbol we found (and placed in `done`) we collect all the items for +which this symbol is next, and create a new itemset with "DOT" advanced over +the symbol. This is then added to the collection of itemsets (or merged +with a pre-existing itemset). + +###### derive itemsets + // Now we have a completed itemset, so we need to + // create all the 'next' itemset and create them + // if they don't exist. 
+ for (i = 0; i < done.cnt; i++) { + int j; + int state; + struct symset newitemset = INIT_SYMSET; + if (type >= LALR) + newitemset = INIT_DATASET; + + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + int la = 0; + int pos; + + if (bp == pr->body_size) + continue; + if (pr->body[bp]->num != done.syms[i]) + continue; + if (type >= LALR) + la = is->items.data[j]; + pos = symset_find(&newitemset, pr->head->num); + if (pos < 0) + symset_add(&newitemset, item_num(p, bp+1), la); + else if (type >= LALR) { + // Need to merge la set. + int la2 = newitemset.data[pos]; + if (la != la2) { + struct symset ss = set_find(g, la2); + struct symset LA = INIT_SYMSET; + symset_union(&LA, &ss); + ss = set_find(g, la); + if (symset_union(&LA, &ss)) + newitemset.data[pos] = save_set(g, LA); + else + symset_free(LA); + } + } + } + state = add_itemset(g, newitemset, type); + if (symset_find(&is->go_to, done.syms[i]) < 0) + symset_add(&is->go_to, done.syms[i], state); + } + +All that is left is to crate the initial itemset from production zero, and +with `TK_eof` as the LA set. + +###### functions + static void build_itemsets(struct grammar *g, enum grammar_type type) + { + struct symset first = INIT_SYMSET; + struct itemset *is; + int again; + int la = 0; + if (type >= LALR) { + // LA set just has eof + struct symset eof = INIT_SYMSET; + symset_add(&eof, TK_eof, 0); + la = save_set(g, eof); + first = INIT_DATASET; + } + // production 0, offset 0 (with no data) + symset_add(&first, item_num(0, 0), la); + add_itemset(g, first, type); + for (again = 0, is = g->items; + is; + is = is->next ?: again ? (again = 0, g->items) : NULL) { + int i; + struct symset done = INIT_SYMSET; + if (is->completed) + continue; + is->completed = 1; + ## complete itemset + ## derive itemsets + symset_free(done); + } + } + +### Completing the analysis. 
+ +The exact process of analysis depends on which level was requested. For +`LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and +`LR1` we need first, but not follow. For `SLR` we need both. + +We don't build the "action" tables that you might expect as the parser +doesn't use them. They will be calculated to some extent if needed for +a report. + +Once we have built everything we allocate arrays for the two lists: +symbols and itemsets. This allows more efficient access during reporting. +The symbols are grouped as terminals and non-terminals and we record the +changeover point in `first_nonterm`. + +###### grammar fields + struct symbol **symtab; + struct itemset **statetab; + int first_nonterm; + +###### grammar_analyse + + static void grammar_analyse(struct grammar *g, enum grammar_type type) + { + struct symbol *s; + struct itemset *is; + int snum = TK_reserved; + for (s = g->syms; s; s = s->next) + if (s->num < 0 && s->type == Terminal) { + s->num = snum; + snum++; + } + g->first_nonterm = snum; + for (s = g->syms; s; s = s->next) + if (s->num < 0) { + s->num = snum; + snum++; + } + g->num_syms = snum; + g->symtab = calloc(g->num_syms, sizeof(g->symtab[0])); + for (s = g->syms; s; s = s->next) + g->symtab[s->num] = s; + + if (type >= SLR) { + set_nullable(g); + build_first(g); + } + if (type == SLR) + build_follow(g); + + build_itemsets(g, type); + + g->statetab = calloc(g->states, sizeof(g->statetab[0])); + for (is = g->items; is ; is = is->next) + g->statetab[is->state] = is; + } + +## Reporting on the Grammar + +The purpose of the report is to give the grammar developer insight into +how the grammar parser will work. It is basically a structured dump of +all the tables that have been generated, plus an description of any conflicts. 
+ +###### grammar_report + static int grammar_report(struct grammar *g, enum grammar_type type) + { + report_symbols(g); + if (g->follow) + report_follow(g); + report_itemsets(g); + return report_conflicts(g, type); + } + +Firstly we have the complete list of symbols, together with the "FIRST" +set if that was generated. + +###### functions + + static void report_symbols(struct grammar *g) + { + int n; + if (g->first) + printf("SYMBOLS + FIRST:\n"); + else + printf("SYMBOLS:\n"); + + for (n = 0; n < g->num_syms; n++) { + struct symbol *s = g->symtab[n]; + if (!s) + continue; + + printf(" %c%3d%c: ", + s->nullable ? '*':' ', + s->num, symtypes[s->type]); + prtxt(s->name); + if (s->precedence) + printf(" (%d%s)", s->precedence, + assoc_names[s->assoc]); + + if (g->first && s->type == Nonterminal) { + int j; + char c = ':'; + for (j = 0; j < g->first[n].cnt; j++) { + printf("%c ", c); + c = ','; + prtxt(g->symtab[g->first[n].syms[j]]->name); + } + } + printf("\n"); + } + } + +Then we have to follow sets if they were computed. + + static void report_follow(struct grammar *g) + { + int n; + printf("FOLLOW:\n"); + for (n = 0; n < g->num_syms; n++) + if (g->follow[n].cnt) { + int j; + char c = ':'; + printf(" "); + prtxt(g->symtab[n]->name); + for (j = 0; j < g->follow[n].cnt; j++) { + printf("%c ", c); + c = ','; + prtxt(g->symtab[g->follow[n].syms[j]]->name); + } + printf("\n"); + } + } + +And finally the item sets. These include the GO TO tables and, for +LALR and LR1, the LA set for each item. Lots of stuff, so we break +it up a bit. First the items, with production number and associativity. + + static void report_item(struct grammar *g, int itm) + { + int p = item_prod(itm); + int dot = item_index(itm); + struct production *pr = g->productions[p]; + int i; + + printf(" "); + prtxt(pr->head->name); + printf(" ->"); + for (i = 0; i < pr->body_size; i++) { + printf(" %s", (dot == i ? ". 
": "")); + prtxt(pr->body[i]->name); + } + if (dot == pr->body_size) + printf(" ."); + printf(" [%d]", p); + if (pr->precedence) + printf(" (%d%s)", pr->precedence, + assoc_names[pr->assoc]); + printf("\n"); + } + +The LA sets which are (possibly) reported with each item: + + static void report_la(struct grammar *g, int lanum) + { + struct symset la = set_find(g, lanum); + int i; + char c = ':'; + + printf(" LOOK AHEAD(%d)", lanum); + for (i = 0; i < la.cnt; i++) { + printf("%c ", c); + c = ','; + prtxt(g->symtab[la.syms[i]]->name); + } + printf("\n"); + } + +Then the go to sets: + + + static void report_goto(struct grammar *g, struct symset gt) + { + int i; + printf(" GOTO:\n"); + + for (i = 0; i < gt.cnt; i++) { + printf(" "); + prtxt(g->symtab[gt.syms[i]]->name); + printf(" -> %d\n", gt.data[i]); + } + } + +Now we can report all the item sets complete with items, LA sets, and GO TO. + + static void report_itemsets(struct grammar *g) + { + int s; + printf("ITEM SETS(%d)\n", g->states); + for (s = 0; s < g->states; s++) { + int j; + struct itemset *is = g->statetab[s]; + printf(" Itemset %d:\n", s); + for (j = 0; j < is->items.cnt; j++) { + report_item(g, is->items.syms[j]); + if (is->items.data != NO_DATA) + report_la(g, is->items.data[j]); + } + report_goto(g, is->go_to); + } + } + +### Reporting conflicts + +Conflict detection varies a lot among different analysis levels. However +LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and +LR1 are also similar as they have FOLLOW or LA sets. + +###### functions + + ## conflict functions + + static int report_conflicts(struct grammar *g, enum grammar_type type) + { + int cnt = 0; + printf("Conflicts:\n"); + if (type < SLR) + cnt = conflicts_lr0(g, type); + else + cnt = conflicts_slr(g, type); + if (cnt == 0) + printf(" - no conflicts\n"); + return cnt; + } + +LR0 conflicts are any state which have both a reducible item and +a shiftable item. 
+
+LR05 conflicts only occur if two possible reductions exist,
+as shifts always over-ride reductions.
+
+###### conflict functions
+	static int conflicts_lr0(struct grammar *g, enum grammar_type type)
+	{
+		int i;
+		int cnt = 0;
+		for (i = 0; i < g->states; i++) {
+			struct itemset *is = g->statetab[i];
+			int last_reduce = -1;
+			int prev_reduce = -1;
+			int last_shift = -1;
+			int j;
+			if (!is)
+				continue;
+			for (j = 0; j < is->items.cnt; j++) {
+				int itm = is->items.syms[j];
+				int p = item_prod(itm);
+				int bp = item_index(itm);
+				struct production *pr = g->productions[p];
+
+				if (bp == pr->body_size) {
+					prev_reduce = last_reduce;
+					last_reduce = j;
+					continue;
+				}
+				if (pr->body[bp]->type == Terminal)
+					last_shift = j;
+			}
+			if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
+				printf("  State %d has both SHIFT and REDUCE:\n", i);
+				report_item(g, is->items.syms[last_shift]);
+				report_item(g, is->items.syms[last_reduce]);
+				cnt++;
+			}
+			if (prev_reduce >= 0) {
+				printf("  State %d has 2 (or more) reducible items\n", i);
+				report_item(g, is->items.syms[prev_reduce]);
+				report_item(g, is->items.syms[last_reduce]);
+				cnt++;
+			}
+		}
+		return cnt;
+	}
+
+SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
+look ahead, or if a symbol in a look-ahead can be shifted. They differ only
+in the source of the look ahead set.
+
+We build a dataset mapping terminal to item for possible SHIFTs and then
+another for possible REDUCE operations. We report when we get conflicts
+between the two.
+ + static int conflicts_slr(struct grammar *g, enum grammar_type type) + { + int i; + int cnt = 0; + + for (i = 0; i < g->states; i++) { + struct itemset *is = g->statetab[i]; + struct symset shifts = INIT_DATASET; + struct symset reduce = INIT_DATASET; + int j; + if (!is) + continue; + /* First collect the shifts */ + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + + if (bp < pr->body_size && + pr->body[bp]->type == Terminal) { + /* shiftable */ + int sym = pr->body[bp]->num; + if (symset_find(&shifts, sym) < 0) + symset_add(&shifts, sym, itm); + } + } + /* Now look for reduction and conflicts */ + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + + if (bp < pr->body_size) + continue; + /* reducible */ + struct symset la; + if (type == SLR) + la = g->follow[pr->head->num]; + else + la = set_find(g, is->items.data[j]); + int k; + for (k = 0; k < la.cnt; k++) { + int pos = symset_find(&shifts, la.syms[k]); + if (pos >= 0) { + printf(" State %d has SHIFT/REDUCE conflict on ", i); + prtxt(g->symtab[la.syms[k]]->name); + printf(":\n"); + report_item(g, shifts.data[pos]); + report_item(g, itm); + cnt++; + } + pos = symset_find(&reduce, la.syms[k]); + if (pos < 0) { + symset_add(&reduce, la.syms[k], itm); + continue; + } + printf(" State %d has REDUCE/REDUCE conflict on ", i); + prtxt(g->symtab[la.syms[k]]->name); + printf(":\n"); + report_item(g, itm); + report_item(g, reduce.data[pos]); + cnt++; + } + } + symset_free(shifts); + symset_free(reduce); + } + return cnt; + } + + +## Generating the parser + +The export part of the parser is the `parse_XX` function, where the name +`XX` is based on the name of the parser files. 
+ +This takes a `code_node`, a partially initialized `token_config`, and an +optional `FILE` to send tracing to. The `token_config` gets the list of +known words added and then is used with the `code_node` to initialize the +scanner. + +`parse_XX` then call the library function `parser_run` to actually complete +the parse, This needs the `states` table and function to call the various +pieces of code provided in the grammar file, so they are generated first. + +###### parser_generate + + static void gen_parser(FILE *f, struct grammar *g, char *file, char *name) + { + gen_known(f, g); + gen_goto(f, g); + gen_states(f, g); + gen_reduce(f, g, file); + gen_free(f, g); + + fprintf(f, "#line 0 \"gen_parser\"\n"); + fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n", + name); + fprintf(f, "{\n"); + fprintf(f, "\tstruct token_state *tokens;\n"); + fprintf(f, "\tconfig->words_marks = known;\n"); + fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n"); + fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n"); + fprintf(f, "\ttokens = token_open(code, config);\n"); + fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n"); + fprintf(f, "\ttoken_close(tokens);\n"); + fprintf(f, "\treturn rv;\n"); + fprintf(f, "}\n\n"); + } + +### Table words table + +The know words is simply an array of terminal symbols. + +###### functions + + static void gen_known(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_known\"\n"); + fprintf(f, "static const char *known[] = {\n"); + for (i = TK_reserved; + i < g->num_syms && g->symtab[i]->type == Terminal; + i++) + fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len, + g->symtab[i]->name.txt); + fprintf(f, "};\n\n"); + } + +### States and the goto tables. + +For each state we record the goto table and the reducible production if +there is one. 
+Some of the details of the reducible production are stored in the +`do_reduce` function to come later. Here we store the production number, +the body size (useful for stack management) and the resulting symbol (useful +for knowing how to free data later). + +The go to table is stored in a simple array of `sym` and corresponding +`state`. + +###### exported types + + struct lookup { + short sym; + short state; + }; + struct state { + short go_to_cnt; + const struct lookup * go_to; + short reduce_prod; + short reduce_size; + short reduce_sym; + }; + + +###### functions + + static void gen_goto(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_goto\"\n"); + for (i = 0; i < g->states; i++) { + int j; + fprintf(f, "static const struct lookup goto_%d[] = {\n", + i); + struct symset gt = g->statetab[i]->go_to; + for (j = 0; j < gt.cnt; j++) + fprintf(f, "\t{ %d, %d },\n", + gt.syms[j], gt.data[j]); + fprintf(f, "};\n"); + } + } + +###### functions + + static void gen_states(FILE *f, struct grammar *g) + { + int i; + fprintf(f, "#line 0 \"gen_states\"\n"); + fprintf(f, "static const struct state states[] = {\n"); + for (i = 0; i < g->states; i++) { + struct itemset *is = g->statetab[i]; + int j, prod = -1; + for (j = 0; j < is->items.cnt; j++) { + int itm = is->items.syms[j]; + int p = item_prod(itm); + int bp = item_index(itm); + struct production *pr = g->productions[p]; + + if (bp < pr->body_size) + continue; + /* This is what we reduce */ + prod = p; + break; + } + + fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n", + i, is->go_to.cnt, i, prod, + prod < 0 ? -1 : g->productions[prod]->body_size, + prod < 0 ? -1 : g->productions[prod]->head->num); + } + fprintf(f, "};\n\n"); + } + +### The `do_reduce` function and the code + +When the parser engine decides to reduce a production, it calls `do_reduce`. +This has two functions. + +Firstly, if a non-NULL `trace` file is passed, it prints out details of the +production being reduced. 
Secondly it runs the code that was included with +the production if any. + +This code needs to be able to store data somewhere. Rather than requiring +`do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have +`do_reduce` return the size to be saved. + +The code fragment requires translation when written out. Any `$N` needs to +be converted to a reference either to that buffer (if `$0`) or to the +structure returned by a previous reduction. These pointer need to be cast +to the appropriate type for each access. All this is handling in +`gen_code`. + + +###### functions + + static void gen_code(struct production *p, FILE *f, struct grammar *g) + { + char *c; + fprintf(f, "\t\t\t"); + for (c = p->code.txt; c < p->code.txt + p->code.len; c++) { + int n; + if (*c != '$') { + fputc(*c, f); + if (*c == '\n') + fputs("\t\t\t", f); + continue; + } + c++; + if (*c < '0' || *c > '9') { + fputc(*c, f); + continue; + } + n = *c - '0'; + while (c[1] >= '0' && c[1] <= '9') { + c += 1; + n = n * 10 + *c - '0'; + } + if (n == 0) + fprintf(f, "(*(struct %.*s*)ret)", + p->head->struct_name.len, + p->head->struct_name.txt); + else if (n > p->body_size) + fprintf(f, "$%d", n); + else if (p->body[n-1]->type == Terminal) + fprintf(f, "(*(struct token *)body[%d])", + n-1); + else if (p->body[n-1]->struct_name.txt == NULL) + fprintf(f, "$%d", n); + else + fprintf(f, "(*(struct %.*s*)body[%d])", + p->body[n-1]->struct_name.len, + p->body[n-1]->struct_name.txt, n-1); + } + fputs("\n", f); + } + +###### functions + + static void gen_reduce(FILE *f, struct grammar *g, char *file) + { + int i, j; + fprintf(f, "#line 0 \"gen_reduce\"\n"); + fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n"); + fprintf(f, " void *ret, FILE *trace)\n"); + fprintf(f, "{\n"); + fprintf(f, "\tint ret_size = 0;\n"); + + fprintf(f, "\tswitch(prod) {\n"); + for (i = 0; i < g->production_count; i++) { + struct production *p = g->productions[i]; + fprintf(f, "\tcase %d:\n", i); + + 
+			if (p->code.txt)
+				gen_code(p, f, g);
+
+			fprintf(f, "\t\tif (trace) {\n");
+			fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
+				p->head->name.len, p->head->name.txt);
+			for (j = 0; j < p->body_size; j++) {
+				if (p->body[j]->type == Terminal) {
+					fputs("\t\t\tfputs(\" \", trace);\n", f);
+					fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
+				} else {
+					fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
+						p->body[j]->name.len,
+						p->body[j]->name.txt);
+				}
+			}
+			fprintf(f, "\t\t}\n");
+
+			if (p->head->struct_name.txt)
+				fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
+					p->head->struct_name.len,
+					p->head->struct_name.txt);
+
+			fprintf(f, "\t\tbreak;\n");
+		}
+		fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
+	}
+
+### `do_free`
+
+As each non-terminal can potentially cause a different type of data
+structure to be allocated and filled in, we need to be able to free it when
+done.
+
+It is particularly important to have fine control over freeing during error
+recovery where individual stack frames might need to be freed.
+
+For this, the grammar author is required to define a `free_XX` function for
+each structure that is used by a non-terminal. `do_free` will call whichever
+is appropriate given a symbol number, and will call `free` (as is
+appropriate for tokens) on any terminal symbol.
+ +###### functions + + static void gen_free(FILE *f, struct grammar *g) + { + int i; + + fprintf(f, "#line 0 \"gen_free\"\n"); + fprintf(f, "static void do_free(short sym, void *asn)\n"); + fprintf(f, "{\n"); + fprintf(f, "\tif (!asn) return;\n"); + fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm); + fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n"); + fprintf(f, "\tswitch(sym) {\n"); + + for (i = 0; i < g->num_syms; i++) { + struct symbol *s = g->symtab[i]; + if (!s || + s->type != Nonterminal || + s->struct_name.txt == NULL) + continue; + + fprintf(f, "\tcase %d:\n", s->num); + fprintf(f, "\t\tfree_%.*s(asn);\n", + s->struct_name.len, + s->struct_name.txt); + fprintf(f, "\t\tbreak;\n"); + } + fprintf(f, "\t}\n}\n\n"); + } + +## The main routine. + +There are three key parts to the "main" function of parsergen: processing +the arguments, loading the grammar file, and dealing with the grammar. + +### Argument processing. + +Command line options allow the selection of analysis mode, name of output +file, and whether or not a report should be generated. By default we create +a report only if no code output was requested. + +The `parse_XX` function name uses the basename of the output file which +should not have a suffix (`.c`). `.c` is added to the given name for the +code, and `.h` is added for the header (if header text is specifed in the +grammar file). 
+ +###### includes + #include + +###### declarations + static const struct option long_options[] = { + { "LR0", 0, NULL, '0' }, + { "LR05", 0, NULL, '5' }, + { "SLR", 0, NULL, 'S' }, + { "LALR", 0, NULL, 'L' }, + { "LR1", 0, NULL, '1' }, + { "report", 0, NULL, 'R' }, + { "output", 1, NULL, 'o' }, + { NULL, 0, NULL, 0 } + }; + const char *options = "05SL1Ro:"; + +###### process arguments + int opt; + char *outfile = NULL; + char *infile; + char *name; + int report = 1; + enum grammar_type type = LR05; + while ((opt = getopt_long(argc, argv, options, + long_options, NULL)) != -1) { + switch(opt) { + case '0': + type = LR0; break; + case '5': + type = LR05; break; + case 'S': + type = SLR; break; + case 'L': + type = LALR; break; + case '1': + type = LR1; break; + case 'R': + report = 2; break; + case 'o': + outfile = optarg; break; + default: + fprintf(stderr, "Usage: parsergen ...\n"); + exit(1); + } + } + if (optind < argc) + infile = argv[optind++]; + else { + fprintf(stderr, "No input file given\n"); + exit(1); + } + if (outfile && report == 1) + report = 0; + name = outfile; + if (name && strchr(name, '/')) + name = strrchr(name, '/')+1; + + if (optind < argc) { + fprintf(stderr, "Excess command line arguments\n"); + exit(1); + } + +### Loading the grammar file + +To be able to run `mdcode` and `scanner` on the grammar we need to memory +map it. + +One we have extracted the code (with `mdcode`) we expect to file three +sections: header, code, and grammar. Anything else is an error. + +"header" and "code" are optional, though it is hard to build a working +parser with neither. "grammar" must be provided. 
+ +###### includes + #include + #include + #include + +###### functions + static int errs; + static void pr_err(char *msg) + { + errs++; + fprintf(stderr, "%s\n", msg); + } + +###### load file + struct section *table; + int fd; + int len; + char *file; + fd = open(infile, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "parsergen: cannot open %s: %s\n", + infile, strerror(errno)); + exit(1); + } + len = lseek(fd, 0, 2); + file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); + table = code_extract(file, file+len, pr_err); + + struct code_node *hdr = NULL; + struct code_node *code = NULL; + struct code_node *gram = NULL; + for (s = table; s; s = s->next) { + if (text_is(s->section, "header")) + hdr = s->code; + else if (text_is(s->section, "code")) + code = s->code; + else if (text_is(s->section, "grammar")) + gram = s->code; + else { + fprintf(stderr, "Unknown content section: %.*s\n", + s->section.len, s->section.txt); + rv |= 2; + } + } + +### Processing the input + +As we need to append an extention to a filename and then open it for +writing, and we need to do this twice, it helps to have a separate function. + +###### functions + + static FILE *open_ext(char *base, char *ext) + { + char *fn = malloc(strlen(base) + strlen(ext) + 1); + FILE *f; + strcat(strcpy(fn, base), ext); + f = fopen(fn, "w"); + free(fn); + return f; + } + +If we can read the grammar, then we analyse and optionally report on it. If +the report finds conflicts we will exit with an error status. + +###### process input + struct grammar *g = NULL; + if (gram == NULL) { + fprintf(stderr, "No grammar section provided\n"); + rv |= 2; + } else { + g = grammar_read(gram); + if (!g) { + fprintf(stderr, "Failure to parse grammar\n"); + rv |= 2; + } + } + if (g) { + grammar_analyse(g, type); + if (report) + if (grammar_report(g, type)) + rv |= 1; + } + +If a headers section is defined, we write it out and include a declaration +for the `parse_XX` function so it can be used from separate code. 
+ + if (rv == 0 && hdr && outfile) { + FILE *f = open_ext(outfile, ".h"); + if (f) { + code_node_print(f, hdr, infile); + fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n", + name); + fclose(f); + } else { + fprintf(stderr, "Cannot create %s.h\n", + outfile); + rv |= 4; + } + } + +And if all goes well and an output file was provided, we create the `.c` +file with the code section (if any) and the parser tables and function. + + if (rv == 0 && outfile) { + FILE *f = open_ext(outfile, ".c"); + if (f) { + if (code) + code_node_print(f, code, infile); + gen_parser(f, g, infile, name); + fclose(f); + } else { + fprintf(stderr, "Cannot create %s.c\n", + outfile); + rv |= 4; + } + } + +And that about wraps it up. We need to set the locale so that UTF-8 is +recognised properly, and link with `libicuuc` is `libmdcode` requires that. + +###### File: parsergen.mk + parsergen : parsergen.o libscanner.o libmdcode.o + $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc + +###### includes + #include + +###### main + + int main(int argc, char *argv[]) + { + struct section *s; + int rv = 0; + + setlocale(LC_ALL,""); + + ## process arguments + ## load file + ## process input + + return rv; + } + +## The SHIFT/REDUCE parser + +Having analysed the grammar and generated all the table, we only need the +shift/reduce engine to bring it all together. + +### Goto table lookup + +The parser generator has nicely provided us with goto tables sorted by +symbol number. We need a binary search function to find a symbol in the +table. + +###### parser + + static int search(const struct state *l, int sym) + { + int lo = 0; + int hi = l->go_to_cnt; + + if (hi == 0) + return -1; + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + if (l->go_to[mid].sym <= sym) + lo = mid; + else + hi = mid; + } + if (l->go_to[lo].sym == sym) + return l->go_to[lo].state; + else + return -1; + } + +### The state stack. 
+
+The core data structure for the parser is the stack. This tracks all the
+symbols that have been recognised or partially recognised.
+
+The stack usually won't grow very large - maybe a few tens of entries. So
+we dynamically resize an array as required but never bother to shrink it
+down again.
+
+We keep the stack as two separate allocations. One, `asn_stack` stores the
+"abstract syntax nodes" which are created by each reduction. When we call
+`do_reduce` we need to pass an array of the `asn`s of the body of the
+production, and by keeping a separate `asn` stack, we can just pass a
+pointer into this stack.
+
+The other allocation stores all other stack fields of which there are two.
+The `state` is the most important one and guides the parsing process. The
+`sym` is nearly unnecessary. However when we want to free entries from the
+`asn_stack`, it helps to know what type they are so we can call the right
+freeing function. The symbol leads us to the right free function through
+`do_free`.
+
+###### parser
+
+	struct parser {
+		int state;
+		struct frame {
+			short state;
+			short sym;
+		} *stack;
+		void **asn_stack;
+		int stack_size;
+		int tos;
+	};
+
+#### Shift and pop
+
+Two operations are needed on the stack - shift (which is like push) and pop.
+
+Shift applies not only to terminals but also to non-terminals. When we
+reduce a production we will pop off entries corresponding to the body
+symbols, then push on an item for the head of the production. This last is
+exactly the same process as shifting in a terminal so we use the same
+function for both.
+
+To simplify other code we arrange for `shift` to fail if there is no `goto`
+state for the symbol. This is useful in basic parsing due to our design
+that we shift when we can, and reduce when we cannot. So the `shift`
+function reports if it could.
+
+So `shift` finds the next state. If that succeeds it extends the allocations
+if needed and pushes all the information onto the stacks.
+ +###### parser + + static int shift(struct parser *p, + int sym, void *asn, + const struct state states[]) + { + // Push an entry onto the stack + int newstate = search(&states[p->state], sym); + if (newstate < 0) + return 0; + if (p->tos >= p->stack_size) { + p->stack_size += 10; + p->stack = realloc(p->stack, p->stack_size + * sizeof(p->stack[0])); + p->asn_stack = realloc(p->asn_stack, p->stack_size + * sizeof(p->asn_stack[0])); + } + p->stack[p->tos].state = p->state; + p->stack[p->tos].sym = sym; + p->asn_stack[p->tos] = asn; + p->tos++; + p->state = newstate; + return 1; + } + +`pop` simply moves the top of stack (`tos`) back down the required amount +and frees and `asn` entries that need to be freed. It is called _after_ we +reduce a production, just before we `shift` the nonterminal in. + +###### parser + + static void pop(struct parser *p, int num, + void(*do_free)(short sym, void *asn)) + { + int i; + p->tos -= num; + for (i = 0; i < num; i++) + do_free(p->stack[p->tos+i].sym, + p->asn_stack[p->tos+i]); + + p->state = p->stack[p->tos].state; + } + +### Memory allocation + +The `scanner` returns tokens in a local variable - we want them in allocated +memory so they can live in the `asn_stack`. Similarly the `asn` produce by +a reduce is in a large buffer. Both of these require some allocation and +copying, hence `memdup` and `tokcopy`. + +###### parser includes + #include + +###### parser + + void *memdup(void *m, int len) + { + void *ret; + if (len == 0) + return NULL; + ret = malloc(len); + memcpy(ret, m, len); + return ret; + } + + static struct token *tok_copy(struct token tk) + { + struct token *new = malloc(sizeof(*new)); + *new = tk; + return new; + } + +### The heart of the parser. + +Now we have the parser. If we can shift, we do. If not and we can reduce +we do. If the production we reduced was production zero, then we have +accepted the input and can finish. + +If we can neither shift nor reduce we have an error to handle. 
We pop +single entries off the stack until we can shift the `TK_error` symbol, the +drop input tokens until we find one we can shift into the new error state. + +We return whatever `asn` was returned by reducing production zero. + +###### parser includes + #include "parser.h" +###### parser + void *parser_run(struct token_state *tokens, + const struct state states[], + int (*do_reduce)(int, int, void**, void*, FILE*), + void (*do_free)(short, void*), + FILE *trace) + { + struct parser p = { 0 }; + struct token *tk; + int accepted = 0; + void *ret; + + tk = tok_copy(token_next(tokens)); + while (!accepted) { + if (shift(&p, tk->num, tk, states)) { + if (trace) { + fputs("Shift ", trace); + text_dump(trace, tk->txt, 20); + fputs("\n", trace); + } + tk = tok_copy(token_next(tokens)); + continue; + } + if (states[p.state].reduce_prod >= 0) { + void **body; + int prod = states[p.state].reduce_prod; + int size = states[p.state].reduce_size; + int sym = states[p.state].reduce_sym; + int bufsize; + static char buf[16*1024]; + + body = p.asn_stack + + (p.tos - states[p.state].reduce_size); + + bufsize = do_reduce(prod, p.tos, body, + buf, trace); + if (trace) + fputs("\n", trace); + + pop(&p, size, do_free); + shift(&p, sym, memdup(buf, bufsize), states); + if (prod == 0) + accepted = 1; + continue; + } + /* Error. we walk up the stack until we + * find a state which will accept TK_error. + * We then shift in TK_error and see what state + * that takes us too. + * Then we discard input tokens until + * we find one that is acceptable. 
+ */ + while (shift(&p, TK_error, tk, states) < 0 + && p.tos > 0) + // discard this state + pop(&p, 1, do_free); + tk = tok_copy(*tk); + while (search(&states[p.state], tk->num) < 0 && + tk->num != TK_eof) { + free(tk); + tk = tok_copy(token_next(tokens)); + } + if (p.tos == 0 && tk->num == TK_eof) + break; + } + free(tk); + if (accepted) + ret = p.asn_stack[0]; + else + pop(&p, p.tos, do_free); + free(p.asn_stack); + free(p.stack); + return ret; + } + +###### exported functions + void *parser_run(struct token_state *tokens, + const struct state states[], + int (*do_reduce)(int, int, void**, void*, FILE*), + void (*do_free)(short, void*), + FILE *trace); + + +# A Worked Example + +The obvious example for a parser is a calculator. + +As `scanner` provides number parsing function using `libgmp` is it not much +work to perform arbitrary rational number calculations. + +This calculator takes one expression, or an equality test per line. The +results are printed and in any equality test fails, the program exits with +an error. + +Embedding mdcode inside mdcode is rather horrible. I'd like to find a +better approach, but as the grammar file must have 3 components I need +something like this. + +###### File: parsergen.mk + calc.c : parsergen calc.cgm + ./parsergen -o calc calc.cgm + calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o + $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp + +###### demo grammar + + # header + ~~~~~ + + #include "number.h" + // what do we use for a demo-grammar? A calculator of course. 
+ struct number { + mpq_t val; + char tail[2]; + int err; + }; + + ~~~~~ + # code + ~~~~~ + + #include + #include + #include + #include + #include + #include + #include + #include "mdcode.h" + #include "scanner.h" + #include "number.h" + #include "parser.h" + + #include "calc.h" + + static void free_number(struct number *n) + { + mpq_clear(n->val); + free(n); + } + + int main(int argc, char *argv[]) + { + int fd = open(argv[1], O_RDONLY); + int len = lseek(fd, 0, 2); + char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0); + struct section *s = code_extract(file, file+len, NULL); + struct token_config config = { + .ignored = (1 << TK_line_comment) + | (1 << TK_block_comment) + | (1 << TK_indent) + | (1 << TK_undent), + .number_chars = ".,_+-", + .word_start = "", + .word_cont = "", + }; + parse_calc(s->code, &config, argc > 2 ? stderr : NULL); + exit(0); + } + + ~~~~~ + # grammar + ~~~~~ + + Session -> Session Line + | Line + + Line -> Expression NEWLINE ${ gmp_printf( "Answer = %Qd\n", $1.val); + { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val); + gmp_printf( " or as a decimal: %Fg\n", fl); + mpf_clear(fl); + } + }$ + | Expression = Expression NEWLINE ${ + if (mpq_equal($1.val, $3.val)) + gmp_printf( "Both equal %Qd\n", $1.val); + else { + gmp_printf( "NOT EQUAL: %Qd\n != : %Qd\n", + $1.val, $3.val); + exit(1); + } + }$ + | NEWLINE ${ printf( "Blank line\n"); }$ + | ERROR NEWLINE ${ printf( "Skipped a bad line\n"); }$ + + $number + Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$ + | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$ + | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ + + Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$ + | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$ + | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$ + + Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); 
}$ + | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$ -- 2.43.0