--- /dev/null
+# LR(1) Parser Generator #
+
+This parser generator takes a grammar description combined with code
+fragments, analyses it, and can report details about the analysis and
+write out C code files which can be compiled to make a parser.
+
+There are several distinct sections.
+
+1. `grammar_read` will read a grammar from a literate-code file and
+ store the productions, symbols, and code fragments.
+2. `grammar_analyse` will take that grammar and build LR parsing
+ states and look-ahead tables.
+3. `grammar_report` will present the details of the analysis
+ in a readable format and will report any conflicts.
+4. `parser_generate` will write out C code files with various
+ tables and with the code fragments arranged in useful places.
+5. `parser_run` is a library function intended to be linked together
+ with the generated parser tables to complete the implementation of
+ a parser.
+6. `calc.cgm` is a test grammar for a simple calculator.
+
+###### File: parsergen.c
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ ## includes
+ ## forward declarations
+ ## declarations
+ ## functions
+ ## grammar_read
+ ## grammar_analyse
+ ## grammar_report
+ ## parser_generate
+ ## main
+###### File: parser.h
+ ## exported types
+ ## exported functions
+###### File: libparser.c
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ ## parser includes
+ ## parser
+###### File: calc.cgm
+ ## demo grammar
+###### File: parsergen.mk
+ CFLAGS += -Wall -g
+ all :: parsergen calc
+ parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
+ ./md2c parsergen.mdc
+
+## Reading the grammar
+
+The grammar must be stored in a markdown literate code file as parsed
+by "[mdcode][]". It must have three top level (i.e. unreferenced)
+sections: `header`, `code`, and `grammar`. The first two will be
+literally copied into the generated `.c` and `.h` files. The last
+contains the grammar. This is tokenised with "[scanner][]".
+
+[mdcode]: mdcode.html
+[scanner]: scanner.html
+
+###### includes
+ #include "mdcode.h"
+ #include "scanner.h"
+
+###### parser includes
+ #include "mdcode.h"
+ #include "scanner.h"
+
+The grammar contains production sets, precedence/associativity
+declarations, and data type declarations. These are all parsed with
+_ad hoc_ parsing as we don't have a parser generator yet.
+
+The precedence and associativity can be set for each production, but
+can be inherited from symbols. The data type is potentially defined
+for each non-terminal and describes what C structure is used to store
+information about each symbol.
+
+###### declarations
+ enum assoc {Left, Right, Non};
+ char *assoc_names[] = {"Left","Right","Non"};
+
+ struct symbol {
+ struct text struct_name;
+ enum assoc assoc;
+ unsigned short precedence;
+ ## symbol fields
+ };
+ struct production {
+ unsigned short precedence;
+ enum assoc assoc;
+ ## production fields
+ };
+ struct grammar {
+ ## grammar fields
+ };
+
+The strings reported by `mdcode` and `scanner` are `struct text` which have
+a length rather than being null terminated. To help with printing and
+comparing we define `text_is` and `prtxt`, which should possibly go in
+`mdcode`. `scanner` does provide `text_dump` which is useful for strings
+which might contain control characters.
+
+###### functions
+ static int text_is(struct text t, char *s)
+ {
+ return (strlen(s) == t.len &&
+ strncmp(s, t.txt, t.len) == 0);
+ }
+ static void prtxt(struct text t)
+ {
+ printf("%.*s", t.len, t.txt);
+ }
+
+### Symbols
+
+Productions are composed primarily of symbols - terminal and
+non-terminal. We do not make a syntactic distinction between the two,
+though non-terminals must be identifiers. Non-terminal symbols are
+those which appear in the head of a production, terminal symbols are
+those which don't. There are also "virtual" symbols used for precedence
+marking discussed later, and sometimes we won't know what type a symbol
+is yet.
+
+###### forward declarations
+ enum symtype { Unknown, Virtual, Terminal, Nonterminal };
+ char *symtypes = "UVTN";
+###### symbol fields
+ enum symtype type;
+
+Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
+table of known symbols and the resulting parser will report them as
+`TK_reserved + N`. A small set of identifiers are reserved for the
+different token types that `scanner` can report.
+
+###### declarations
+ static char *reserved_words[] = {
+ [TK_error] = "ERROR",
+ [TK_number] = "NUMBER",
+ [TK_ident] = "IDENTIFIER",
+ [TK_mark] = "MARK",
+ [TK_string] = "STRING",
+ [TK_multi_string] = "MULTI_STRING",
+ [TK_indent] = "INDENT",
+ [TK_undent] = "UNDENT",
+ [TK_newline] = "NEWLINE",
+ [TK_eof] = "$eof",
+ };
+###### symbol fields
+ short num;
+
+Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
+recognised. The former is automatically expected at the end of the text
+being parsed. The latter are always ignored by the parser.
+
+All of these symbols are stored in a simple symbol table. We use the
+`struct text` from `mdcode` to store the name and link them together
+into a sorted list using an insertion sort.
+
+We don't have separate `find` and `insert` functions as any symbol we
+find needs to be remembered. We simply expect `find` to always return a
+symbol, but its type might be `Unknown`.
+
+###### includes
+ #include <string.h>
+
+###### symbol fields
+ struct text name;
+ struct symbol *next;
+
+###### grammar fields
+ struct symbol *syms;
+ int num_syms;
+
+###### functions
+ static int text_cmp(struct text a, struct text b)
+ {
+ int len = a.len;
+ if (a.len > b.len)
+ len = b.len;
+ int cmp = strncmp(a.txt, b.txt, len);
+ if (cmp)
+ return cmp;
+ else
+ return a.len - b.len;
+ }
+
+ static struct symbol *sym_find(struct grammar *g, struct text s)
+ {
+ struct symbol **l = &g->syms;
+ struct symbol *n;
+ int cmp = 1;
+
+ while (*l &&
+ (cmp = text_cmp((*l)->name, s)) < 0)
+ l = & (*l)->next;
+ if (cmp == 0)
+ return *l;
+ n = calloc(1, sizeof(*n));
+ n->name = s;
+ n->type = Unknown;
+ n->next = *l;
+ n->num = -1;
+ *l = n;
+ return n;
+ }
+
+ static void symbols_init(struct grammar *g)
+ {
+ int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
+ int i;
+ for (i = 0; i < entries; i++) {
+ struct text t;
+ struct symbol *s;
+ t.txt = reserved_words[i];
+ if (!t.txt)
+ continue;
+ t.len = strlen(t.txt);
+ s = sym_find(g, t);
+ s->type = Terminal;
+ s->num = i;
+ }
+ }
+
+### Data types and precedence.
+
+Data type specification and precedence specification are both
+introduced by a dollar sign at the start of the line. If the next
+word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, otherwise it specifies a data type.
+
+The data type name is simply stored and applied to the head of all
+subsequent productions. It must be the name of a structure, so `$expr`
+maps to `struct expr`.
+
+The precedence line must contain a list of symbols - typically
+terminal symbols, but not necessarily. It can only contain symbols
+that have not been seen yet, so precedence declarations must precede
+the productions that they affect.
+
+A precedence line may also contain a "virtual" symbol which is an
+identifier preceded by `$$`. Virtual terminals carry precedence
+information but are not included in the grammar. A production can
+declare that it inherits the precedence of a given virtual symbol.
+
+This use for `$$` precludes it from being used as a symbol in the
+described language. Two other symbols: `${` and `}$` are also
+unavailable.
+
+Each new precedence line introduces a new precedence level and
+declares how it associates. This level is stored in each symbol
+listed and may be inherited by any production which uses the symbol. A
+production inherits from the last symbol which has a precedence.
+
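+As an illustrative sketch (not taken from the demo grammar), the first
+line below sets the data type for the heads of subsequent productions to
+`struct expression`, and the next two lines each introduce a new
+left-associative precedence level for the marks they list:
+
+###### Example: dollar lines (sketch)
+    $expression
+    $LEFT + -
+    $LEFT * /
+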
+###### grammar fields
+ struct text current_type;
+ int prec_levels;
+
+###### declarations
+ enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
+ static const char *known[] = { "$$", "${", "}$" };
+
+###### functions
+ static char *dollar_line(struct token_state *ts, struct grammar *g)
+ {
+ struct token t = token_next(ts);
+ char *err;
+ enum assoc assoc;
+ int found;
+
+ if (t.num != TK_ident) {
+ err = "type or assoc expected after '$'";
+ goto abort;
+ }
+ if (text_is(t.txt, "LEFT"))
+ assoc = Left;
+ else if (text_is(t.txt, "RIGHT"))
+ assoc = Right;
+ else if (text_is(t.txt, "NON"))
+ assoc = Non;
+ else {
+ g->current_type = t.txt;
+ t = token_next(ts);
+ if (t.num != TK_newline) {
+ err = "Extra tokens after type name";
+ goto abort;
+ }
+ return NULL;
+ }
+
+ // This is a precedence line, need some symbols.
+ found = 0;
+ g->prec_levels += 1;
+ t = token_next(ts);
+ while (t.num != TK_newline) {
+ enum symtype type = Terminal;
+ struct symbol *s;
+ if (t.num == TK_virtual) {
+ type = Virtual;
+ t = token_next(ts);
+ if (t.num != TK_ident) {
+ err = "$$ must be followed by a word";
+ goto abort;
+ }
+ } else if (t.num != TK_ident &&
+ t.num != TK_mark) {
+ err = "Illegal token in precedence line";
+ goto abort;
+ }
+ s = sym_find(g, t.txt);
+ if (s->type != Unknown) {
+ err = "Symbols in precedence line must not already be known.";
+ goto abort;
+ }
+ s->type = type;
+ s->precedence = g->prec_levels;
+ s->assoc = assoc;
+            found += 1;
+            t = token_next(ts);
+        }
+        if (found == 0) {
+            err = "No symbols given on precedence line";
+            goto abort;
+        }
+        return NULL;
+ abort:
+ while (t.num != TK_newline && t.num != TK_eof)
+ t = token_next(ts);
+ return err;
+ }
+
+### Productions
+
+A production either starts with an identifier which is the head
+non-terminal, or a vertical bar (`|`) in which case this production
+uses the same head as the previous one. The identifier must be
+followed by a `->` mark. All productions for a given non-terminal must
+be grouped together with the `nonterminal ->` given only once.
+
+After this a (possibly empty) sequence of identifiers and marks form
+the body of the production. A virtual symbol may be given after the
+body by preceding it with `$$`. If a virtual symbol is given, the
+precedence of the production is that for the virtual symbol. If none
+is given, the precedence is inherited from the last symbol in the
+production which has a precedence specified.
+
+After the optional precedence may come the `${` mark. This indicates
+the start of a code fragment. If present, this must be on the same
+line as the start of the production.
+
+All of the text from the `${` through to the matching `}$` is
+collected and forms the code-fragment for the production. It must all
+be in one `code_node` of the literate code. The `}$` must be
+at the end of a line.
+
+Text in the code fragment will undergo substitutions where `$N` for
+some numeric `N` will be replaced with a variable holding the parse
+information for the particular symbol in the production. `$0` is the
+head of the production, `$1` is the first symbol of the body, etc.
+The type of `$N` for a terminal symbol is `struct token`. For
+non-terminal, it is whatever has been declared for that symbol.
+
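+The following sketch (again not the demo grammar) shows two productions for
+a hypothetical `expression` non-terminal, assuming it was given the data
+type `$expression` above and that `struct expression` has a `val` field.
+`$0` refers to the head, `$1` and `$3` to the two operand symbols:
+
+###### Example: productions with code fragments (sketch)
+    expression -> expression + expression ${
+            $0.val = $1.val + $3.val;
+        }$
+    | expression * expression ${
+            $0.val = $1.val * $3.val;
+        }$
+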
+While building productions we will need to append to arrays which need to
+grow dynamically.
+
+###### functions
+ static void array_add(void *varray, int *cnt, void *new)
+ {
+ void ***array = varray;
+ int current = 0;
+ const int step = 8;
+ current = ((*cnt-1) | (step-1))+1;
+ if (*cnt == current) {
+ /* must grow */
+ current += step;
+ *array = realloc(*array, current * sizeof(void*));
+ }
+ (*array)[*cnt] = new;
+ (*cnt) += 1;
+ }
+
+Collecting the code fragment simply involves reading tokens until we
+hit the end token `}$`, and noting the character position of the start and
+the end.
+
+###### functions
+ static struct text collect_code(struct token_state *state,
+ struct token start)
+ {
+ struct text code;
+ struct token t;
+ code.txt = start.txt.txt + start.txt.len;
+ do
+ t = token_next(state);
+ while (t.node == start.node &&
+ t.num != TK_close && t.num != TK_error &&
+ t.num != TK_eof);
+ if (t.num == TK_close && t.node == start.node)
+ code.len = t.txt.txt - code.txt;
+ else
+ code.txt = NULL;
+ return code;
+ }
+
+Now we have all the bits we need to parse a full production.
+
+###### includes
+ #include <memory.h>
+
+###### grammar fields
+ struct production **productions;
+ int production_count;
+
+###### production fields
+ struct symbol *head;
+ struct symbol **body;
+ int body_size;
+ struct text code;
+
+###### symbol fields
+ int first_production;
+
+###### functions
+ static char *parse_production(struct grammar *g,
+ struct symbol *head,
+ struct token_state *state)
+ {
+ /* Head has already been parsed. */
+ struct token tk;
+ char *err;
+ struct production p, *pp;
+
+ memset(&p, 0, sizeof(p));
+ p.head = head;
+ tk = token_next(state);
+ while (tk.num == TK_ident || tk.num == TK_mark) {
+ struct symbol *bs = sym_find(g, tk.txt);
+ if (bs->type == Unknown)
+ bs->type = Terminal;
+ if (bs->type == Virtual) {
+ err = "Virtual symbol not permitted in production";
+ goto abort;
+ }
+ if (bs->precedence) {
+ p.precedence = bs->precedence;
+ p.assoc = bs->assoc;
+ }
+ array_add(&p.body, &p.body_size, bs);
+ tk = token_next(state);
+ }
+ if (tk.num == TK_virtual) {
+ struct symbol *vs;
+ tk = token_next(state);
+ if (tk.num != TK_ident) {
+ err = "word required after $$";
+ goto abort;
+ }
+ vs = sym_find(g, tk.txt);
+ if (vs->type != Virtual) {
+ err = "symbol after $$ must be virtual";
+ goto abort;
+ }
+ p.precedence = vs->precedence;
+ p.assoc = vs->assoc;
+ tk = token_next(state);
+ }
+ if (tk.num == TK_open) {
+ p.code = collect_code(state, tk);
+ if (p.code.txt == NULL) {
+ err = "code fragment not closed properly";
+ goto abort;
+ }
+ tk = token_next(state);
+ }
+ if (tk.num != TK_newline && tk.num != TK_eof) {
+ err = "stray tokens at end of line";
+ goto abort;
+ }
+ pp = malloc(sizeof(*pp));
+ *pp = p;
+ array_add(&g->productions, &g->production_count, pp);
+ return NULL;
+ abort:
+ while (tk.num != TK_newline && tk.num != TK_eof)
+ tk = token_next(state);
+ return err;
+ }
+
+With the ability to parse productions and dollar-lines, we have nearly all
+that we need to parse a grammar from a `code_node`.
+
+The head of the first production will effectively be the `start` symbol of
+the grammar. However it won't _actually_ be so. Processing the grammar is
+greatly simplified if the real start symbol only has a single production,
+and expects `$eof` as the final terminal. So when we find the first explicit
+production we insert an extra production as production zero which looks like
+
+###### Example: production 0
+ $start -> START $eof
+
+where `START` is the first non-terminal given.
+
+###### create production zero
+ struct production *p = calloc(1,sizeof(*p));
+ struct text start = {"$start",6};
+ struct text eof = {"$eof",4};
+ p->head = sym_find(g, start);
+ p->head->type = Nonterminal;
+ array_add(&p->body, &p->body_size, head);
+ array_add(&p->body, &p->body_size, sym_find(g, eof));
+ g->start = p->head->num;
+ p->head->first_production = g->production_count;
+ array_add(&g->productions, &g->production_count, p);
+
+Now we are ready to read in the grammar.
+
+###### grammar fields
+ int start; // the 'start' symbol.
+
+###### grammar_read
+ static struct grammar *grammar_read(struct code_node *code)
+ {
+ struct token_config conf = {
+ .word_start = "",
+ .word_cont = "",
+ .words_marks = known,
+ .known_count = sizeof(known)/sizeof(known[0]),
+ .number_chars = "",
+ .ignored = (1 << TK_line_comment)
+ | (1 << TK_block_comment)
+ | (0 << TK_number)
+ | (0 << TK_string)
+ | (1 << TK_multi_string)
+ | (1 << TK_indent)
+ | (1 << TK_undent),
+ };
+
+ struct token_state *state = token_open(code, &conf);
+        struct token tk;
+ struct symbol *head = NULL;
+ struct grammar *g;
+ char *err = NULL;
+
+ g = calloc(1, sizeof(*g));
+ symbols_init(g);
+
+ for (tk = token_next(state); tk.num != TK_eof;
+ tk = token_next(state)) {
+ if (tk.num == TK_newline)
+ continue;
+ if (tk.num == TK_ident) {
+ // new non-terminal
+ head = sym_find(g, tk.txt);
+ if (head->type == Nonterminal)
+                err = "This non-terminal has already been used.";
+ else if (head->type == Virtual)
+ err = "Virtual symbol not permitted in head of production";
+ else {
+ head->type = Nonterminal;
+ head->struct_name = g->current_type;
+ if (g->start == 0) {
+ ## create production zero
+ }
+ head->first_production = g->production_count;
+ tk = token_next(state);
+ if (tk.num == TK_mark &&
+ text_is(tk.txt, "->"))
+ err = parse_production(g, head, state);
+ else
+ err = "'->' missing in production";
+ }
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "|")) {
+ // another production for same non-term
+ if (head)
+ err = parse_production(g, head, state);
+ else
+ err = "First production must have a head";
+ } else if (tk.num == TK_mark
+ && text_is(tk.txt, "$")) {
+ err = dollar_line(state, g);
+ } else {
+ err = "Unrecognised token at start of line.";
+ }
+ if (err)
+ goto abort;
+ }
+ token_close(state);
+ return g;
+ abort:
+ fprintf(stderr, "Error at line %d: %s\n",
+ tk.line, err);
+ token_close(state);
+ free(g);
+ return NULL;
+ }
+
+## Analysing the grammar
+
+The central task in analysing the grammar is to determine a set of
+states to drive the parsing process. This will require finding
+various sets of symbols and of "items". Some of these sets will need
+to attach information to each element in the set, so they are more
+like maps.
+
+Each "item" may have a 'look-ahead' or `LA` set associated with
+it. Many of these will be the same as each other. To avoid memory
+wastage, and to simplify some comparisons of sets, these sets will be
+stored in a list of unique sets, each assigned a number.
+
+Once we have the data structures in hand to manage these sets and
+lists, we can start setting the 'nullable' flag, build the 'FIRST'
+sets, and then create the item sets which define the various states.
+
+### Symbol sets.
+
+Though we don't only store symbols in these sets, they are the main
+things we store, so they are called symbol sets or "symsets".
+
+A symset has a size, an array of shorts, and an optional array of data
+values, which are also shorts. If the array of data is not present,
+we store an impossible pointer, as `NULL` is used to indicate that no
+memory has been allocated yet.
+
+###### declarations
+ struct symset {
+ short cnt;
+ unsigned short *syms, *data;
+ };
+ #define NO_DATA ((unsigned short *)1)
+ const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
+ const struct symset INIT_DATASET = { 0, NULL, NULL };
+
+The arrays of shorts are allocated in blocks of 8 and are kept sorted
+by using an insertion sort. We don't explicitly record the amount of
+allocated space as it can be derived directly from the current `cnt` using
+`((cnt - 1) | 7) + 1`.
+
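+For example, evaluating that expression for a few values of `cnt` gives the
+following allocation sizes (the array always grows in steps of 8):
+
+###### Example: symset allocation sizes
+    cnt:        1   7   8   9   16   17
+    allocated:  8   8   8   16  16   24
+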
+###### functions
+ static void symset_add(struct symset *s, int key, int val)
+ {
+ int i;
+ int current = ((s->cnt-1) | 7) + 1;
+ if (current == s->cnt) {
+ current += 8;
+ s->syms = realloc(s->syms, sizeof(*s->syms) * current);
+ if (s->data != NO_DATA)
+ s->data = realloc(s->data, sizeof(*s->data) * current);
+ }
+ i = s->cnt;
+ while (i > 0 && s->syms[i-1] > key) {
+ s->syms[i] = s->syms[i-1];
+ if (s->data != NO_DATA)
+ s->data[i] = s->data[i-1];
+ i--;
+ }
+ s->syms[i] = key;
+ if (s->data != NO_DATA)
+ s->data[i] = val;
+ s->cnt += 1;
+ }
+
+Finding a symbol (or item) in a `symset` uses a simple binary search.
+We return the index where the value was found (so data can be accessed),
+or `-1` to indicate failure.
+
+ static int symset_find(struct symset *ss, int key)
+ {
+ int lo = 0;
+ int hi = ss->cnt;
+
+ if (hi == 0)
+ return -1;
+ while (lo + 1 < hi) {
+ int mid = (lo + hi) / 2;
+ if (ss->syms[mid] <= key)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (ss->syms[lo] == key)
+ return lo;
+ return -1;
+ }
+
+We will often want to form the union of two symsets. When we do, we
+will often want to know if anything changed (as that might mean there
+is more work to do). So `symset_union` reports whether anything was
+added to the first set. We use a slow quadratic approach as these
+sets don't really get very big. If profiling shows this to be a problem
+it can be optimised later.
+
+ static int symset_union(struct symset *a, struct symset *b)
+ {
+ int i;
+ int added = 0;
+ for (i = 0; i < b->cnt; i++)
+ if (symset_find(a, b->syms[i]) < 0) {
+ int data = 0;
+ if (b->data != NO_DATA)
+ data = b->data[i];
+ symset_add(a, b->syms[i], data);
+ added++;
+ }
+ return added;
+ }
+
+And of course we must be able to free a symset.
+
+ static void symset_free(struct symset ss)
+ {
+ free(ss.syms);
+ if (ss.data != NO_DATA)
+ free(ss.data);
+ }
+
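+As a minimal sketch of how these functions combine (this snippet is not
+part of parsergen itself or of any generated parser):
+
+###### Example: using a symset (sketch)
+    struct symset ss = INIT_SYMSET;
+    int pos;
+
+    symset_add(&ss, 5, 0);       /* values are ignored when data is NO_DATA */
+    symset_add(&ss, 2, 0);
+    pos = symset_find(&ss, 2);   /* 0, as 2 sorts before 5 */
+    symset_free(ss);
+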
+### Symset Storage
+
+Some symsets are simply stored somewhere appropriate in the `grammar`
+data structure, others need a bit of indirection. This applies
+particularly to "LA" sets. These will be paired with "items" in an
+"itemset". As an itemset is itself stored as a symset, the "LA" set needs
+to be stored in the `data` field, so we need a mapping from a small (short)
+number to an LA `symset`.
+
+As mentioned earlier this involves creating a list of unique symsets.
+
+For now, we just use a linear list sorted by insertion. A skiplist
+would be more efficient and may be added later.
+
+###### declarations
+
+ struct setlist {
+ struct symset ss;
+ int num;
+ struct setlist *next;
+ };
+
+###### grammar fields
+ struct setlist *sets;
+ int nextset;
+
+###### functions
+
+ static int ss_cmp(struct symset a, struct symset b)
+ {
+ int i;
+ int diff = a.cnt - b.cnt;
+ if (diff)
+ return diff;
+ for (i = 0; i < a.cnt; i++) {
+ diff = (int)a.syms[i] - (int)b.syms[i];
+ if (diff)
+ return diff;
+ }
+ return 0;
+ }
+
+ static int save_set(struct grammar *g, struct symset ss)
+ {
+ struct setlist **sl = &g->sets;
+ int cmp = 1;
+ struct setlist *s;
+
+ while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
+ sl = & (*sl)->next;
+ if (cmp == 0) {
+ symset_free(ss);
+ return (*sl)->num;
+ }
+
+ s = malloc(sizeof(*s));
+ s->ss = ss;
+ s->num = g->nextset;
+ g->nextset += 1;
+ s->next = *sl;
+ *sl = s;
+ return s->num;
+ }
+
+Finding a set by number is currently performed by a simple linear search.
+If this turns out to hurt performance, we can store the sets in a dynamic
+array like the productions.
+
+ static struct symset set_find(struct grammar *g, int num)
+ {
+ struct setlist *sl = g->sets;
+ while (sl && sl->num != num)
+ sl = sl->next;
+ return sl->ss;
+ }
+
+
+### Setting `nullable`
+
+We set `nullable` on the head symbol for any production for which all
+the body symbols (if any) are nullable. As this is a recursive
+definition, any change in the `nullable` setting means that we need to
+re-evaluate where it needs to be set.
+
+We simply loop around performing the same calculations until no more
+changes happen.
+
+###### symbol fields
+ int nullable;
+
+###### functions
+ static void set_nullable(struct grammar *g)
+ {
+ int check_again = 1;
+ while (check_again) {
+ int p;
+ check_again = 0;
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int s;
+
+ if (pr->head->nullable)
+ continue;
+ for (s = 0; s < pr->body_size; s++)
+ if (! pr->body[s]->nullable)
+ break;
+ if (s == pr->body_size) {
+ pr->head->nullable = 1;
+ check_again = 1;
+ }
+ }
+ }
+ }
+
+### Building the `first` sets
+
+When calculating what can follow a particular non-terminal, we will need to
+know what the "first" terminal in a subsequent non-terminal might be. So
+we calculate the `first` set for every non-terminal and store them in an
+array. We don't bother recording the "first" set for terminals as they are
+trivial.
+
+As the "first" for one symbol might depend on the "first" of another,
+we repeat the calculations until no changes happen, just like with
+`set_nullable`. This makes use of the fact that `symset_union`
+reports if any change happens.
+
+The core of this which finds the "first" of part of a production body
+will be reused for computing the "follow" sets, so we split it out
+into a separate function.
+
+###### grammar fields
+ struct symset *first;
+
+###### functions
+
+ static int add_first(struct production *pr, int start,
+ struct symset *target, struct grammar *g,
+ int *to_end)
+ {
+ int s;
+ int changed = 0;
+ for (s = start; s < pr->body_size; s++) {
+ struct symbol *bs = pr->body[s];
+ if (bs->type == Terminal) {
+ if (symset_find(target, bs->num) < 0) {
+ symset_add(target, bs->num, 0);
+ changed = 1;
+ }
+ break;
+ } else if (symset_union(target, &g->first[bs->num]))
+ changed = 1;
+ if (!bs->nullable)
+ break;
+ }
+ if (to_end)
+ *to_end = (s == pr->body_size);
+ return changed;
+ }
+
+ static void build_first(struct grammar *g)
+ {
+ int check_again = 1;
+ int s;
+ g->first = calloc(g->num_syms, sizeof(g->first[0]));
+ for (s = 0; s < g->num_syms; s++)
+ g->first[s] = INIT_SYMSET;
+
+ while (check_again) {
+ int p;
+ check_again = 0;
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ struct symset *head = &g->first[pr->head->num];
+
+ if (add_first(pr, 0, head, g, NULL))
+ check_again = 1;
+ }
+ }
+ }
+
+### Building the `follow` sets.
+
+There are two different situations when we will want to generate "follow"
+sets. If we are doing an SLR analysis, we want to generate a single
+"follow" set for each non-terminal in the grammar. That is what is
+happening here. If we are doing an LALR or LR analysis we will want
+to generate a separate "LA" set for each item. We do that later
+in state generation.
+
+There are two parts to generating a "follow" set. Firstly we look at
+every place that any non-terminal appears in the body of any
+production, and we find the set of possible "first" symbols after
+there. This is done using `add_first` above and only needs to be done
+once as the "first" sets are now stable and will not change.
+
+###### follow code
+
+ for (p = 0; p < g->production_count; p++) {
+ struct production *pr = g->productions[p];
+ int b;
+
+ for (b = 0; b < pr->body_size - 1; b++) {
+ struct symbol *bs = pr->body[b];
+ if (bs->type == Terminal)
+ continue;
+ add_first(pr, b+1, &g->follow[bs->num], g, NULL);
+ }
+ }
+
+The second part is to add the "follow" set of the head of a production
+to the "follow" sets of the final symbol in the production, and any
+other symbol which is followed only by `nullable` symbols. As this
+depends on "follow" itself we need to repeatedly perform the process
+until no further changes happen.
+
+###### follow code
+
+ for (again = 0, p = 0;
+ p < g->production_count;
+ p < g->production_count-1
+ ? p++ : again ? (again = 0, p = 0)
+ : p++) {
+ struct production *pr = g->productions[p];
+ int b;
+
+ for (b = pr->body_size - 1; b >= 0; b--) {
+ struct symbol *bs = pr->body[b];
+ if (bs->type == Terminal)
+ break;
+ if (symset_union(&g->follow[bs->num],
+ &g->follow[pr->head->num]))
+ again = 1;
+ if (!bs->nullable)
+ break;
+ }
+ }
+
+We now just need to create and initialise the `follow` list to get a
+complete function.
+
+###### grammar fields
+ struct symset *follow;
+
+###### functions
+ static void build_follow(struct grammar *g)
+ {
+ int s, again, p;
+ g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
+ for (s = 0; s < g->num_syms; s++)
+ g->follow[s] = INIT_SYMSET;
+ ## follow code
+ }
+
+### Building itemsets and states
+
+There are three different levels of detail that can be chosen for
+building the itemsets and states for the LR grammar. They are:
+
+1. LR(0), LR(0.5), or SLR(1), where no per-item look-ahead is considered.
+2. LALR(1) where we build look-ahead sets with each item and merge
+ the LA sets when we find two paths to the same "kernel" set of items.
+3. LR(1) where a different look-ahead for any item in the itemset means
+   a different state must be created.
+
+###### forward declarations
+ enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
+
+We need to be able to look through existing states to see if a newly
+generated state already exists. For now we use a simple sorted linked
+list.
+
+An item is a pair of numbers: the production number and the position of
+"DOT", which is an index into the body of the production.
+As the numbers are not enormous we can combine them into a single "short"
+and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
+production number (so 2000 productions with a maximum of 31 symbols in the
+body).
+
+Comparing the itemsets will be a little different to comparing symsets
+as we want to do the lookup after generating the "kernel" of an
+itemset, so we need to ignore the offset=zero items which are added during
+completion.
+
+To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
+the list in the symset, and then only compare items before the first "0".
+
+###### declarations
+ static inline unsigned short item_num(int production, int index)
+ {
+ return production | (((index-1)&0x1f) << 11);
+ }
+ static inline int item_prod(unsigned short item)
+ {
+ return item & 0x7ff;
+ }
+ static inline int item_index(unsigned short item)
+ {
+ return ((item >> 11)+1) & 0x1f;
+ }
+
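+A quick worked example of the encoding (the numbers are arbitrary):
+
+###### Example: item encoding (sketch)
+    unsigned short itm = item_num(3, 2); /* production 3, DOT after 2 symbols */
+    int p  = item_prod(itm);             /* 3 */
+    int bp = item_index(itm);            /* 2 */
+    /* index 0 is stored as all ones, so completion items sort last:
+     * item_num(3, 0) > item_num(3, 2) is true. */
+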
+For LR(1) analysis we need to compare not just the itemset in a state
+but also the LA sets. As we assign each unique LA set a number, we
+can just compare the symset and the data values together.
+
+###### functions
+ static int itemset_cmp(struct symset a, struct symset b,
+ enum grammar_type type)
+ {
+ int i;
+ int av, bv;
+
+ for (i = 0;
+ i < a.cnt && i < b.cnt &&
+ item_index(a.syms[i]) > 0 &&
+ item_index(b.syms[i]) > 0;
+ i++) {
+ int diff = a.syms[i] - b.syms[i];
+ if (diff)
+ return diff;
+ if (type == LR1) {
+ diff = a.data[i] - b.data[i];
+ if (diff)
+ return diff;
+ }
+ }
+ if (i == a.cnt || item_index(a.syms[i]) == 0)
+ av = -1;
+ else
+ av = a.syms[i];
+ if (i == b.cnt || item_index(b.syms[i]) == 0)
+ bv = -1;
+ else
+ bv = b.syms[i];
+ if (av - bv)
+ return av - bv;
+ if (type < LR1 || av == -1)
+ return 0;
+ return
+ a.data[i] - b.data[i];
+ }
+
+And now we can build the list of itemsets. The lookup routine returns
+both a success flag and a pointer to where in the list an insert
+should happen, so we don't need to search a second time.
+
+###### declarations
+ struct itemset {
+ struct itemset *next;
+ short state;
+ struct symset items;
+ struct symset go_to;
+ char completed;
+ };
+
+###### grammar fields
+ struct itemset *items;
+ int states;
+
+###### functions
+ static int itemset_find(struct grammar *g, struct itemset ***where,
+ struct symset kernel, enum grammar_type type)
+ {
+ struct itemset **ip;
+
+ for (ip = &g->items; *ip ; ip = & (*ip)->next) {
+ struct itemset *i = *ip;
+ int diff;
+ diff = itemset_cmp(i->items, kernel, type);
+ if (diff < 0)
+ continue;
+ if (diff > 0)
+ break;
+ /* found */
+ *where = ip;
+ return 1;
+ }
+ *where = ip;
+ return 0;
+ }
+
+Adding an itemset may require merging the LA sets if LALR analysis is
+happening. If any new LA set adds symbols that weren't in the old LA set, we
+clear the `completed` flag so that the dependants of this itemset will be
+recalculated and their LA sets updated.
+
+`add_itemset` must consume the symsets it is passed, either by adding
+them to a data structure, or freeing them.
+
+ static int add_itemset(struct grammar *g, struct symset ss,
+ enum grammar_type type)
+ {
+ struct itemset **where, *is;
+ int i;
+ int found = itemset_find(g, &where, ss, type);
+ if (!found) {
+ is = calloc(1, sizeof(*is));
+ is->state = g->states;
+ g->states += 1;
+ is->items = ss;
+ is->next = *where;
+ is->go_to = INIT_DATASET;
+ *where = is;
+ return is->state;
+ }
+ is = *where;
+ if (type != LALR) {
+ symset_free(ss);
+ return is->state;
+ }
+ for (i = 0; i < ss.cnt; i++) {
+ struct symset temp = INIT_SYMSET, s;
+ if (ss.data[i] == is->items.data[i])
+ continue;
+ s = set_find(g, is->items.data[i]);
+ symset_union(&temp, &s);
+ s = set_find(g, ss.data[i]);
+ if (symset_union(&temp, &s)) {
+ is->items.data[i] = save_set(g, temp);
+ is->completed = 0;
+ } else
+ symset_free(temp);
+ }
+ symset_free(ss);
+ return is->state;
+ }
+
+#### The build
+
+To build all the itemsets, we first insert the initial itemset made from the
+start symbol, complete each itemset, and then generate new itemsets from old
+until no new ones can be made.
+
+Completing an itemset means finding all the items where "DOT" is followed by
+a nonterminal and adding "DOT=0" items for every production from that
+non-terminal - providing each item hasn't already been added.
+
+If LA sets are needed, the LA set for each new item is found using
+`add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
+be supplemented by the LA set for the item which produces the new item.
+
+We also collect a set of all symbols which follow "DOT" (in `done`) as this
+is used in the next stage.
+
+NOTE: precedence handling should happen here - I haven't written this yet
+though.
+
+###### complete itemset
+ for (i = 0; i < is->items.cnt; i++) {
+ int p = item_prod(is->items.syms[i]);
+ int bs = item_index(is->items.syms[i]);
+ struct production *pr = g->productions[p];
+ int p2;
+ struct symbol *s;
+ struct symset LA = INIT_SYMSET;
+ int sn = 0;
+
+ if (bs == pr->body_size)
+ continue;
+ s = pr->body[bs];
+ if (symset_find(&done, s->num) < 0)
+ symset_add(&done, s->num, 0);
+ if (s->type != Nonterminal)
+ continue;
+ again = 1;
+ if (type >= LALR) {
+ // Need the LA set.
+ int to_end;
+ add_first(pr, bs+1, &LA, g, &to_end);
+ if (to_end) {
+ struct symset ss = set_find(g, is->items.data[i]);
+ symset_union(&LA, &ss);
+ }
+ sn = save_set(g, LA);
+ LA = set_find(g, sn);
+ }
+ /* Add productions for this symbol */
+ for (p2 = s->first_production;
+ p2 < g->production_count &&
+ g->productions[p2]->head == s;
+ p2++) {
+ int itm = item_num(p2, 0);
+ int pos = symset_find(&is->items, itm);
+ if (pos < 0) {
+ symset_add(&is->items, itm, sn);
+ /* Will have re-ordered, so start
+ * from beginning again */
+ i = -1;
+ } else if (type >= LALR) {
+ struct symset ss = set_find(g, is->items.data[pos]);
+ struct symset tmp = INIT_SYMSET;
+
+ symset_union(&tmp, &ss);
+ if (symset_union(&tmp, &LA)) {
+ is->items.data[pos] = save_set(g, tmp);
+ i = -1;
+ }else
+ symset_free(tmp);
+ }
+ }
+ }
+
+For each symbol we found (and placed in `done`) we collect all the items for
+which this symbol is next, and create a new itemset with "DOT" advanced over
+the symbol. This is then added to the collection of itemsets (or merged
+with a pre-existing itemset).
+
+###### derive itemsets
+    // Now we have a completed itemset, so we need to
+    // create all the 'next' itemsets and add them to the
+    // collection if they don't already exist.
+ for (i = 0; i < done.cnt; i++) {
+ int j;
+ int state;
+ struct symset newitemset = INIT_SYMSET;
+ if (type >= LALR)
+ newitemset = INIT_DATASET;
+
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+ int la = 0;
+ int pos;
+
+ if (bp == pr->body_size)
+ continue;
+ if (pr->body[bp]->num != done.syms[i])
+ continue;
+ if (type >= LALR)
+ la = is->items.data[j];
+            pos = symset_find(&newitemset, item_num(p, bp+1));
+ if (pos < 0)
+ symset_add(&newitemset, item_num(p, bp+1), la);
+ else if (type >= LALR) {
+ // Need to merge la set.
+ int la2 = newitemset.data[pos];
+ if (la != la2) {
+ struct symset ss = set_find(g, la2);
+ struct symset LA = INIT_SYMSET;
+ symset_union(&LA, &ss);
+ ss = set_find(g, la);
+ if (symset_union(&LA, &ss))
+ newitemset.data[pos] = save_set(g, LA);
+ else
+ symset_free(LA);
+ }
+ }
+ }
+ state = add_itemset(g, newitemset, type);
+ if (symset_find(&is->go_to, done.syms[i]) < 0)
+ symset_add(&is->go_to, done.syms[i], state);
+ }
+
+All that is left is to create the initial itemset from production zero,
+with `TK_eof` as the LA set.
+
+###### functions
+ static void build_itemsets(struct grammar *g, enum grammar_type type)
+ {
+ struct symset first = INIT_SYMSET;
+ struct itemset *is;
+ int again;
+ int la = 0;
+ if (type >= LALR) {
+ // LA set just has eof
+ struct symset eof = INIT_SYMSET;
+ symset_add(&eof, TK_eof, 0);
+ la = save_set(g, eof);
+ first = INIT_DATASET;
+ }
+ // production 0, offset 0 (with no data)
+ symset_add(&first, item_num(0, 0), la);
+ add_itemset(g, first, type);
+ for (again = 0, is = g->items;
+ is;
+ is = is->next ?: again ? (again = 0, g->items) : NULL) {
+ int i;
+ struct symset done = INIT_SYMSET;
+ if (is->completed)
+ continue;
+ is->completed = 1;
+ ## complete itemset
+ ## derive itemsets
+ symset_free(done);
+ }
+ }
+
+### Completing the analysis.
+
+The exact process of analysis depends on which level was requested. For
+`LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
+`LR1` we need first, but not follow. For `SLR` we need both.
+
+We don't build the "action" tables that you might expect as the parser
+doesn't use them. They will be calculated to some extent if needed for
+a report.
+
+Once we have built everything we allocate arrays for the two lists:
+symbols and itemsets. This allows more efficient access during reporting.
+The symbols are grouped as terminals and non-terminals and we record the
+changeover point in `first_nonterm`.
+
+###### grammar fields
+ struct symbol **symtab;
+ struct itemset **statetab;
+ int first_nonterm;
+
+###### grammar_analyse
+
+ static void grammar_analyse(struct grammar *g, enum grammar_type type)
+ {
+ struct symbol *s;
+ struct itemset *is;
+ int snum = TK_reserved;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0 && s->type == Terminal) {
+ s->num = snum;
+ snum++;
+ }
+ g->first_nonterm = snum;
+ for (s = g->syms; s; s = s->next)
+ if (s->num < 0) {
+ s->num = snum;
+ snum++;
+ }
+ g->num_syms = snum;
+ g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
+ for (s = g->syms; s; s = s->next)
+ g->symtab[s->num] = s;
+
+ if (type >= SLR) {
+ set_nullable(g);
+ build_first(g);
+ }
+ if (type == SLR)
+ build_follow(g);
+
+ build_itemsets(g, type);
+
+ g->statetab = calloc(g->states, sizeof(g->statetab[0]));
+ for (is = g->items; is ; is = is->next)
+ g->statetab[is->state] = is;
+ }
+
+## Reporting on the Grammar
+
+The purpose of the report is to give the grammar developer insight into
+how the grammar parser will work. It is basically a structured dump of
+all the tables that have been generated, plus a description of any conflicts.
+
+###### grammar_report
+ static int grammar_report(struct grammar *g, enum grammar_type type)
+ {
+ report_symbols(g);
+ if (g->follow)
+ report_follow(g);
+ report_itemsets(g);
+ return report_conflicts(g, type);
+ }
+
+Firstly we have the complete list of symbols, together with the "FIRST"
+set if that was generated.
+
+###### functions
+
+ static void report_symbols(struct grammar *g)
+ {
+ int n;
+ if (g->first)
+ printf("SYMBOLS + FIRST:\n");
+ else
+ printf("SYMBOLS:\n");
+
+ for (n = 0; n < g->num_syms; n++) {
+ struct symbol *s = g->symtab[n];
+ if (!s)
+ continue;
+
+ printf(" %c%3d%c: ",
+ s->nullable ? '*':' ',
+ s->num, symtypes[s->type]);
+ prtxt(s->name);
+ if (s->precedence)
+ printf(" (%d%s)", s->precedence,
+ assoc_names[s->assoc]);
+
+ if (g->first && s->type == Nonterminal) {
+ int j;
+ char c = ':';
+ for (j = 0; j < g->first[n].cnt; j++) {
+ printf("%c ", c);
+ c = ',';
+ prtxt(g->symtab[g->first[n].syms[j]]->name);
+ }
+ }
+ printf("\n");
+ }
+ }
+
+Then we have the follow sets, if they were computed.
+
+ static void report_follow(struct grammar *g)
+ {
+ int n;
+ printf("FOLLOW:\n");
+ for (n = 0; n < g->num_syms; n++)
+ if (g->follow[n].cnt) {
+ int j;
+ char c = ':';
+ printf(" ");
+ prtxt(g->symtab[n]->name);
+ for (j = 0; j < g->follow[n].cnt; j++) {
+ printf("%c ", c);
+ c = ',';
+ prtxt(g->symtab[g->follow[n].syms[j]]->name);
+ }
+ printf("\n");
+ }
+ }
+
+And finally the item sets. These include the GO TO tables and, for
+LALR and LR1, the LA set for each item. Lots of stuff, so we break
+it up a bit. First the items, with production number, precedence and associativity.
+
+ static void report_item(struct grammar *g, int itm)
+ {
+ int p = item_prod(itm);
+ int dot = item_index(itm);
+ struct production *pr = g->productions[p];
+ int i;
+
+ printf(" ");
+ prtxt(pr->head->name);
+ printf(" ->");
+ for (i = 0; i < pr->body_size; i++) {
+ printf(" %s", (dot == i ? ". ": ""));
+ prtxt(pr->body[i]->name);
+ }
+ if (dot == pr->body_size)
+ printf(" .");
+ printf(" [%d]", p);
+ if (pr->precedence)
+ printf(" (%d%s)", pr->precedence,
+ assoc_names[pr->assoc]);
+ printf("\n");
+ }
+
+The LA sets which are (possibly) reported with each item:
+
+ static void report_la(struct grammar *g, int lanum)
+ {
+ struct symset la = set_find(g, lanum);
+ int i;
+ char c = ':';
+
+ printf(" LOOK AHEAD(%d)", lanum);
+ for (i = 0; i < la.cnt; i++) {
+ printf("%c ", c);
+ c = ',';
+ prtxt(g->symtab[la.syms[i]]->name);
+ }
+ printf("\n");
+ }
+
+Then the go to sets:
+
+
+ static void report_goto(struct grammar *g, struct symset gt)
+ {
+ int i;
+ printf(" GOTO:\n");
+
+ for (i = 0; i < gt.cnt; i++) {
+ printf(" ");
+ prtxt(g->symtab[gt.syms[i]]->name);
+ printf(" -> %d\n", gt.data[i]);
+ }
+ }
+
+Now we can report all the item sets complete with items, LA sets, and GO TO.
+
+ static void report_itemsets(struct grammar *g)
+ {
+ int s;
+ printf("ITEM SETS(%d)\n", g->states);
+ for (s = 0; s < g->states; s++) {
+ int j;
+ struct itemset *is = g->statetab[s];
+ printf(" Itemset %d:\n", s);
+ for (j = 0; j < is->items.cnt; j++) {
+ report_item(g, is->items.syms[j]);
+ if (is->items.data != NO_DATA)
+ report_la(g, is->items.data[j]);
+ }
+ report_goto(g, is->go_to);
+ }
+ }
+
+### Reporting conflicts
+
+Conflict detection varies a lot among different analysis levels. However
+LR0 and LR05 are quite similar - having no follow sets - and SLR, LALR and
+LR1 are also similar as they have FOLLOW or LA sets.
+
+###### functions
+
+ ## conflict functions
+
+ static int report_conflicts(struct grammar *g, enum grammar_type type)
+ {
+ int cnt = 0;
+ printf("Conflicts:\n");
+ if (type < SLR)
+ cnt = conflicts_lr0(g, type);
+ else
+ cnt = conflicts_slr(g, type);
+ if (cnt == 0)
+ printf(" - no conflicts\n");
+ return cnt;
+ }
+
+LR0 conflicts occur in any state which has both a reducible item and
+a shiftable item.
+
+LR05 conflicts only occur if two possible reductions exist,
+as shifts always over-ride reductions.
+
+###### conflict functions
+ static int conflicts_lr0(struct grammar *g, enum grammar_type type)
+ {
+ int i;
+ int cnt = 0;
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ int last_reduce = -1;
+ int prev_reduce = -1;
+ int last_shift = -1;
+ int j;
+ if (!is)
+ continue;
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp == pr->body_size) {
+ prev_reduce = last_reduce;
+ last_reduce = j;
+ continue;
+ }
+ if (pr->body[bp]->type == Terminal)
+ last_shift = j;
+ }
+ if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
+ printf(" State %d has both SHIFT and REDUCE:\n", i);
+ report_item(g, is->items.syms[last_shift]);
+ report_item(g, is->items.syms[last_reduce]);
+ cnt++;
+ }
+ if (prev_reduce >= 0) {
+ printf(" State %d has 2 (or more) reducible items\n", i);
+ report_item(g, is->items.syms[prev_reduce]);
+ report_item(g, is->items.syms[last_reduce]);
+ cnt++;
+ }
+ }
+ return cnt;
+ }
+
+SLR, LALR, and LR1 conflicts happen if two reducible items have overlapping
+look-ahead, or if a symbol in a look-ahead can be shifted. They differ only
+in the source of the look-ahead set.
+
+We build a dataset mapping terminal to item for possible SHIFTs and then
+another for possible REDUCE operations. We report when we get conflicts
+between the two.
+
+ static int conflicts_slr(struct grammar *g, enum grammar_type type)
+ {
+ int i;
+ int cnt = 0;
+
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ struct symset shifts = INIT_DATASET;
+ struct symset reduce = INIT_DATASET;
+ int j;
+ if (!is)
+ continue;
+ /* First collect the shifts */
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp < pr->body_size &&
+ pr->body[bp]->type == Terminal) {
+ /* shiftable */
+ int sym = pr->body[bp]->num;
+ if (symset_find(&shifts, sym) < 0)
+ symset_add(&shifts, sym, itm);
+ }
+ }
+ /* Now look for reduction and conflicts */
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp < pr->body_size)
+ continue;
+ /* reducible */
+ struct symset la;
+ if (type == SLR)
+ la = g->follow[pr->head->num];
+ else
+ la = set_find(g, is->items.data[j]);
+ int k;
+ for (k = 0; k < la.cnt; k++) {
+ int pos = symset_find(&shifts, la.syms[k]);
+ if (pos >= 0) {
+ printf(" State %d has SHIFT/REDUCE conflict on ", i);
+ prtxt(g->symtab[la.syms[k]]->name);
+ printf(":\n");
+ report_item(g, shifts.data[pos]);
+ report_item(g, itm);
+ cnt++;
+ }
+ pos = symset_find(&reduce, la.syms[k]);
+ if (pos < 0) {
+ symset_add(&reduce, la.syms[k], itm);
+ continue;
+ }
+ printf(" State %d has REDUCE/REDUCE conflict on ", i);
+ prtxt(g->symtab[la.syms[k]]->name);
+ printf(":\n");
+ report_item(g, itm);
+ report_item(g, reduce.data[pos]);
+ cnt++;
+ }
+ }
+ symset_free(shifts);
+ symset_free(reduce);
+ }
+ return cnt;
+ }
+
+
+## Generating the parser
+
+The exported part of the parser is the `parse_XX` function, where the name
+`XX` is based on the name of the parser files.
+
+This takes a `code_node`, a partially initialized `token_config`, and an
+optional `FILE` to send tracing to. The `token_config` gets the list of
+known words added and then is used with the `code_node` to initialize the
+scanner.
+
+`parse_XX` then calls the library function `parser_run` to actually complete
+the parse. This needs the `states` table and functions to call the various
+pieces of code provided in the grammar file, so they are generated first.
+
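+As an illustration only, if the output file was named `calc` the generated
+entry point is `parse_calc()`, declared in `calc.h`. Here `code` is assumed
+to be a `struct code_node *` extracted by `mdcode`, and the `token_config`
+fields shown are placeholders which a real caller would set up for the
+language being parsed:
+
+###### Example: calling a generated parser (sketch)
+    struct token_config config = {
+        .word_start = "", .word_cont = "", .number_chars = "",
+    };
+    void *result = parse_calc(code, &config, stderr); /* stderr enables tracing */
+    /* ... use result, then free it with the matching free_XX() ... */
+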
+###### parser_generate
+
+ static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
+ {
+ gen_known(f, g);
+ gen_goto(f, g);
+ gen_states(f, g);
+ gen_reduce(f, g, file);
+ gen_free(f, g);
+
+ fprintf(f, "#line 0 \"gen_parser\"\n");
+ fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
+ name);
+ fprintf(f, "{\n");
+ fprintf(f, "\tstruct token_state *tokens;\n");
+ fprintf(f, "\tconfig->words_marks = known;\n");
+ fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
+ fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
+ fprintf(f, "\ttokens = token_open(code, config);\n");
+ fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
+ fprintf(f, "\ttoken_close(tokens);\n");
+ fprintf(f, "\treturn rv;\n");
+ fprintf(f, "}\n\n");
+ }
+
+### Known words table
+
+The table of known words is simply an array of the terminal symbols.
+
+###### functions
+
+ static void gen_known(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_known\"\n");
+ fprintf(f, "static const char *known[] = {\n");
+ for (i = TK_reserved;
+ i < g->num_syms && g->symtab[i]->type == Terminal;
+ i++)
+ fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
+ g->symtab[i]->name.txt);
+ fprintf(f, "};\n\n");
+ }
+
+### States and the goto tables.
+
+For each state we record the goto table and the reducible production if
+there is one.
+Some of the details of the reducible production are stored in the
+`do_reduce` function to come later. Here we store the production number,
+the body size (useful for stack management) and the resulting symbol (useful
+for knowing how to free data later).
+
+The go to table is stored in a simple array of `sym` and corresponding
+`state`.
+
+###### exported types
+
+ struct lookup {
+ short sym;
+ short state;
+ };
+ struct state {
+ short go_to_cnt;
+ const struct lookup * go_to;
+ short reduce_prod;
+ short reduce_size;
+ short reduce_sym;
+ };
+
+
+###### functions
+
+ static void gen_goto(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_goto\"\n");
+ for (i = 0; i < g->states; i++) {
+ int j;
+ fprintf(f, "static const struct lookup goto_%d[] = {\n",
+ i);
+ struct symset gt = g->statetab[i]->go_to;
+ for (j = 0; j < gt.cnt; j++)
+ fprintf(f, "\t{ %d, %d },\n",
+ gt.syms[j], gt.data[j]);
+ fprintf(f, "};\n");
+ }
+ }
+
+###### functions
+
+ static void gen_states(FILE *f, struct grammar *g)
+ {
+ int i;
+ fprintf(f, "#line 0 \"gen_states\"\n");
+ fprintf(f, "static const struct state states[] = {\n");
+ for (i = 0; i < g->states; i++) {
+ struct itemset *is = g->statetab[i];
+ int j, prod = -1;
+ for (j = 0; j < is->items.cnt; j++) {
+ int itm = is->items.syms[j];
+ int p = item_prod(itm);
+ int bp = item_index(itm);
+ struct production *pr = g->productions[p];
+
+ if (bp < pr->body_size)
+ continue;
+ /* This is what we reduce */
+ prod = p;
+ break;
+ }
+
+ fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
+ i, is->go_to.cnt, i, prod,
+ prod < 0 ? -1 : g->productions[prod]->body_size,
+ prod < 0 ? -1 : g->productions[prod]->head->num);
+ }
+ fprintf(f, "};\n\n");
+ }
+
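+The output produced by `gen_goto` and `gen_states` might look roughly like
+the following (the symbol and state numbers here are invented):
+
+###### Example: generated goto and state tables (sketch)
+    static const struct lookup goto_0[] = {
+        { 4, 1 },   /* on symbol 4, go to state 1 */
+        { 7, 3 },   /* on symbol 7, go to state 3 */
+    };
+    static const struct state states[] = {
+        [0] = { 2, goto_0, -1, -1, -1 },   /* 2 gotos, nothing to reduce */
+        ...
+    };
+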
+### The `do_reduce` function and the code
+
+When the parser engine decides to reduce a production, it calls `do_reduce`.
+This has two functions.
+
+Firstly, if a non-NULL `trace` file is passed, it prints out details of the
+production being reduced. Secondly it runs the code that was included with
+the production if any.
+
+This code needs to be able to store data somewhere. Rather than requiring
+`do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
+`do_reduce` return the size to be saved.
+
+The code fragment requires translation when written out. Any `$N` needs to
+be converted to a reference either to that buffer (if `$0`) or to the
+structure returned by a previous reduction. These pointers need to be cast
+to the appropriate type for each access. All this is handled in
+`gen_code`.
+
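+As a concrete but hypothetical example, for a production
+`expr -> expr + expr` whose head was declared with type `$expr`, a fragment
+in the grammar and the code that `gen_code` writes for it look roughly like:
+
+###### Example: gen_code translation (sketch)
+    /* fragment as written in the grammar: */
+    $0.val = $1.val + $3.val;
+    /* roughly what gen_code emits ($2, the '+', would be a struct token): */
+    (*(struct expr*)ret).val = (*(struct expr*)body[0]).val +
+        (*(struct expr*)body[2]).val;
+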
+
+###### functions
+
+ static void gen_code(struct production *p, FILE *f, struct grammar *g)
+ {
+ char *c;
+ fprintf(f, "\t\t\t");
+ for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
+ int n;
+ if (*c != '$') {
+ fputc(*c, f);
+ if (*c == '\n')
+ fputs("\t\t\t", f);
+ continue;
+ }
+ c++;
+ if (*c < '0' || *c > '9') {
+ fputc(*c, f);
+ continue;
+ }
+ n = *c - '0';
+ while (c[1] >= '0' && c[1] <= '9') {
+ c += 1;
+ n = n * 10 + *c - '0';
+ }
+ if (n == 0)
+ fprintf(f, "(*(struct %.*s*)ret)",
+ p->head->struct_name.len,
+ p->head->struct_name.txt);
+ else if (n > p->body_size)
+ fprintf(f, "$%d", n);
+ else if (p->body[n-1]->type == Terminal)
+ fprintf(f, "(*(struct token *)body[%d])",
+ n-1);
+ else if (p->body[n-1]->struct_name.txt == NULL)
+ fprintf(f, "$%d", n);
+ else
+ fprintf(f, "(*(struct %.*s*)body[%d])",
+ p->body[n-1]->struct_name.len,
+ p->body[n-1]->struct_name.txt, n-1);
+ }
+ fputs("\n", f);
+ }
+
+###### functions
+
+ static void gen_reduce(FILE *f, struct grammar *g, char *file)
+ {
+ int i, j;
+ fprintf(f, "#line 0 \"gen_reduce\"\n");
+ fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
+ fprintf(f, " void *ret, FILE *trace)\n");
+ fprintf(f, "{\n");
+ fprintf(f, "\tint ret_size = 0;\n");
+
+ fprintf(f, "\tswitch(prod) {\n");
+ for (i = 0; i < g->production_count; i++) {
+ struct production *p = g->productions[i];
+ fprintf(f, "\tcase %d:\n", i);
+
+ if (p->code.txt)
+ gen_code(p, f, g);
+
+ fprintf(f, "\t\tif (trace) {\n");
+ fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
+ p->head->name.len, p->head->name.txt);
+ for (j = 0; j < p->body_size; j++) {
+ if (p->body[j]->type == Terminal) {
+ fputs("\t\t\tfputs(\" \", trace);\n", f);
+ fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
+ } else {
+ fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
+ p->body[j]->name.len,
+ p->body[j]->name.txt);
+ }
+ }
+ fprintf(f, "\t\t}\n");
+
+ if (p->head->struct_name.txt)
+ fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
+ p->head->struct_name.len,
+ p->head->struct_name.txt);
+
+ fprintf(f, "\t\tbreak;\n");
+ }
+ fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
+ }
+
+### `do_free`
+
+As each non-terminal can potentially cause a different type of data
+structure to be allocated and filled in, we need to be able to free it when
+done.
+
+It is particularly important to have fine control over freeing during error
+recovery where individual stack frames might need to be freed.
+
+For this, the grammar author is required to define a `free_XX` function for
+each structure that is used by a non-terminal. `do_free` will call whichever
+is appropriate given a symbol number, and will call `free` (as is
+appropriate for tokens) on any terminal symbol.
+
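+A minimal sketch of such a function, assuming some non-terminal was given
+the type `$expr` and that `struct expr` owns no other allocations:
+
+###### Example: a free_XX function (sketch)
+    static void free_expr(struct expr *e)
+    {
+        if (!e)
+            return;
+        /* free any fields that own memory first, then the node itself */
+        free(e);
+    }
+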
+###### functions
+
+ static void gen_free(FILE *f, struct grammar *g)
+ {
+ int i;
+
+ fprintf(f, "#line 0 \"gen_free\"\n");
+ fprintf(f, "static void do_free(short sym, void *asn)\n");
+ fprintf(f, "{\n");
+ fprintf(f, "\tif (!asn) return;\n");
+ fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
+ fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
+ fprintf(f, "\tswitch(sym) {\n");
+
+ for (i = 0; i < g->num_syms; i++) {
+ struct symbol *s = g->symtab[i];
+ if (!s ||
+ s->type != Nonterminal ||
+ s->struct_name.txt == NULL)
+ continue;
+
+ fprintf(f, "\tcase %d:\n", s->num);
+ fprintf(f, "\t\tfree_%.*s(asn);\n",
+ s->struct_name.len,
+ s->struct_name.txt);
+ fprintf(f, "\t\tbreak;\n");
+ }
+ fprintf(f, "\t}\n}\n\n");
+ }
+
+## The main routine.
+
+There are three key parts to the "main" function of parsergen: processing
+the arguments, loading the grammar file, and dealing with the grammar.
+
+### Argument processing.
+
+Command line options allow the selection of analysis mode, name of output
+file, and whether or not a report should be generated. By default we create
+a report only if no code output was requested.
+
+The `parse_XX` function name uses the basename of the output file which
+should not have a suffix (`.c`). `.c` is added to the given name for the
+code, and `.h` is added for the header (if header text is specified in the
+grammar file).
+
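+For example, one plausible invocation would be:
+
+###### Example: invoking parsergen (sketch)
+    ./parsergen --LALR --report -o calc calc.cgm
+
+which performs LALR analysis on `calc.cgm`, prints the report, and writes
+`calc.c` (and `calc.h` if a `header` section is present) containing
+`parse_calc()`.
+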
+###### includes
+ #include <getopt.h>
+
+###### declarations
+ static const struct option long_options[] = {
+ { "LR0", 0, NULL, '0' },
+ { "LR05", 0, NULL, '5' },
+ { "SLR", 0, NULL, 'S' },
+ { "LALR", 0, NULL, 'L' },
+ { "LR1", 0, NULL, '1' },
+ { "report", 0, NULL, 'R' },
+ { "output", 1, NULL, 'o' },
+ { NULL, 0, NULL, 0 }
+ };
+ const char *options = "05SL1Ro:";
+
+###### process arguments
+ int opt;
+ char *outfile = NULL;
+ char *infile;
+ char *name;
+ int report = 1;
+ enum grammar_type type = LR05;
+ while ((opt = getopt_long(argc, argv, options,
+ long_options, NULL)) != -1) {
+ switch(opt) {
+ case '0':
+ type = LR0; break;
+ case '5':
+ type = LR05; break;
+ case 'S':
+ type = SLR; break;
+ case 'L':
+ type = LALR; break;
+ case '1':
+ type = LR1; break;
+ case 'R':
+ report = 2; break;
+ case 'o':
+ outfile = optarg; break;
+ default:
+ fprintf(stderr, "Usage: parsergen ...\n");
+ exit(1);
+ }
+ }
+ if (optind < argc)
+ infile = argv[optind++];
+ else {
+ fprintf(stderr, "No input file given\n");
+ exit(1);
+ }
+ if (outfile && report == 1)
+ report = 0;
+ name = outfile;
+ if (name && strchr(name, '/'))
+ name = strrchr(name, '/')+1;
+
+ if (optind < argc) {
+ fprintf(stderr, "Excess command line arguments\n");
+ exit(1);
+ }
+
+### Loading the grammar file
+
+To be able to run `mdcode` and `scanner` on the grammar we need to memory
+map it.
+
+Once we have extracted the code (with `mdcode`) we expect to find three
+sections: header, code, and grammar. Anything else is an error.
+
+"header" and "code" are optional, though it is hard to build a working
+parser with neither. "grammar" must be provided.
+
+###### includes
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <errno.h>
+
+###### functions
+ static int errs;
+ static void pr_err(char *msg)
+ {
+ errs++;
+ fprintf(stderr, "%s\n", msg);
+ }
+
+###### load file
+ struct section *table;
+ int fd;
+ int len;
+ char *file;
+ fd = open(infile, O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "parsergen: cannot open %s: %s\n",
+ infile, strerror(errno));
+ exit(1);
+ }
+    len = lseek(fd, 0, SEEK_END);
+ file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ table = code_extract(file, file+len, pr_err);
+
+ struct code_node *hdr = NULL;
+ struct code_node *code = NULL;
+ struct code_node *gram = NULL;
+ for (s = table; s; s = s->next) {
+ if (text_is(s->section, "header"))
+ hdr = s->code;
+ else if (text_is(s->section, "code"))
+ code = s->code;
+ else if (text_is(s->section, "grammar"))
+ gram = s->code;
+ else {
+ fprintf(stderr, "Unknown content section: %.*s\n",
+ s->section.len, s->section.txt);
+ rv |= 2;
+ }
+ }
+
+### Processing the input
+
+As we need to append an extension to a filename and then open it for
+writing, and we need to do this twice, it helps to have a separate function.
+
+###### functions
+
+ static FILE *open_ext(char *base, char *ext)
+ {
+ char *fn = malloc(strlen(base) + strlen(ext) + 1);
+ FILE *f;
+ strcat(strcpy(fn, base), ext);
+ f = fopen(fn, "w");
+ free(fn);
+ return f;
+ }
+
+If we can read the grammar, then we analyse and optionally report on it. If
+the report finds conflicts we will exit with an error status. Separate bits
+in the exit status distinguish conflicts, grammar errors, and failures to
+create output files.
+
+###### process input
+ struct grammar *g = NULL;
+ if (gram == NULL) {
+ fprintf(stderr, "No grammar section provided\n");
+ rv |= 2;
+ } else {
+ g = grammar_read(gram);
+ if (!g) {
+ fprintf(stderr, "Failure to parse grammar\n");
+ rv |= 2;
+ }
+ }
+ if (g) {
+ grammar_analyse(g, type);
+ if (report)
+ if (grammar_report(g, type))
+ rv |= 1;
+ }
+
+If a header section is defined, we write it out and include a declaration
+for the `parse_XX` function so it can be used from separate code.
+
+ if (rv == 0 && hdr && outfile) {
+ FILE *f = open_ext(outfile, ".h");
+ if (f) {
+ code_node_print(f, hdr, infile);
+ fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
+ name);
+ fclose(f);
+ } else {
+ fprintf(stderr, "Cannot create %s.h\n",
+ outfile);
+ rv |= 4;
+ }
+ }
+
+And if all goes well and an output file was provided, we create the `.c`
+file with the code section (if any) and the parser tables and function.
+
+ if (rv == 0 && outfile) {
+ FILE *f = open_ext(outfile, ".c");
+ if (f) {
+ if (code)
+ code_node_print(f, code, infile);
+ gen_parser(f, g, infile, name);
+ fclose(f);
+ } else {
+ fprintf(stderr, "Cannot create %s.c\n",
+ outfile);
+ rv |= 4;
+ }
+ }
+
+And that about wraps it up. We need to set the locale so that UTF-8 is
+recognised properly, and link with `libicuuc` as `libmdcode` requires that.
+
+###### File: parsergen.mk
+ parsergen : parsergen.o libscanner.o libmdcode.o
+ $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
+
+###### includes
+ #include <locale.h>
+
+###### main
+
+ int main(int argc, char *argv[])
+ {
+ struct section *s;
+ int rv = 0;
+
+ setlocale(LC_ALL,"");
+
+ ## process arguments
+ ## load file
+ ## process input
+
+ return rv;
+ }
+
+## The SHIFT/REDUCE parser
+
+Having analysed the grammar and generated all the tables, we only need the
+shift/reduce engine to bring it all together.
+
+### Goto table lookup
+
+The parser generator has nicely provided us with goto tables sorted by
+symbol number. We need a binary search function to find a symbol in the
+table.
+
+###### parser
+
+ static int search(const struct state *l, int sym)
+ {
+ int lo = 0;
+ int hi = l->go_to_cnt;
+
+ if (hi == 0)
+ return -1;
+ while (lo + 1 < hi) {
+ int mid = (lo + hi) / 2;
+ if (l->go_to[mid].sym <= sym)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (l->go_to[lo].sym == sym)
+ return l->go_to[lo].state;
+ else
+ return -1;
+ }
+
+### The state stack.
+
+The core data structure for the parser is the stack. This tracks all the
+symbols that have been recognised or partially recognised.
+
+The stack usually won't grow very large - maybe a few tens of entries. So
+we dynamically resize the array as required but never bother to shrink it
+down again.
+
+We keep the stack as two separate allocations. One, `asn_stack`, stores the
+"abstract syntax nodes" which are created by each reduction. When we call
+`do_reduce` we need to pass an array of the `asn`s of the body of the
+production, and by keeping a separate `asn` stack, we can just pass a
+pointer into this stack.
+
+The other allocation stores all other stack fields, of which there are two.
+The `state` is the most important one and guides the parsing process. The
+`sym` is nearly unnecessary. However when we want to free entries from the
+`asn_stack`, it helps to know what type they are so we can call the right
+freeing function. The symbol leads us to the right free function through
+`do_free`.
+
+###### parser
+
+	struct parser {
+		int state;		// current parser state: index into the states array
+		struct frame {
+			short state;	// state that was active when this entry was pushed
+			short sym;	// symbol number, lets do_free pick the right free function
+		} *stack;
+		void **asn_stack;	// abstract syntax node for each stack entry
+		int stack_size;		// allocated entries in both stacks
+		int tos;		// first unused entry: the top of stack
+	};
+
+#### Shift and pop
+
+Two operations are needed on the stack - shift (which is like push) and pop.
+
+Shift applies not only to terminals but also to non-terminals. When we
+reduce a production we will pop off entries corresponding to the body
+symbols, then push on an item for the head of the production. This last is
+exactly the same process as shifting in a terminal so we use the same
+function for both.
+
+To simplify other code we arrange for `shift` to fail if there is no `goto`
+state for the symbol. This is useful in basic parsing due to our design
+that we shift when we can, and reduce when we cannot. So the `shift`
+function reports whether it could.
+
+So `shift` finds the next state. If that succeeds it extends the
+allocations if needed and pushes all the information onto the stacks.
+
+###### parser
+
+ static int shift(struct parser *p,
+ int sym, void *asn,
+ const struct state states[])
+ {
+ // Push an entry onto the stack
+ int newstate = search(&states[p->state], sym);
+ if (newstate < 0)
+ return 0;
+ if (p->tos >= p->stack_size) {
+ p->stack_size += 10;
+ p->stack = realloc(p->stack, p->stack_size
+ * sizeof(p->stack[0]));
+ p->asn_stack = realloc(p->asn_stack, p->stack_size
+ * sizeof(p->asn_stack[0]));
+ }
+ p->stack[p->tos].state = p->state;
+ p->stack[p->tos].sym = sym;
+ p->asn_stack[p->tos] = asn;
+ p->tos++;
+ p->state = newstate;
+ return 1;
+ }
+
+`pop` simply moves the top of stack (`tos`) back down the required amount
+and frees any `asn` entries that need to be freed. It is called _after_ we
+reduce a production, just before we `shift` the nonterminal in.
+
+###### parser
+
+ static void pop(struct parser *p, int num,
+ void(*do_free)(short sym, void *asn))
+ {
+ int i;
+ p->tos -= num;
+ for (i = 0; i < num; i++)
+ do_free(p->stack[p->tos+i].sym,
+ p->asn_stack[p->tos+i]);
+
+ p->state = p->stack[p->tos].state;
+ }
+
+### Memory allocation
+
+The `scanner` returns tokens in a local variable - we want them in allocated
+memory so they can live in the `asn_stack`. Similarly the `asn` produced by
+a reduce is in a large buffer. Both of these require some allocation and
+copying, hence `memdup` and `tok_copy`.
+
+###### parser includes
+ #include <memory.h>
+
+###### parser
+
+ void *memdup(void *m, int len)
+ {
+ void *ret;
+ if (len == 0)
+ return NULL;
+ ret = malloc(len);
+ memcpy(ret, m, len);
+ return ret;
+ }
+
+ static struct token *tok_copy(struct token tk)
+ {
+ struct token *new = malloc(sizeof(*new));
+ *new = tk;
+ return new;
+ }
+
+### The heart of the parser.
+
+Now we have the parser. If we can shift, we do. If not, and we can reduce,
+we do. If the production we reduced was production zero, then we have
+accepted the input and can finish.
+
+If we can neither shift nor reduce we have an error to handle. We pop
+single entries off the stack until we can shift the `TK_error` symbol, then
+drop input tokens until we find one we can shift in the new error state.
+
+We return whatever `asn` was returned by reducing production zero.
+
+###### parser includes
+ #include "parser.h"
+###### parser
+ void *parser_run(struct token_state *tokens,
+ const struct state states[],
+ int (*do_reduce)(int, int, void**, void*, FILE*),
+ void (*do_free)(short, void*),
+ FILE *trace)
+ {
+ struct parser p = { 0 };
+ struct token *tk;
+ int accepted = 0;
+		void *ret = NULL;
+
+ tk = tok_copy(token_next(tokens));
+ while (!accepted) {
+ if (shift(&p, tk->num, tk, states)) {
+ if (trace) {
+ fputs("Shift ", trace);
+ text_dump(trace, tk->txt, 20);
+ fputs("\n", trace);
+ }
+ tk = tok_copy(token_next(tokens));
+ continue;
+ }
+ if (states[p.state].reduce_prod >= 0) {
+ void **body;
+ int prod = states[p.state].reduce_prod;
+ int size = states[p.state].reduce_size;
+ int sym = states[p.state].reduce_sym;
+ int bufsize;
+ static char buf[16*1024];
+
+ body = p.asn_stack +
+ (p.tos - states[p.state].reduce_size);
+
+ bufsize = do_reduce(prod, p.tos, body,
+ buf, trace);
+ if (trace)
+ fputs("\n", trace);
+
+ pop(&p, size, do_free);
+ shift(&p, sym, memdup(buf, bufsize), states);
+ if (prod == 0)
+ accepted = 1;
+ continue;
+ }
+			/* Error. We walk up the stack until we
+			 * find a state which will accept TK_error.
+			 * We then shift in TK_error and see what state
+			 * that takes us to.
+			 * Then we discard input tokens until
+			 * we find one that is acceptable.
+			 */
+			while (shift(&p, TK_error, tk, states) == 0
+			       && p.tos > 0)
+				// discard this state
+				pop(&p, 1, do_free);
+ tk = tok_copy(*tk);
+ while (search(&states[p.state], tk->num) < 0 &&
+ tk->num != TK_eof) {
+ free(tk);
+ tk = tok_copy(token_next(tokens));
+ }
+ if (p.tos == 0 && tk->num == TK_eof)
+ break;
+ }
+ free(tk);
+ if (accepted)
+ ret = p.asn_stack[0];
+ else
+ pop(&p, p.tos, do_free);
+ free(p.asn_stack);
+ free(p.stack);
+ return ret;
+ }
+
+###### exported functions
+ void *parser_run(struct token_state *tokens,
+ const struct state states[],
+ int (*do_reduce)(int, int, void**, void*, FILE*),
+ void (*do_free)(short, void*),
+ FILE *trace);
+
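+To show how these pieces fit together, here is a rough sketch of the sort
+of wrapper the generated code is expected to provide. The table name
+`calc_states` and the callbacks `do_reduce` and `do_free` stand in for
+whatever `gen_parser` actually emits, and `token_open`/`token_close` are
+assumed to be the `scanner` routines for creating and discarding a token
+stream; this is an illustration only, not part of the generated files.
+
+	void *parse_calc(struct code_node *code, struct token_config *config,
+	                 FILE *trace)
+	{
+		// Illustration: turn the extracted code into a token stream
+		// and let parser_run() drive the generated tables and callbacks.
+		struct token_state *tokens = token_open(code, config);
+		void *ret = parser_run(tokens, calc_states,
+				       do_reduce, do_free, trace);
+		token_close(tokens);
+		return ret;
+	}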
+
+# A Worked Example
+
+The obvious example for a parser is a calculator.
+
+As `scanner` provides a number parsing function using `libgmp`, it is not
+much work to perform arbitrary rational number calculations.
+
+This calculator takes one expression, or one equality test, per line. The
+results are printed and if any equality test fails, the program exits with
+an error.
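+
+For example, an input line `3 * (4 + 5)` prints `Answer = 27` (followed by
+a decimal rendering of the same value), while `355/113 = 22/7` reports that
+the two sides are not equal and makes the program exit with an error.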
+
+Embedding mdcode inside mdcode is rather horrible. I'd like to find a
+better approach, but as the grammar file must have 3 components I need
+something like this.
+
+###### File: parsergen.mk
+ calc.c : parsergen calc.cgm
+ ./parsergen -o calc calc.cgm
+ calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
+ $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+
+###### demo grammar
+
+ # header
+ ~~~~~
+
+ #include "number.h"
+ // what do we use for a demo-grammar? A calculator of course.
+ struct number {
+ mpq_t val;
+ char tail[2];
+ int err;
+ };
+
+ ~~~~~
+ # code
+ ~~~~~
+
+ #include <stdlib.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <sys/mman.h>
+ #include <stdio.h>
+ #include <malloc.h>
+ #include <gmp.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ #include "number.h"
+ #include "parser.h"
+
+ #include "calc.h"
+
+ static void free_number(struct number *n)
+ {
+ mpq_clear(n->val);
+ free(n);
+ }
+
+ int main(int argc, char *argv[])
+ {
+ int fd = open(argv[1], O_RDONLY);
+		int len = lseek(fd, 0, SEEK_END);
+ char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ struct section *s = code_extract(file, file+len, NULL);
+ struct token_config config = {
+ .ignored = (1 << TK_line_comment)
+ | (1 << TK_block_comment)
+ | (1 << TK_indent)
+ | (1 << TK_undent),
+ .number_chars = ".,_+-",
+ .word_start = "",
+ .word_cont = "",
+ };
+ parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+ exit(0);
+ }
+
+ ~~~~~
+ # grammar
+ ~~~~~
+
+ Session -> Session Line
+ | Line
+
+ Line -> Expression NEWLINE ${ gmp_printf( "Answer = %Qd\n", $1.val);
+ { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
+ gmp_printf( " or as a decimal: %Fg\n", fl);
+ mpf_clear(fl);
+ }
+ }$
+ | Expression = Expression NEWLINE ${
+ if (mpq_equal($1.val, $3.val))
+ gmp_printf( "Both equal %Qd\n", $1.val);
+ else {
+ gmp_printf( "NOT EQUAL: %Qd\n != : %Qd\n",
+ $1.val, $3.val);
+ exit(1);
+ }
+ }$
+ | NEWLINE ${ printf( "Blank line\n"); }$
+ | ERROR NEWLINE ${ printf( "Skipped a bad line\n"); }$
+
+ $number
+ Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
+ | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
+ | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+
+ Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
+ | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+ | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
+
+ Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
+ | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$