[find the source at parsergen.mdc]

LR(1) Parser Generator

This parser generator takes a grammar description combined with code fragments, analyses it, and can report details about the analysis and write out C code files which can be compiled to make a parser.

There are several distinct sections.

  1. grammar_read will read a grammar from a literate-code file and store the productions, symbols, and code fragments.
  2. grammar_analyse will take that grammar and build LR parsing states and look-ahead tables.
  3. grammar_report will present the details of the analysis in a readable format and will report any conflicts.
  4. parser_generate will write out C code files with various tables and with the code fragments arranged in useful places.
  5. parser_run is a library function intended to be linked together with the generated parser tables to complete the implementation of a parser.
  6. calc.cgm is a test grammar for a simple calculator.
File: parsergen.c
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
## includes
## forward declarations
## declarations
## functions
## grammar_read
## grammar_analyse
## grammar_report
## parser_generate
## main
File: parser.h
## exported types
## exported functions
File: libparser.c
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
## parser includes
## parser functions
## parser
File: calc.cgm
## demo grammar
File: parsergen.mk
CFLAGS += -Wall -g
all :: parsergen calc
parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
    ./md2c parsergen.mdc

Reading the grammar

The grammar must be stored in a markdown literate code file as parsed by "mdcode". It must have three top level (i.e. unreferenced) sections: header, code, and grammar. The first two will be literally copied into the generated .c and .h. files. The last contains the grammar. This is tokenised with "scanner".

includes
#include "mdcode.h"
#include "scanner.h"
parser includes
#include "mdcode.h"
#include "scanner.h"

The grammar contains production sets, precedence/associativity declarations, and data type declarations. These are all parsed with ad hoc parsing as we don't have a parser generator yet.

The precedence and associativity can be set for each production, but can be inherited from symbols. The data type is potentially defined for each non-terminal and describes what C structure is used to store information about each symbol.

declarations
// Associativity of a precedence level; Non means non-associative.
enum assoc {Left, Right, Non};
char *assoc_names[] = {"Left","Right","Non"};

struct symbol {
    // Name of the C struct holding this symbol's parse value;
    // a NULL .txt means the symbol carries no value (see "$void").
    struct text struct_name;
    enum assoc assoc;
    unsigned short precedence;   // 0 means "no precedence assigned"
    ## symbol fields
};
struct production {
    // Inherited from the last body symbol with a precedence, or from
    // a "$$ virtual" override (see parse_production).
    unsigned short precedence;
    enum assoc assoc;
    ## production fields
};
struct grammar {
    ## grammar fields
};

The strings reported by mdcode and scanner are struct text which have a length rather than being null terminated. To help with printing and comparing we define text_is and prtxt, which should possibly go in mdcode. scanner does provide text_dump which is useful for strings which might contain control characters.

functions
/* Test whether the length-delimited text 't' equals the
 * NUL-terminated string 's'. */
static int text_is(struct text t, char *s)
{
    if (strlen(s) != t.len)
        return 0;
    return strncmp(s, t.txt, t.len) == 0;
}
// Print a length-delimited text to stdout (no trailing newline).
static void prtxt(struct text t)
{
    printf("%.*s", t.len, t.txt);
}

Symbols

Productions are comprised primarily of symbols - terminal and non-terminal. We do not make a syntactic distinction between the two, though non-terminals must be identifiers. Non-terminal symbols are those which appear in the head of a production, terminal symbols are those which don't. There are also "virtual" symbols used for precedence marking discussed later, and sometimes we won't know what type a symbol is yet.

forward declarations
// A symbol's role in the grammar; Unknown until first classified.
// 'symtypes' maps each enum value to a one-character display code.
enum symtype { Unknown, Virtual, Terminal, Nonterminal };
char *symtypes = "UVTN";
symbol fields
enum symtype type;

Symbols can be either TK_ident or TK_mark. They are saved in a table of known symbols and the resulting parser will report them as TK_reserved + N. A small set of identifiers are reserved for the different token types that scanner can report.

declarations
// Display names for the token types that scanner can report, indexed
// by token number; symbols_init() enters each as a Terminal with a
// matching 'num'.  Gaps (e.g. comment tokens) are left NULL.
static char *reserved_words[] = {
    [TK_error]        = "ERROR",
    [TK_number]       = "NUMBER",
    [TK_ident]        = "IDENTIFIER",
    [TK_mark]         = "MARK",
    [TK_string]       = "STRING",
    [TK_multi_string] = "MULTI_STRING",
    [TK_indent]       = "INDENT",
    [TK_undent]       = "UNDENT",
    [TK_newline]      = "NEWLINE",
    [TK_eof]          = "$eof",
};
symbol fields
short num;

Note that TK_eof and the two TK_*_comment tokens cannot be recognised. The former is automatically expected at the end of the text being parsed. The latter are always ignored by the parser.

All of these symbols are stored in a simple symbol table. We use the struct text from mdcode to store the name and link them together into a sorted list using an insertion sort.

We don't have separate find and insert functions as any symbol we find needs to be remembered. We simply expect find to always return a symbol, but its type might be Unknown.

includes
#include <string.h>
symbol fields
struct text name;
struct symbol *next;
grammar fields
struct symbol *syms;
int num_syms;
functions
/* Order two length-delimited texts like strcmp: compare the common
 * prefix first; if equal, the shorter text sorts first. */
static int text_cmp(struct text a, struct text b)
{
    int shorter = a.len < b.len ? a.len : b.len;
    int rv = strncmp(a.txt, b.txt, shorter);

    if (rv != 0)
        return rv;
    return a.len - b.len;
}

/* Find 's' in the sorted symbol list, inserting a fresh Unknown
 * symbol at the right place if it isn't there yet.  Always returns
 * a symbol (find and insert are deliberately combined). */
static struct symbol *sym_find(struct grammar *g, struct text s)
{
    struct symbol **linkp = &g->syms;
    struct symbol *sym;
    int cmp = 1;

    for (; *linkp; linkp = &(*linkp)->next) {
        cmp = text_cmp((*linkp)->name, s);
        if (cmp >= 0)
            break;
    }
    if (cmp == 0)
        return *linkp;

    sym = calloc(1, sizeof(*sym));
    sym->name = s;
    sym->type = Unknown;
    sym->num = -1;
    sym->next = *linkp;
    *linkp = sym;
    return sym;
}

/* Seed the symbol table with every reserved token name, each entered
 * as a Terminal whose number equals the scanner's token number. */
static void symbols_init(struct grammar *g)
{
    int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
    int i;

    for (i = 0; i < entries; i++) {
        struct symbol *s;
        struct text t;

        if (!reserved_words[i])
            continue;       /* gap in the designated initialiser */
        t.txt = reserved_words[i];
        t.len = strlen(t.txt);
        s = sym_find(g, t);
        s->type = Terminal;
        s->num = i;
    }
}

Data types and precedence.

Data type specification and precedence specification are both introduced by a dollar sign at the start of the line. If the next word is LEFT, RIGHT or NON, then the line specifies a precedence, otherwise it specifies a data type.

The data type name is simply stored and applied to the head of all subsequent productions. It must be the name of a structure, so $expr maps to struct expr.

Any productions given before the first data type will have no data type and can carry no information. In order to allow other non-terminals to have no type, the data type $void can be given. This does not mean that struct void will be used, but rather that no type will be associated with future non-terminals.

The precedence line must contain a list of symbols - typically terminal symbols, but not necessarily. It can only contain symbols that have not been seen yet, so precedence declaration must precede the productions that they affect.

A precedence line may also contain a "virtual" symbol which is an identifier preceded by $$. Virtual terminals carry precedence information but are not included in the grammar. A production can declare that it inherits the precedence of a given virtual symbol.

This use for $$ precludes it from being used as a symbol in the described language. Two other symbols: ${ and }$ are also unavailable.

Each new precedence line introduces a new precedence level and declares how it associates. This level is stored in each symbol listed and may be inherited by any production which uses the symbol. A production inherits from the last symbol which has a precedence.

grammar fields
struct text current_type;
int prec_levels;
declarations
// Token numbers for the three reserved marks: "$$" introduces a
// virtual symbol, "${" opens a code fragment, "}$" closes it.
enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
static const char *known[] = { "$$", "${", "}$" };
functions
/* Parse a "$" line: either a precedence/associativity declaration
 * ($LEFT, $RIGHT or $NON followed by symbols) or a data type name to
 * apply to the heads of subsequent productions.
 * Returns NULL on success, or a static error message; on error the
 * remainder of the line is consumed so the caller can continue.
 */
static char *dollar_line(struct token_state *ts, struct grammar *g)
{
    struct token t = token_next(ts);
    char *err;
    enum assoc assoc;
    int found;

    if (t.num != TK_ident) {
        err = "type or assoc expected after '$'";
        goto abort;
    }
    if (text_is(t.txt, "LEFT"))
        assoc = Left;
    else if (text_is(t.txt, "RIGHT"))
        assoc = Right;
    else if (text_is(t.txt, "NON"))
        assoc = Non;
    else {
        /* Not an associativity keyword: this is a data type line. */
        g->current_type = t.txt;
        if (text_is(t.txt, "void"))
            /* "$void" means "no type", not "struct void". */
            g->current_type.txt = NULL;
        t = token_next(ts);
        if (t.num != TK_newline) {
            err = "Extra tokens after type name";
            goto abort;
        }
        return NULL;
    }

    // This is a precedence line, need some symbols.
    found = 0;
    g->prec_levels += 1;
    t = token_next(ts);
    while (t.num != TK_newline) {
        enum symtype type = Terminal;
        struct symbol *s;
        if (t.num == TK_virtual) {
            type = Virtual;
            t = token_next(ts);
            if (t.num != TK_ident) {
                err = "$$ must be followed by a word";
                goto abort;
            }
        } else if (t.num != TK_ident &&
                   t.num != TK_mark) {
            err = "Illegal token in precedence line";
            goto abort;
        }
        s = sym_find(g, t.txt);
        if (s->type != Unknown) {
            err = "Symbols in precedence line must not already be known.";
            goto abort;
        }
        s->type = type;
        s->precedence = g->prec_levels;
        s->assoc = assoc;
        found += 1;
        /* FIX: advance to the next token; previously 't' was only
         * advanced in the TK_virtual branch, so any plain symbol
         * made this loop spin forever on the same token. */
        t = token_next(ts);
    }
    if (found == 0) {
        /* FIX: braces added; the unbraced 'goto abort' previously
         * executed unconditionally, aborting every precedence line. */
        err = "No symbols given on precedence line";
        goto abort;
    }
    return NULL;
abort:
    /* Consume the rest of the line so parsing can resume. */
    while (t.num != TK_newline && t.num != TK_eof)
        t = token_next(ts);
    return err;
}

Productions

A production either starts with an identifier which is the head non-terminal, or a vertical bar (|) in which case this production uses the same head as the previous one. The identifier must be followed by a -> mark. All productions for a given non-terminal must be grouped together with the nonterminal -> given only once.

After this a (possibly empty) sequence of identifiers and marks form the body of the production. A virtual symbol may be given after the body by preceding it with $$. If a virtual symbol is given, the precedence of the production is that for the virtual symbol. If none is given, the precedence is inherited from the last symbol in the production which has a precedence specified.

After the optional precedence may come the ${ mark. This indicates the start of a code fragment. If present, this must be on the same line as the start of the production.

All of the text from the ${ through to the matching }$ is collected and forms the code-fragment for the production. It must all be in one code_node of the literate code. The }$ must be at the end of a line.

Text in the code fragment will undergo substitutions where $N for some numeric N will be replaced with a variable holding the parse information for the particular symbol in the production. $0 is the head of the production, $1 is the first symbol of the body, etc. The type of $N for a terminal symbol is struct token. For non-terminal, it is whatever has been declared for that symbol.

While building productions we will need to add to an array which needs to grow dynamically.

functions
/* Append 'new' to a dynamically grown array of pointers.
 * 'varray' points at the array pointer itself; *cnt is the current
 * element count.  Storage is grown in blocks of 'step' pointers
 * whenever the count reaches a multiple of the block size, so the
 * allocated size never needs to be stored. */
static void array_add(void *varray, int *cnt, void *new)
{
    void ***array = varray;
    const int step = 8;
    int allocated = ((*cnt - 1) | (step - 1)) + 1;

    if (allocated == *cnt) {
        /* array is exactly full - grow by one block */
        allocated += step;
        *array = realloc(*array, allocated * sizeof(void*));
    }
    (*array)[*cnt] = new;
    *cnt += 1;
}

Collecting the code fragment simply involves reading tokens until we hit the end token }$, and noting the character position of the start and the end.

functions
/* Collect everything between a "${" token ('start') and the matching
 * "}$", which must lie in the same code node.  Returns the collected
 * text; .txt is NULL if the fragment was not closed properly. */
static struct text collect_code(struct token_state *state,
                                struct token start)
{
    struct token t;
    struct text code;

    code.txt = start.txt.txt + start.txt.len;
    t = token_next(state);
    while (t.node == start.node &&
           t.num != TK_close && t.num != TK_error &&
           t.num != TK_eof)
        t = token_next(state);
    if (t.num != TK_close || t.node != start.node)
        code.txt = NULL;
    else
        code.len = t.txt.txt - code.txt;
    return code;
}

Now we have all the bits we need to parse a full production.

includes
#include <memory.h>
grammar fields
struct production **productions;
int production_count;
production fields
struct symbol  *head;
struct symbol **body;
int             body_size;
struct text     code;
symbol fields
int first_production;
functions
/* Parse one production body; 'head' and the "->" (or "|") have
 * already been consumed.  A body is a sequence of idents and marks,
 * then an optional "$$ virtual" precedence override, then an
 * optional "${ ... }$" code fragment.
 * On success the production is appended to g->productions and NULL
 * is returned; on failure the rest of the line is consumed and a
 * static error message returned.
 */
static char *parse_production(struct grammar *g,
                              struct symbol *head,
                              struct token_state *state)
{
    /* Head has already been parsed. */
    struct token tk;
    char *err;
    struct production p, *pp;

    memset(&p, 0, sizeof(p));
    p.head = head;
    tk = token_next(state);
    while (tk.num == TK_ident || tk.num == TK_mark) {
        struct symbol *bs = sym_find(g, tk.txt);
        if (bs->type == Unknown)
            bs->type = Terminal;
        if (bs->type == Virtual) {
            err = "Virtual symbol not permitted in production";
            goto abort;
        }
        if (bs->precedence) {
            /* inherit from the last body symbol with a precedence */
            p.precedence = bs->precedence;
            p.assoc = bs->assoc;
        }
        array_add(&p.body, &p.body_size, bs);
        tk = token_next(state);
    }
    if (tk.num == TK_virtual) {
        /* "$$ name" overrides any precedence inherited above. */
        struct symbol *vs;
        tk = token_next(state);
        if (tk.num != TK_ident) {
            err = "word required after $$";
            goto abort;
        }
        vs = sym_find(g, tk.txt);
        if (vs->type != Virtual) {
            err = "symbol after $$ must be virtual";
            goto abort;
        }
        p.precedence = vs->precedence;
        p.assoc = vs->assoc;
        tk = token_next(state);
    }
    if (tk.num == TK_open) {
        p.code = collect_code(state, tk);
        if (p.code.txt == NULL) {
            err = "code fragment not closed properly";
            goto abort;
        }
        tk = token_next(state);
    }
    if (tk.num != TK_newline && tk.num != TK_eof) {
        err = "stray tokens at end of line";
        goto abort;
    }
    pp = malloc(sizeof(*pp));
    *pp = p;
    array_add(&g->productions, &g->production_count, pp);
    return NULL;
abort:
    while (tk.num != TK_newline && tk.num != TK_eof)
        tk = token_next(state);
    free(p.body);   /* FIX: don't leak the partially built body */
    return err;
}

With the ability to parse production and dollar-lines, we have nearly all that we need to parse a grammar from a code_node.

The head of the first production will effectively be the start symbol of the grammar. However it won't actually be so. Processing the grammar is greatly simplified if the real start symbol only has a single production, and expects $eof as the final terminal. So when we find the first explicit production we insert an extra production as production zero which looks like

Example: production 0
$start -> START $eof

where START is the first non-terminal given.

create production zero
// Synthesise production zero, "$start -> HEAD $eof", when the first
// real production is seen, so that the true start symbol has exactly
// one production which ends with $eof.
struct production *p = calloc(1,sizeof(*p));
struct text start = {"$start",6};
struct text eof = {"$eof",4};
p->head = sym_find(g, start);
p->head->type = Nonterminal;
array_add(&p->body, &p->body_size, head);  // 'head' is the first real non-terminal
array_add(&p->body, &p->body_size, sym_find(g, eof));
g->start  = p->head->num;  // remember $start's symbol number
p->head->first_production = g->production_count;
array_add(&g->productions, &g->production_count, p);

Now we are ready to read in the grammar.

grammar fields
int start;  // the 'start' symbol.
grammar_read
/* Read a whole grammar from one code node.  Comments, strings and
 * indents are ignored by the tokeniser; each remaining line is a
 * production, a "|" continuation, or a "$" line.
 * Returns the new grammar, or NULL after reporting the error and
 * line number on stderr.
 */
static struct grammar *grammar_read(struct code_node *code)
{
    struct token_config conf = {
        .word_start = "",
        .word_cont = "",
        .words_marks = known,
        .known_count = sizeof(known)/sizeof(known[0]),
        .number_chars = "",
        .ignored = (1 << TK_line_comment)
                 | (1 << TK_block_comment)
                 | (0 << TK_number)
                 | (1 << TK_string)
                 | (1 << TK_multi_string)
                 | (1 << TK_indent)
                 | (1 << TK_undent),
    };

    struct token_state *state = token_open(code, &conf);
    /* FIX: 'tk' was previously initialised with token_next() here,
     * which discarded the first token of the grammar since the for
     * loop below performs the first read itself. */
    struct token tk;
    struct symbol *head = NULL;
    struct grammar *g;
    char *err = NULL;

    g = calloc(1, sizeof(*g));
    symbols_init(g);

    for (tk = token_next(state); tk.num != TK_eof;
         tk = token_next(state)) {
        if (tk.num == TK_newline)
            continue;
        if (tk.num == TK_ident) {
            // new non-terminal
            head = sym_find(g, tk.txt);
            if (head->type == Nonterminal)
                err = "This non-terminal has already been used.";
            else if (head->type == Virtual)
                err = "Virtual symbol not permitted in head of production";
            else {
                head->type = Nonterminal;
                head->struct_name = g->current_type;
                if (g->start == 0) {
                    ## create production zero
                }
                head->first_production = g->production_count;
                tk = token_next(state);
                if (tk.num == TK_mark &&
                    text_is(tk.txt, "->"))
                    err = parse_production(g, head, state);
                else
                    err = "'->' missing in production";
            }
        } else if (tk.num == TK_mark
                   && text_is(tk.txt, "|")) {
            // another production for same non-term
            if (head)
                err = parse_production(g, head, state);
            else
                err = "First production must have a head";
        } else if (tk.num == TK_mark
                   &&  text_is(tk.txt, "$")) {
            err = dollar_line(state, g);
        } else {
            err = "Unrecognised token at start of line.";
        }
        if (err)
            goto abort;
    }
    token_close(state);
    return g;
abort:
    fprintf(stderr, "Error at line %d: %s\n",
            tk.line, err);
    token_close(state);
    free(g);
    return NULL;
}

Analysing the grammar

The central task in analysing the grammar is to determine a set of states to drive the parsing process. This will require finding various sets of symbols and of "items". Some of these sets will need to attach information to each element in the set, so they are more like maps.

Each "item" may have a 'look-ahead' or LA set associated with it. Many of these will be the same as each other. To avoid memory wastage, and to simplify some comparisons of sets, these sets will be stored in a list of unique sets, each assigned a number.

Once we have the data structures in hand to manage these sets and lists, we can start setting the 'nullable' flag, build the 'FIRST' sets, and then create the item sets which define the various states.

Symbol sets.

Though we don't only store symbols in these sets, they are the main things we store, so they are called symbol sets or "symsets".

A symset has a size, an array of shorts, and an optional array of data values, which are also shorts. If the array of data is not present, we store an impossible pointer, as NULL is used to indicate that no memory has been allocated yet;

declarations
struct symset {
    short cnt;                    // number of entries in use
    unsigned short *syms, *data;  // sorted keys; optional per-key values
};
// Marks a set that carries no data array at all; NULL would instead
// mean "data array not yet allocated".
#define NO_DATA ((unsigned short *)1)
const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
const struct symset INIT_DATASET = { 0, NULL, NULL };

The arrays of shorts are allocated in blocks of 8 and are kept sorted by using an insertion sort. We don't explicitly record the amount of allocated space as it can be derived directly from the current cnt using ((cnt - 1) | 7) + 1.

functions
/* Insert 'key' (and 'val', if this set carries data) keeping the
 * syms array sorted.  Storage grows in blocks of 8; the allocated
 * size is recomputed from cnt rather than stored. */
static void symset_add(struct symset *s, unsigned short key, unsigned short val)
{
    int allocated = ((s->cnt - 1) | 7) + 1;
    int pos;

    if (allocated == s->cnt) {
        /* exactly full - grow by one block */
        allocated += 8;
        s->syms = realloc(s->syms, sizeof(*s->syms) * allocated);
        if (s->data != NO_DATA)
            s->data = realloc(s->data, sizeof(*s->data) * allocated);
    }
    /* insertion sort: shift larger keys up one slot */
    for (pos = s->cnt; pos > 0 && s->syms[pos-1] > key; pos -= 1) {
        s->syms[pos] = s->syms[pos-1];
        if (s->data != NO_DATA)
            s->data[pos] = s->data[pos-1];
    }
    s->syms[pos] = key;
    if (s->data != NO_DATA)
        s->data[pos] = val;
    s->cnt += 1;
}

Finding a symbol (or item) in a symset uses a simple binary search. We return the index where the value was found (so data can be accessed), or -1 to indicate failure.

/* Binary search for 'key'; return its index (so any data can be
 * accessed) or -1 if it is not present. */
static int symset_find(struct symset *ss, unsigned short key)
{
    int low = 0;
    int high = ss->cnt;

    if (high == 0)
        return -1;
    /* invariant: if key is present its index lies in [low, high) */
    while (low + 1 < high) {
        int mid = (low + high) / 2;
        if (ss->syms[mid] <= key)
            low = mid;
        else
            high = mid;
    }
    return ss->syms[low] == key ? low : -1;
}

We will often want to form the union of two symsets. When we do, we will often want to know if anything changed (as that might mean there is more work to do). So symset_union reports whether anything was added to the first set. We use a slow quadratic approach as these sets don't really get very big. If profiling shows this to be a problem it can be optimised later.

/* Merge set b into set a, preserving any data values carried by b.
 * Returns the number of symbols actually added, so callers can tell
 * whether anything changed.  Quadratic, but these sets stay small. */
static int symset_union(struct symset *a, struct symset *b)
{
    int added = 0;
    int i;

    for (i = 0; i < b->cnt; i++) {
        unsigned short val;

        if (symset_find(a, b->syms[i]) >= 0)
            continue;
        val = (b->data == NO_DATA) ? 0 : b->data[i];
        symset_add(a, b->syms[i], val);
        added += 1;
    }
    return added;
}

And of course we must be able to free a symset.

/* Release a symset's storage; the data array only if it exists. */
static void symset_free(struct symset ss)
{
    if (ss.data != NO_DATA)
        free(ss.data);
    free(ss.syms);
}

Symset Storage

Some symsets are simply stored somewhere appropriate in the grammar data structure, others need a bit of indirection. This applies particularly to "LA" sets. These will be paired with "items" in an "itemset". As itemsets will be stored in a symset, the "LA" set needs to be stored in the data field, so we need a mapping from a small (short) number to an LA symset.

As mentioned earlier this involves creating a list of unique symsets.

For now, we just use a linear list sorted by insertion. A skiplist would be more efficient and may be added later.

declarations
// One stored (unique) symset: the set itself, its assigned number,
// and the next entry in a list kept sorted by ss_cmp.
struct setlist {
    struct symset ss;
    int num;
    struct setlist *next;
};
grammar fields
struct setlist *sets;
int nextset;
functions
/* Order two symsets: first by size, then lexicographically by
 * symbol number.  Used to keep the stored-set list sorted. */
static int ss_cmp(struct symset a, struct symset b)
{
    int i;

    if (a.cnt != b.cnt)
        return a.cnt - b.cnt;
    for (i = 0; i < a.cnt; i++)
        if (a.syms[i] != b.syms[i])
            return (int)a.syms[i] - (int)b.syms[i];
    return 0;
}

/* Store 'ss' in the sorted list of unique sets and return its
 * number.  Ownership of 'ss' passes to this function: if an equal
 * set is already stored, the duplicate is freed. */
static int save_set(struct grammar *g, struct symset ss)
{
    struct setlist **sl = &g->sets;
    int cmp = 1;
    struct setlist *s;

    while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
        sl = & (*sl)->next;
    if (cmp == 0) {
        /* already present - discard the duplicate */
        symset_free(ss);
        return (*sl)->num;
    }

    s = malloc(sizeof(*s));
    s->ss = ss;
    s->num = g->nextset;
    g->nextset += 1;
    s->next = *sl;
    *sl = s;
    return s->num;
}

Finding a set by number is currently performed by a simple linear search. If this turns out to hurt performance, we can store the sets in a dynamic array like the productions.

/* Return the stored set with the given number (linear search).
 * NOTE(review): assumes 'num' was previously returned by save_set;
 * an unknown number would leave sl NULL and crash on dereference --
 * confirm all callers satisfy this. */
static struct symset set_find(struct grammar *g, int num)
{
    struct setlist *sl = g->sets;
    while (sl && sl->num != num)
        sl = sl->next;
    return sl->ss;
}

Setting nullable

We set nullable on the head symbol for any production for which all the body symbols (if any) are nullable. As this is a recursive definition, any change in the nullable setting means that we need to re-evaluate where it needs to be set.

We simply loop around performing the same calculations until no more changes happen.

symbol fields
int nullable;
functions
/* Mark as nullable every non-terminal with a production whose body
 * symbols are all nullable (an empty body trivially qualifies).
 * Repeat full passes until nothing changes. */
static void set_nullable(struct grammar *g)
{
    int changed = 1;

    while (changed) {
        int p;

        changed = 0;
        for (p = 0; p < g->production_count; p++) {
            struct production *pr = g->productions[p];
            int b = 0;

            if (pr->head->nullable)
                continue;
            while (b < pr->body_size && pr->body[b]->nullable)
                b += 1;
            if (b == pr->body_size) {
                pr->head->nullable = 1;
                changed = 1;
            }
        }
    }
}

Building the first sets

When calculating what can follow a particular non-terminal, we will need to know what the "first" terminal in a subsequent non-terminal might be. So we calculate the first set for every non-terminal and store them in an array. We don't bother recording the "first" set for terminals as they are trivial.

As the "first" for one symbol might depend on the "first" of another, we repeat the calculations until no changes happen, just like with set_nullable. This makes use of the fact that symset_union reports if any change happens.

The core of this which finds the "first" of part of a production body will be reused for computing the "follow" sets, so we split it out into a separate function.

grammar fields
struct symset *first;
functions
/* Add to 'target' the FIRST set of the production body starting at
 * position 'start': each symbol contributes while every earlier
 * symbol scanned is nullable.  If 'to_end' is non-NULL it is set to
 * 1 when the scan reached the end of the body (all symbols from
 * 'start' onwards nullable).  Returns 1 if 'target' grew. */
static int add_first(struct production *pr, int start,
                     struct symset *target, struct grammar *g,
                     int *to_end)
{
    int s;
    int changed = 0;
    for (s = start; s < pr->body_size; s++) {
        struct symbol *bs = pr->body[s];
        if (bs->type == Terminal) {
            /* a terminal is its own FIRST set */
            if (symset_find(target, bs->num) < 0) {
                symset_add(target, bs->num, 0);
                changed = 1;
            }
            break;
        } else if (symset_union(target, &g->first[bs->num]))
            changed = 1;
        if (!bs->nullable)
            break;
    }
    if (to_end)
        *to_end = (s == pr->body_size);
    return changed;
}

/* Compute the FIRST set of every non-terminal, iterating to a fixed
 * point since one FIRST set may feed into another. */
static void build_first(struct grammar *g)
{
    int changed = 1;
    int s;

    g->first = calloc(g->num_syms, sizeof(g->first[0]));
    for (s = 0; s < g->num_syms; s++)
        g->first[s] = INIT_SYMSET;

    while (changed) {
        int p;

        changed = 0;
        for (p = 0; p < g->production_count; p++) {
            struct production *pr = g->productions[p];
            struct symset *head = &g->first[pr->head->num];

            if (add_first(pr, 0, head, g, NULL))
                changed = 1;
        }
    }
}

Building the follow sets.

There are two different situations when we will want to generate "follow" sets. If we are doing an SLR analysis, we want to generate a single "follow" set for each non-terminal in the grammar. That is what is happening here. If we are doing an LALR or LR analysis we will want to generate a separate "LA" set for each item. We do that later in state generation.

There are two parts to generating a "follow" set. Firstly we look at every place that any non-terminal appears in the body of any production, and we find the set of possible "first" symbols after there. This is done using add_first above and only needs to be done once as the "first" sets are now stable and will not change.

follow code
// For each non-terminal occurrence in any production body (except
// the final position), add the FIRST of what follows it to its
// follow set.  FIRST sets are stable by now, so one pass suffices.
for (p = 0; p < g->production_count; p++) {
    struct production *pr = g->productions[p];
    int b;

    for (b = 0; b < pr->body_size - 1; b++) {
        struct symbol *bs = pr->body[b];
        if (bs->type == Terminal)
            continue;
        add_first(pr, b+1, &g->follow[bs->num], g, NULL);
    }
}

The second part is to add the "follow" set of the head of a production to the "follow" sets of the final symbol in the production, and any other symbol which is followed only by nullable symbols. As this depends on "follow" itself we need to repeatedly perform the process until no further changes happen.

follow code
// Propagate each head's follow set into the trailing symbols of its
// body: the last symbol always, and earlier ones while everything
// after them is nullable.  The unusual loop header restarts the scan
// at production 0 whenever the pass just finishing saw a change
// ('again'), so it loops until a complete pass changes nothing.
for (again = 0, p = 0;
     p < g->production_count;
     p < g->production_count-1
        ? p++ : again ? (again = 0, p = 0)
              : p++) {
    struct production *pr = g->productions[p];
    int b;

    for (b = pr->body_size - 1; b >= 0; b--) {
        struct symbol *bs = pr->body[b];
        if (bs->type == Terminal)
            break;
        if (symset_union(&g->follow[bs->num],
                         &g->follow[pr->head->num]))
            again = 1;
        if (!bs->nullable)
            break;
    }
}

We now just need to create and initialise the follow list to get a complete function.

grammar fields
struct symset *follow;
functions
/* Allocate an (initially empty) follow set for every symbol, then
 * run the two "follow code" passes; only non-terminal entries are
 * actually used. */
static void build_follow(struct grammar *g)
{
    int s, again, p;
    g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
    for (s = 0; s < g->num_syms; s++)
        g->follow[s] = INIT_SYMSET;
    ## follow code
}

Building itemsets and states

There are three different levels of detail that can be chosen for building the itemsets and states for the LR grammar. They are:

  1. LR(0) or SLR(1), where no look-ahead is considered.
  2. LALR(1) where we build look-ahead sets with each item and merge the LA sets when we find two paths to the same "kernel" set of items.
  3. LR(1) where different look-ahead for any item in the code means a different state must be created.
forward declarations
enum grammar_type { LR0, LR05, SLR, LALR, LR1 };

We need to be able to look through existing states to see if a newly generated state already exists. For now we use a simple sorted linked list.

An item is a pair of numbers: the production number and the position of "DOT", which is an index into the body of the production. As the numbers are not enormous we can combine them into a single "short" and store them in a symset - 5 bits for "DOT" and 11 bits for the production number (so about 2000 productions with a maximum of 31 symbols in the body).

Comparing the itemsets will be a little different to comparing symsets as we want to do the lookup after generating the "kernel" of an itemset, so we need to ignore the offset=zero items which are added during completion.

To facilitate this, we modify the "DOT" number so that "0" sorts to the end of the list in the symset, and then only compare items before the first "0".

declarations
/* An "item" packs the production number into the low 11 bits and
 * the DOT position into the high 5 bits of an unsigned short.  The
 * DOT index is stored as 31-index, so index 0 sorts to the END of a
 * symset - itemset_cmp relies on that to compare only kernels. */
static inline unsigned short item_num(int production, int index)
{
    return (unsigned short)(((31 - index) << 11) | production);
}
static inline int item_prod(unsigned short item)
{
    /* low 11 bits */
    return item & 0x7ff;
}
static inline int item_index(unsigned short item)
{
    /* undo the 31-index encoding of the high 5 bits */
    return (31 - (item >> 11)) & 0x1f;
}

For LR(1) analysis we need to compare not just the itemset in a state but also the LA sets. As we assign each unique LA set a number, we can just compare the symset and the data values together.

functions
/* Compare two itemsets for the sorted state list.  Only the
 * "kernel" is compared: items with DOT at position zero sort to the
 * end (see item_num), and the scan stops at the first such item on
 * either side.  For full LR(1) the LA-set numbers stored in the
 * data arrays must match too. */
static int itemset_cmp(struct symset a, struct symset b,
                       enum grammar_type type)
{
    int i;
    int av, bv;

    for (i = 0;
         i < a.cnt && i < b.cnt &&
         item_index(a.syms[i]) > 0 &&
         item_index(b.syms[i]) > 0;
         i++) {
        int diff = a.syms[i] - b.syms[i];
        if (diff)
            return diff;
        if (type == LR1) {
            diff = a.data[i] - b.data[i];
            if (diff)
                return diff;
        }
    }
    /* One (or both) kernels ended; treat "ended" as -1 so a shorter
     * kernel sorts first. */
    if (i == a.cnt || item_index(a.syms[i]) == 0)
        av = -1;
    else
        av = a.syms[i];
    if (i == b.cnt || item_index(b.syms[i]) == 0)
        bv = -1;
    else
        bv = b.syms[i];
    if (av - bv)
        return av - bv;
    if (type < LR1 || av == -1)
        return 0;
    /* Both kernels end with an equal item; LR(1) also compares its LA set. */
    return
        a.data[i] - b.data[i];
}

And now we can build the list of itemsets. The lookup routine returns both a success flag and a pointer to where in the list an insert should happen, so we don't need to search a second time.

declarations
struct itemset {
    struct itemset *next;   /* singly-linked list sorted by itemset_cmp */
    short state;            /* state number, assigned at creation */
    struct symset items;    /* items; ->data holds LA-set numbers (LALR/LR1) */
    struct symset go_to;    /* map: symbol number -> successor state number */
    char completed;         /* cleared when LA sets grow, forcing re-completion */
};
grammar fields
struct itemset *items;
int states;
functions
/* Search the sorted itemset list for one whose kernel matches.
 * On success return 1 with *where pointing at the matching link;
 * on failure return 0 with *where at the correct insertion point,
 * so the caller never needs a second search.
 */
static int itemset_find(struct grammar *g, struct itemset ***where,
                        struct symset kernel, enum grammar_type type)
{
    struct itemset **linkp = &g->items;

    while (*linkp) {
        int cmp = itemset_cmp((*linkp)->items, kernel, type);
        if (cmp == 0) {
            /* Exact match found. */
            *where = linkp;
            return 1;
        }
        if (cmp > 0)
            break;              /* passed the insertion point */
        linkp = &(*linkp)->next;
    }
    *where = linkp;
    return 0;
}

Adding an itemset may require merging the LA sets if LALR analysis is happening. If any new LA set adds symbols that weren't in the old LA set, we clear the completed flag so that the dependants of this itemset will be recalculated and their LA sets updated.

add_itemset must consume the symsets it is passed, either by adding them to a data structure, or by freeing them.

/* Add a kernel itemset to the grammar and return its state number.
 * If an equal itemset already exists, its number is returned and for
 * LALR the LA sets are merged into it.  "ss" is consumed: it is
 * either stored in the new itemset or freed here.
 */
static int add_itemset(struct grammar *g, struct symset ss,
                       enum grammar_type type)
{
    struct itemset **where, *is;
    int i;
    int found = itemset_find(g, &where, ss, type);
    if (!found) {
        /* New state: link it in at the sorted insertion point. */
        is = calloc(1, sizeof(*is));
        is->state = g->states;
        g->states += 1;
        is->items = ss;
        is->next = *where;
        is->go_to = INIT_DATASET;
        *where = is;
        return is->state;
    }
    is = *where;
    if (type != LALR) {
        /* Non-LALR: an identical state already exists; discard ss. */
        symset_free(ss);
        return is->state;
    }
    /* LALR: same kernel, so items line up index-for-index; merge each
     * item's LA set.  If anything new arrives, clear "completed" so
     * dependants get recalculated with the enlarged LA sets. */
    for (i = 0; i < ss.cnt; i++) {
        struct symset temp = INIT_SYMSET, s;
        if (ss.data[i] == is->items.data[i])
            continue;
        s = set_find(g, is->items.data[i]);
        symset_union(&temp, &s);
        s = set_find(g, ss.data[i]);
        if (symset_union(&temp, &s)) {
            is->items.data[i] = save_set(g, temp);
            is->completed = 0;
        } else
            symset_free(temp);
    }
    symset_free(ss);
    return is->state;
}

The build

To build all the itemsets, we first insert the initial itemset made from the start symbol, complete each itemset, and then generate new itemsets from old until no new ones can be made.

Completing an itemset means finding all the items where "DOT" is followed by a nonterminal and adding "DOT=0" items for every production from that non-terminal - providing each item hasn't already been added.

If LA sets are needed, the LA set for each new item is found using add_first which was developed earlier for FIRST and FOLLOW. This may be supplemented by the LA set for the item which produced the new item.

We also collect a set of all symbols which follow "DOT" (in done) as this is used in the next stage.

NOTE: precedence handling should happen here - I haven't written this yet though.

complete itemset
/* Closure: for every item with DOT before a nonterminal, add a DOT=0
 * item for each production of that nonterminal.  Every symbol found
 * after DOT is also collected in "done" for the derivation stage. */
for (i = 0; i < is->items.cnt; i++) {
    int p = item_prod(is->items.syms[i]);
    int bs = item_index(is->items.syms[i]);
    struct production *pr = g->productions[p];
    int p2;
    struct symbol *s;
    struct symset LA = INIT_SYMSET;
    unsigned short sn = 0;

    /* DOT at the end: nothing follows it. */
    if (bs == pr->body_size)
        continue;
    s = pr->body[bs];
    /* Record the symbol after DOT for "derive itemsets". */
    if (symset_find(&done, s->num) < 0)
        symset_add(&done, s->num, 0);
    if (s->type != Nonterminal)
        continue;
    again = 1;
    if (type >= LALR) {
        // Need the LA set.
        int to_end;
        add_first(pr, bs+1, &LA, g, &to_end);
        if (to_end) {
            /* Rest of the body can derive empty: inherit this
             * item's own LA set as well. */
            struct symset ss = set_find(g, is->items.data[i]);
            symset_union(&LA, &ss);
        }
        sn = save_set(g, LA);
        LA = set_find(g, sn);
    }
    /* Add productions for this symbol */
    for (p2 = s->first_production;
         p2 < g->production_count &&
          g->productions[p2]->head == s;
         p2++) {
        int itm = item_num(p2, 0);
        int pos = symset_find(&is->items, itm);
        if (pos < 0) {
            symset_add(&is->items, itm, sn);
            /* Will have re-ordered, so start
             * from beginning again */
            i = -1;
        } else if (type >= LALR) {
            /* Item already present: merge the new LA set into its
             * existing one; restart if anything changed. */
            struct symset ss = set_find(g, is->items.data[pos]);
            struct symset tmp = INIT_SYMSET;

            symset_union(&tmp, &ss);
            if (symset_union(&tmp, &LA)) {
                is->items.data[pos] = save_set(g, tmp);
                i = -1;
            }else
                symset_free(tmp);
        }
    }
}

For each symbol we found (and placed in done) we collect all the items for which this symbol is next, and create a new itemset with "DOT" advanced over the symbol. This is then added to the collection of itemsets (or merged with a pre-existing itemset).

derive itemsets
// Now we have a completed itemset, so we need to
// create all the 'next' itemset and create them
// if they don't exist.
for (i = 0; i < done.cnt; i++) {
    int j;
    unsigned short state;
    struct symset newitemset = INIT_SYMSET;
    if (type >= LALR)
        newitemset = INIT_DATASET;

    for (j = 0; j < is->items.cnt; j++) {
        int itm = is->items.syms[j];
        int p = item_prod(itm);
        int bp = item_index(itm);
        struct production *pr = g->productions[p];
        unsigned short la = 0;
        int pos;

        if (bp == pr->body_size)
            continue;
        if (pr->body[bp]->num != done.syms[i])
            continue;
        if (type >= LALR)
            la = is->items.data[j];
        pos = symset_find(&newitemset, pr->head->num);
        if (pos < 0)
            symset_add(&newitemset, item_num(p, bp+1), la);
        else if (type >= LALR) {
            // Need to merge la set.
            int la2 = newitemset.data[pos];
            if (la != la2) {
                struct symset ss = set_find(g, la2);
                struct symset LA = INIT_SYMSET;
                symset_union(&LA, &ss);
                ss = set_find(g, la);
                if (symset_union(&LA, &ss))
                    newitemset.data[pos] = save_set(g, LA);
                else
                    symset_free(LA);
            }
        }
    }
    state = add_itemset(g, newitemset, type);
    if (symset_find(&is->go_to, done.syms[i]) < 0)
        symset_add(&is->go_to, done.syms[i], state);
}

All that is left is to create the initial itemset from production zero, and with TK_eof as the LA set.

functions
/* Build every itemset reachable from the initial one (production 0
 * with DOT at the start and, for LALR/LR1, TK_eof as look-ahead).
 * The traversal keeps re-walking the list until a pass finds no
 * uncompleted itemset, since completion can clear "completed" flags
 * on earlier states (see add_itemset).
 */
static void build_itemsets(struct grammar *g, enum grammar_type type)
{
    struct symset first = INIT_SYMSET;
    struct itemset *is;
    int again;
    unsigned short la = 0;
    if (type >= LALR) {
        // LA set just has eof
        struct symset eof = INIT_SYMSET;
        symset_add(&eof, TK_eof, 0);
        la = save_set(g, eof);
        first = INIT_DATASET;
    }
    // production 0, offset 0  (with no data)
    symset_add(&first, item_num(0, 0), la);
    add_itemset(g, first, type);
    /* The step expression uses the GNU "?:" extension: follow the
     * list, and at the end restart from the head if "again" was set
     * during this pass. */
    for (again = 0, is = g->items;
         is;
         is = is->next ?: again ? (again = 0, g->items) : NULL) {
        int i;
        struct symset done = INIT_SYMSET;
        if (is->completed)
            continue;
        is->completed = 1;
        again = 1;
        ## complete itemset
        ## derive itemsets
        symset_free(done);
    }
}

Completing the analysis.

The exact process of analysis depends on which level was requested. For LR0 and LR05 we don't need first and follow sets at all. For LALR and LR1 we need first, but not follow. For SLR we need both.

We don't build the "action" tables that you might expect as the parser doesn't use them. They will be calculated to some extent if needed for a report.

Once we have built everything we allocate arrays for the two lists: symbols and itemsets. This allows more efficient access during reporting. The symbols are grouped as terminals and non-terminals and we record the changeover point in first_nonterm.

grammar fields
struct symbol **symtab;
struct itemset **statetab;
int first_nonterm;
grammar_analyse
/* Run the analysis appropriate to "type": number all symbols
 * (terminals first, recording the boundary in first_nonterm), build
 * FIRST/FOLLOW as needed, construct the itemsets, and finally build
 * the symtab/statetab arrays for fast indexed access.
 */
static void grammar_analyse(struct grammar *g, enum grammar_type type)
{
    struct symbol *sym;
    struct itemset *is;
    int next_num = TK_reserved;

    /* Terminals get the first numbers after the reserved tokens. */
    for (sym = g->syms; sym; sym = sym->next) {
        if (sym->num >= 0 || sym->type != Terminal)
            continue;
        sym->num = next_num;
        next_num++;
    }
    g->first_nonterm = next_num;
    /* Everything still unnumbered (the nonterminals) comes next. */
    for (sym = g->syms; sym; sym = sym->next) {
        if (sym->num < 0) {
            sym->num = next_num;
            next_num++;
        }
    }
    g->num_syms = next_num;
    g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
    for (sym = g->syms; sym; sym = sym->next)
        g->symtab[sym->num] = sym;

    /* SLR, LALR and LR1 need nullable and FIRST; only SLR needs
     * FOLLOW. */
    if (type >= SLR) {
        set_nullable(g);
        build_first(g);
    }
    if (type == SLR)
        build_follow(g);

    build_itemsets(g, type);

    g->statetab = calloc(g->states, sizeof(g->statetab[0]));
    for (is = g->items; is; is = is->next)
        g->statetab[is->state] = is;
}

Reporting on the Grammar

The purpose of the report is to give the grammar developer insight into how the grammar parser will work. It is basically a structured dump of all the tables that have been generated, plus a description of any conflicts.

grammar_report
static int grammar_report(struct grammar *g, enum grammar_type type)
{
    report_symbols(g);
    if (g->follow)
        report_follow(g);
    report_itemsets(g);
    return report_conflicts(g, type);
}

Firstly we have the complete list of symbols, together with the "FIRST" set if that was generated.

functions
/* List every symbol with its number, type, nullability marker and
 * precedence, plus its FIRST set when that has been computed. */
static void report_symbols(struct grammar *g)
{
    int i;

    printf(g->first ? "SYMBOLS + FIRST:\n" : "SYMBOLS:\n");

    for (i = 0; i < g->num_syms; i++) {
        struct symbol *sym = g->symtab[i];

        if (sym == NULL)
            continue;

        printf(" %c%3d%c: ",
               sym->nullable ? '*' : ' ',
               sym->num, symtypes[sym->type]);
        prtxt(sym->name);
        if (sym->precedence)
            printf(" (%d%s)", sym->precedence,
                   assoc_names[sym->assoc]);

        if (g->first && sym->type == Nonterminal) {
            char sep = ':';
            int j;
            for (j = 0; j < g->first[i].cnt; j++) {
                printf("%c ", sep);
                sep = ',';
                prtxt(g->symtab[g->first[i].syms[j]]->name);
            }
        }
        printf("\n");
    }
}

Then we have the follow sets, if they were computed.

/* Print each symbol's FOLLOW set, skipping empty ones. */
static void report_follow(struct grammar *g)
{
    int i, j;
    printf("FOLLOW:\n");
    for (i = 0; i < g->num_syms; i++) {
        char sep = ':';
        if (g->follow[i].cnt == 0)
            continue;
        printf("  ");
        prtxt(g->symtab[i]->name);
        for (j = 0; j < g->follow[i].cnt; j++) {
            printf("%c ", sep);
            sep = ',';
            prtxt(g->symtab[g->follow[i].syms[j]]->name);
        }
        printf("\n");
    }
}

And finally the item sets. These include the GO TO tables and, for LALR and LR1, the LA set for each item. Lots of stuff, so we break it up a bit. First the items, with production number and associativity.

/* Print one item: head, body with ". " marking DOT, production
 * number in brackets, and precedence/associativity when present. */
static void report_item(struct grammar *g, int itm)
{
    int prod_num = item_prod(itm);
    int dot = item_index(itm);
    struct production *pr = g->productions[prod_num];
    int i;

    printf("    ");
    prtxt(pr->head->name);
    printf(" ->");
    for (i = 0; i < pr->body_size; i++) {
        printf(" %s", i == dot ? ". " : "");
        prtxt(pr->body[i]->name);
    }
    if (dot == pr->body_size)
        printf(" .");
    printf(" [%d]", prod_num);
    if (pr->precedence)
        printf(" (%d%s)", pr->precedence,
               assoc_names[pr->assoc]);
    printf("\n");
}

The LA sets which are (possibly) reported with each item:

/* Print the look-ahead set with the given saved-set number. */
static void report_la(struct grammar *g, int lanum)
{
    struct symset la = set_find(g, lanum);
    char sep = ':';
    int i;

    printf("        LOOK AHEAD(%d)", lanum);
    for (i = 0; i < la.cnt; i++) {
        printf("%c ", sep);
        sep = ',';
        prtxt(g->symtab[la.syms[i]]->name);
    }
    printf("\n");
}

Then the go to sets:

/* Print a state's GOTO table: each symbol and its target state. */
static void report_goto(struct grammar *g, struct symset gt)
{
    int idx;

    printf("    GOTO:\n");
    for (idx = 0; idx < gt.cnt; idx++) {
        printf("      ");
        prtxt(g->symtab[gt.syms[idx]]->name);
        printf(" -> %d\n", gt.data[idx]);
    }
}

Now we can report all the item sets complete with items, LA sets, and GO TO.

/* Print every itemset: its items (with LA sets when the symset
 * carries data) followed by its GOTO table. */
static void report_itemsets(struct grammar *g)
{
    int st;
    printf("ITEM SETS(%d)\n", g->states);
    for (st = 0; st < g->states; st++) {
        struct itemset *is = g->statetab[st];
        int i;

        printf("  Itemset %d:\n", st);
        for (i = 0; i < is->items.cnt; i++) {
            report_item(g, is->items.syms[i]);
            if (is->items.data != NO_DATA)
                report_la(g, is->items.data[i]);
        }
        report_goto(g, is->go_to);
    }
}

Reporting conflicts

Conflict detection varies a lot among different analysis levels. However LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and LR1 are also similar as they have FOLLOW or LA sets.

functions
## conflict functions

/* Dispatch conflict reporting by analysis level and return the
 * number of conflicts found. */
static int report_conflicts(struct grammar *g, enum grammar_type type)
{
    int cnt;

    printf("Conflicts:\n");
    cnt = type < SLR ? conflicts_lr0(g, type) : conflicts_slr(g, type);
    if (cnt == 0)
        printf(" - no conflicts\n");
    return cnt;
}

LR0 conflicts are any state which have both a reducible item and a shiftable item.

LR05 conflicts only occur if two possible reductions exist, as shifts always over-ride reductions.

conflict functions
/* Report conflicts for LR0/LR05.  A shift/reduce clash is only a
 * conflict for pure LR0; two (or more) reducible items in one state
 * is a conflict for both.  Returns the number of conflicts. */
static int conflicts_lr0(struct grammar *g, enum grammar_type type)
{
    int cnt = 0;
    int i, j;

    for (i = 0; i < g->states; i++) {
        struct itemset *is = g->statetab[i];
        int last_reduce = -1;
        int prev_reduce = -1;
        int last_shift = -1;

        if (!is)
            continue;
        for (j = 0; j < is->items.cnt; j++) {
            int itm = is->items.syms[j];
            int bp = item_index(itm);
            struct production *pr = g->productions[item_prod(itm)];

            if (bp == pr->body_size) {
                /* Reducible: remember the last two seen. */
                prev_reduce = last_reduce;
                last_reduce = j;
            } else if (pr->body[bp]->type == Terminal)
                last_shift = j;
        }
        if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
            printf("  State %d has both SHIFT and REDUCE:\n", i);
            report_item(g, is->items.syms[last_shift]);
            report_item(g, is->items.syms[last_reduce]);
            cnt++;
        }
        if (prev_reduce >= 0) {
            printf("  State %d has 2 (or more) reducible items\n", i);
            report_item(g, is->items.syms[prev_reduce]);
            report_item(g, is->items.syms[last_reduce]);
            cnt++;
        }
    }
    return cnt;
}

SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping look ahead, or if a symbol in a look-ahead can be shifted. They differ only in the source of the look ahead set.

We build a dataset mapping terminal to item for possible SHIFTs and then another for possible REDUCE operations. We report when we get conflicts between the two.

/* Report conflicts for SLR, LALR and LR1.  For each state, first
 * build a dataset mapping each shiftable terminal to the item that
 * shifts it, then test every reducible item's look-ahead (FOLLOW of
 * the head for SLR, the stored LA set otherwise) against the shifts
 * (SHIFT/REDUCE) and against earlier reductions (REDUCE/REDUCE).
 * Returns the number of conflicts reported.
 */
static int conflicts_slr(struct grammar *g, enum grammar_type type)
{
    int i;
    int cnt = 0;

    for (i = 0; i < g->states; i++) {
        struct itemset *is = g->statetab[i];
        struct symset shifts = INIT_DATASET;
        struct symset reduce = INIT_DATASET;
        int j;
        if (!is)
            continue;
        /* First collect the shifts */
        for (j = 0; j < is->items.cnt; j++) {
            unsigned short itm = is->items.syms[j];
            int p = item_prod(itm);
            int bp = item_index(itm);
            struct production *pr = g->productions[p];

            if (bp < pr->body_size &&
                pr->body[bp]->type == Terminal) {
                /* shiftable */
                int sym = pr->body[bp]->num;
                if (symset_find(&shifts, sym) < 0)
                    symset_add(&shifts, sym, itm);
            }
        }
        /* Now look for reduction and conflicts */
        for (j = 0; j < is->items.cnt; j++) {
            unsigned short itm = is->items.syms[j];
            int p = item_prod(itm);
            int bp = item_index(itm);
            struct production *pr = g->productions[p];

            if (bp < pr->body_size)
                continue;
            /* reducible */
            struct symset la;
            if (type == SLR)
                la = g->follow[pr->head->num];
            else
                la = set_find(g, is->items.data[j]);
            int k;
            for (k = 0; k < la.cnt; k++) {
                /* Look-ahead symbol that is also shiftable here:
                 * SHIFT/REDUCE conflict. */
                int pos = symset_find(&shifts, la.syms[k]);
                if (pos >= 0) {
                    printf("  State %d has SHIFT/REDUCE conflict on ", i);
                    prtxt(g->symtab[la.syms[k]]->name);
                    printf(":\n");
                    report_item(g, shifts.data[pos]);
                    report_item(g, itm);
                    cnt++;
                }
                /* Same symbol already claimed by another reduction:
                 * REDUCE/REDUCE conflict. */
                pos = symset_find(&reduce, la.syms[k]);
                if (pos < 0) {
                    symset_add(&reduce, la.syms[k], itm);
                    continue;
                }
                printf("  State %d has REDUCE/REDUCE conflict on ", i);
                prtxt(g->symtab[la.syms[k]]->name);
                printf(":\n");
                report_item(g, itm);
                report_item(g, reduce.data[pos]);
                cnt++;
            }
        }
        symset_free(shifts);
        symset_free(reduce);
    }
    return cnt;
}

Generating the parser

The exported part of the parser is the parse_XX function, where the name XX is based on the name of the parser files.

This takes a code_node, a partially initialized token_config, and an optional FILE to send tracing to. The token_config gets the list of known words added and then is used with the code_node to initialize the scanner.

parse_XX then calls the library function parser_run to actually complete the parse. This needs the states table and a function to call the various pieces of code provided in the grammar file, so they are generated first.

parser_generate
/* Write out the complete generated parser: the supporting tables
 * (known terminals, nonterminal names, goto tables, state table)
 * and callbacks (do_reduce, do_free), followed by the exported
 * parse_XX() entry point.  "name" supplies the XX suffix; "file"
 * is forwarded to gen_reduce.
 */
static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
{
    gen_known(f, g);
    gen_non_term(f, g);
    gen_goto(f, g);
    gen_states(f, g);
    gen_reduce(f, g, file);
    gen_free(f, g);

    /* #line markers attribute generated code in compiler messages. */
    fprintf(f, "#line 0 \"gen_parser\"\n");
    fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
            name);
    fprintf(f, "{\n");
    fprintf(f, "\tstruct token_state *tokens;\n");
    fprintf(f, "\tconfig->words_marks = known;\n");
    fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
    fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
    fprintf(f, "\ttokens = token_open(code, config);\n");
    fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
    fprintf(f, "\ttoken_close(tokens);\n");
    fprintf(f, "\treturn rv;\n");
    fprintf(f, "}\n\n");
}

Table words table

The known words table is simply an array of terminal symbols. The table of nonterminals used for tracing is a similar array.

functions
/* Emit the "known" array: the names of all terminal symbols, which
 * occupy a contiguous range starting at TK_reserved. */
static void gen_known(FILE *f, struct grammar *g)
{
    int i = TK_reserved;

    fprintf(f, "#line 0 \"gen_known\"\n");
    fprintf(f, "static const char *known[] = {\n");
    while (i < g->num_syms && g->symtab[i]->type == Terminal) {
        fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
            g->symtab[i]->name.txt);
        i++;
    }
    fprintf(f, "};\n\n");
}

static void gen_non_term(FILE *f, struct grammar *g)
{
    int i;
    fprintf(f, "#line 0 \"gen_non_term\"\n");
    fprintf(f, "static const char *non_term[] = {\n");
    for (i = TK_reserved;
         i < g->num_syms;
         i++)
        if (g->symtab[i]->type == Nonterminal)
            fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                g->symtab[i]->name.txt);
    fprintf(f, "};\n\n");
}

States and the goto tables.

For each state we record the goto table and the reducible production if there is one. Some of the details of the reducible production are stored in the do_reduce function to come later. Here we store the production number, the body size (useful for stack management) and the resulting symbol (useful for knowing how to free data later).

The go to table is stored in a simple array of sym and corresponding state.

exported types
struct lookup {
    short sym;      /* symbol number to look up */
    short state;    /* state to move to when sym is seen */
};
struct state {
    short go_to_cnt;            /* number of entries in go_to */
    const struct lookup * go_to;/* sorted by sym for binary search */
    short reduce_prod;          /* production to reduce, or -1 */
    short reduce_size;          /* body length, for popping the stack */
    short reduce_sym;           /* resulting symbol, used for freeing */
};
functions
/* Emit one goto_N lookup array per state, each listing the state's
 * (symbol, next-state) transitions. */
static void gen_goto(FILE *f, struct grammar *g)
{
    int i, j;

    fprintf(f, "#line 0 \"gen_goto\"\n");
    for (i = 0; i < g->states; i++) {
        struct symset gt = g->statetab[i]->go_to;

        fprintf(f, "static const struct lookup goto_%d[] = {\n",
            i);
        for (j = 0; j < gt.cnt; j++)
            fprintf(f, "\t{ %d, %d },\n",
                gt.syms[j], gt.data[j]);
        fprintf(f, "};\n");
    }
}
functions
/* Emit the states[] array.  Each entry records the goto table and,
 * when the state contains a reducible item, that production's
 * number, body size and head symbol (-1 otherwise). */
static void gen_states(FILE *f, struct grammar *g)
{
    int i;

    fprintf(f, "#line 0 \"gen_states\"\n");
    fprintf(f, "static const struct state states[] = {\n");
    for (i = 0; i < g->states; i++) {
        struct itemset *is = g->statetab[i];
        int prod = -1;
        int j;

        /* Find the first reducible item (DOT at end of body). */
        for (j = 0; j < is->items.cnt && prod < 0; j++) {
            int itm = is->items.syms[j];
            int p = item_prod(itm);

            if (item_index(itm) == g->productions[p]->body_size)
                prod = p;
        }

        fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
            i, is->go_to.cnt, i, prod,
            prod < 0 ? -1 : g->productions[prod]->body_size,
            prod < 0 ? -1 : g->productions[prod]->head->num);
    }
    fprintf(f, "};\n\n");
}

The do_reduce function and the code

When the parser engine decides to reduce a production, it calls do_reduce. This has two functions.

Firstly, if a non-NULL trace file is passed, it prints out details of the production being reduced. Secondly it runs the code that was included with the production if any.

This code needs to be able to store data somewhere. Rather than requiring do_reduce to malloc that "somewhere", we pass in a large buffer and have do_reduce return the size to be saved.

The code fragment requires translation when written out. Any $N needs to be converted to a reference either to that buffer (if $0) or to the structure returned by a previous reduction. These pointer need to be cast to the appropriate type for each access. All this is handling in gen_code.

functions
/* Copy a production's code fragment into do_reduce, translating $N
 * references:
 *   $0 -> the result buffer "ret", cast to the head's struct type;
 *   $N (1-based, N <= body_size) -> body[N-1], cast to struct token
 *        for terminals or to the symbol's declared struct type for
 *        nonterminals;
 *   $N with no usable translation (out of range, or a nonterminal
 *        with no declared type) is emitted unchanged;
 *   a '$' followed by a non-digit emits that character (so "$$"
 *        emits a single '$').
 * NOTE(review): a '$' as the very last character of the fragment
 * advances "c" past code.len before the digit test dereferences it
 * — presumably fragments never end mid-reference; worth confirming.
 */
static void gen_code(struct production *p, FILE *f, struct grammar *g)
{
    char *c;
    fprintf(f, "\t\t\t");
    for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
        int n;
        if (*c != '$') {
            fputc(*c, f);
            if (*c == '\n')
                /* keep emitted code indented inside the switch */
                fputs("\t\t\t", f);
            continue;
        }
        c++;
        if (*c < '0' || *c > '9') {
            fputc(*c, f);
            continue;
        }
        /* Accumulate a multi-digit reference number. */
        n = *c - '0';
        while (c[1] >= '0' && c[1] <= '9') {
            c += 1;
            n = n * 10 + *c - '0';
        }
        if (n == 0)
            fprintf(f, "(*(struct %.*s*)ret)",
                p->head->struct_name.len,
                p->head->struct_name.txt);
        else if (n > p->body_size)
            fprintf(f, "$%d", n);
        else if (p->body[n-1]->type == Terminal)
            fprintf(f, "(*(struct token *)body[%d])",
                n-1);
        else if (p->body[n-1]->struct_name.txt == NULL)
            fprintf(f, "$%d", n);
        else
            fprintf(f, "(*(struct %.*s*)body[%d])",
                p->body[n-1]->struct_name.len,
                p->body[n-1]->struct_name.txt, n-1);
    }
    fputs("\n", f);
}
functions
/* Emit the do_reduce() callback: a switch over production numbers
 * that runs each production's code fragment (via gen_code) and
 * reports the size of the structure stored in "ret", or 0 when the
 * head has no declared type.  "file" is currently unused here. */
static void gen_reduce(FILE *f, struct grammar *g, char *file)
{
    int i;

    fprintf(f, "#line 0 \"gen_reduce\"\n");
    fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
    fprintf(f, "                      void *ret)\n");
    fprintf(f, "{\n");
    fprintf(f, "\tint ret_size = 0;\n");

    fprintf(f, "\tswitch(prod) {\n");
    for (i = 0; i < g->production_count; i++) {
        struct production *p = g->productions[i];

        fprintf(f, "\tcase %d:\n", i);
        if (p->code.txt != NULL)
            gen_code(p, f, g);
        if (p->head->struct_name.txt != NULL)
            fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
                p->head->struct_name.len,
                p->head->struct_name.txt);
        fprintf(f, "\t\tbreak;\n");
    }
    fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
}

do_free

As each non-terminal can potentially cause a different type of data structure to be allocated and filled in, we need to be able to free it when done.

It is particularly important to have fine control over freeing during error recovery where individual stack frames might need to be freed.

For this, the grammar author is required to define a free_XX function for each structure that is used by a non-terminal. do_free will call whichever is appropriate given a symbol number, and will call free (as is appropriate for tokens) on any terminal symbol.

functions
/* Emit the do_free() callback: plain free() for terminals (symbol
 * numbers below first_nonterm) and a switch dispatching to the
 * grammar author's free_XX functions for typed nonterminals. */
static void gen_free(FILE *f, struct grammar *g)
{
    int i;

    fprintf(f, "#line 0 \"gen_free\"\n");
    fprintf(f, "static void do_free(short sym, void *asn)\n");
    fprintf(f, "{\n");
    fprintf(f, "\tif (!asn) return;\n");
    fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
    fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
    fprintf(f, "\tswitch(sym) {\n");

    for (i = 0; i < g->num_syms; i++) {
        struct symbol *s = g->symtab[i];

        if (s == NULL || s->type != Nonterminal ||
            s->struct_name.txt == NULL)
            continue;

        fprintf(f, "\tcase %d:\n", s->num);
        fprintf(f, "\t\tfree_%.*s(asn);\n",
                s->struct_name.len,
                s->struct_name.txt);
        fprintf(f, "\t\tbreak;\n");
    }
    fprintf(f, "\t}\n}\n\n");
}

The main routine.

There are three key parts to the "main" function of parsergen: processing the arguments, loading the grammar file, and dealing with the grammar.

Argument processing.

Command line options allow the selection of analysis mode, name of output file, and whether or not a report should be generated. By default we create a report only if no code output was requested.

The parse_XX function name uses the basename of the output file which should not have a suffix (.c). .c is added to the given name for the code, and .h is added for the header (if header text is specified in the grammar file).

includes
#include <getopt.h>
declarations
/* Command-line options: one flag per analysis level, plus explicit
 * report selection and the output-file name. */
static const struct option long_options[] = {
    { "LR0",    0, NULL, '0' },
    { "LR05",   0, NULL, '5' },
    { "SLR",    0, NULL, 'S' },
    { "LALR",   0, NULL, 'L' },
    { "LR1",    0, NULL, '1' },
    { "report", 0, NULL, 'R' },
    { "output", 1, NULL, 'o' },  /* -o takes a filename argument */
    { NULL,     0, NULL, 0   }
};
/* Short-option string matching the table above. */
const char *options = "05SL1Ro:";
process arguments
/* Parse command-line options: analysis level, report request, and
 * output file.  A report is produced by default only when no code
 * output was requested. */
int opt;
char *outfile = NULL;
char *infile;
char *name;
int report = 1;                 /* 1: report only if no output file */
enum grammar_type type = LR05;
while ((opt = getopt_long(argc, argv, options,
                          long_options, NULL)) != -1) {
    switch(opt) {
    case '0':
        type = LR0; break;
    case '5':
        type = LR05; break;
    case 'S':
        type = SLR; break;
    case 'L':
        type = LALR; break;
    case '1':
        type = LR1; break;
    case 'R':
        report = 2; break;      /* report explicitly requested */
    case 'o':
        outfile = optarg; break;
    default:
        fprintf(stderr, "Usage: parsergen ...\n");
        exit(1);
    }
}
if (optind < argc)
    infile = argv[optind++];
else {
    fprintf(stderr, "No input file given\n");
    exit(1);
}
if (outfile && report == 1)
    report = 0;
name = outfile;
if (name) {
    /* parse_XX uses only the basename of the output path; a single
     * strrchr replaces the previous strchr-then-strrchr double scan. */
    char *slash = strrchr(name, '/');
    if (slash)
        name = slash + 1;
}

if (optind < argc) {
    fprintf(stderr, "Excess command line arguments\n");
    exit(1);
}

Loading the grammar file

To be able to run mdcode and scanner on the grammar we need to memory map it.

Once we have extracted the code (with mdcode) we expect to find three sections: header, code, and grammar. Anything else is an error.

"header" and "code" are optional, though it is hard to build a working parser with neither. "grammar" must be provided.

includes
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
functions
/* Count of errors reported while extracting code from the input. */
static int errs;

/* Error callback for code_extract: print the message and count it. */
static void pr_err(char *msg)
{
    errs += 1;
    fprintf(stderr, "%s\n", msg);
}
load file
/* Memory-map the grammar file and extract its top-level sections. */
struct section *table;
int fd;
int len;
char *file;
fd = open(infile, O_RDONLY);
if (fd < 0) {
    fprintf(stderr, "parsergen: cannot open %s: %s\n",
        infile, strerror(errno));
    exit(1);
}
/* Use SEEK_END (not the magic constant 2) to find the length. */
len = lseek(fd, 0, SEEK_END);
file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
if (file == MAP_FAILED) {
    /* Previously unchecked: a failed map would hand MAP_FAILED to
     * code_extract and crash. */
    fprintf(stderr, "parsergen: cannot map %s: %s\n",
        infile, strerror(errno));
    exit(1);
}
table = code_extract(file, file+len, pr_err);

/* Sort the sections we know about; anything else is an error. */
struct code_node *hdr = NULL;
struct code_node *code = NULL;
struct code_node *gram = NULL;
for (s = table; s; s = s->next) {
    if (text_is(s->section, "header"))
        hdr = s->code;
    else if (text_is(s->section, "code"))
        code = s->code;
    else if (text_is(s->section, "grammar"))
        gram = s->code;
    else {
        fprintf(stderr, "Unknown content section: %.*s\n",
                s->section.len, s->section.txt);
        rv |= 2;
    }
}

Processing the input

As we need to append an extension to a filename and then open it for writing, and we need to do this twice, it helps to have a separate function.

functions
/* Open "base" + "ext" for writing.  Returns NULL if the name cannot
 * be allocated or the file cannot be created; both callers already
 * check for NULL. */
static FILE *open_ext(char *base, char *ext)
{
    size_t len = strlen(base) + strlen(ext) + 1;
    char *fn = malloc(len);
    FILE *f;

    if (!fn)
        /* Previously unchecked: strcpy(NULL, ...) on OOM. */
        return NULL;
    snprintf(fn, len, "%s%s", base, ext);
    f = fopen(fn, "w");
    free(fn);
    return f;
}

If we can read the grammar, then we analyse and optionally report on it. If the report finds conflicts we will exit with an error status.

process input
/* Read, analyse and (optionally) report on the grammar.  rv bit 1
 * marks reported conflicts; bit 2 marks a missing/bad grammar. */
struct grammar *g = NULL;
if (gram == NULL) {
    fprintf(stderr, "No grammar section provided\n");
    rv |= 2;
} else {
    g = grammar_read(gram);
    if (!g) {
        fprintf(stderr, "Failure to parse grammar\n");
        rv |= 2;
    }
}
if (g) {
    grammar_analyse(g, type);
    if (report)
        if (grammar_report(g, type))
            rv |= 1;
}

If a headers section is defined, we write it out and include a declaration for the parse_XX function so it can be used from separate code.

/* Write the .h file: the grammar's "header" section plus a
 * declaration of the generated parse_XX entry point. */
if (rv == 0 && hdr && outfile) {
    FILE *f = open_ext(outfile, ".h");
    if (f) {
        code_node_print(f, hdr, infile);
        fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
                name);
        fclose(f);
    } else {
        fprintf(stderr, "Cannot create %s.h\n",
                outfile);
        rv |= 4;
    }
}

And if all goes well and an output file was provided, we create the .c file with the code section (if any) and the parser tables and function.

/* Write the .c file: the grammar's "code" section (if any) followed
 * by the generated tables and parse_XX function. */
if (rv == 0 && outfile) {
    FILE *f = open_ext(outfile, ".c");
    if (f) {
        if (code)
            code_node_print(f, code, infile);
        gen_parser(f, g, infile, name);
        fclose(f);
    } else {
        fprintf(stderr, "Cannot create %s.c\n",
                outfile);
        rv |= 4;
    }
}

And that about wraps it up. We need to set the locale so that UTF-8 is recognised properly, and link with libicuuc as libmdcode requires that.

File: parsergen.mk
parsergen : parsergen.o libscanner.o libmdcode.o
    $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
includes
#include <locale.h>
main
int main(int argc, char *argv[])
{
    struct section *s;      /* section list from the loaded mdcode file */
    int rv = 0;             /* exit status, accumulated as bit flags */

    /* Enable the user's locale so UTF-8 characters are
     * classified correctly by the scanner. */
    setlocale(LC_ALL,"");

    ## process arguments
    ## load file
    ## process input

    return rv;
}

The SHIFT/REDUCE parser

Having analysed the grammar and generated all the tables, we only need the shift/reduce engine to bring it all together.

Goto table lookup

The parser generator has nicely provided us with goto tables sorted by symbol number. We need a binary search function to find a symbol in the table.

parser functions
/* Binary-search the goto table of state 'l' (sorted by symbol
 * number) for 'sym'.  Returns the target state, or -1 if this
 * state has no goto entry for the symbol. */
static int search(const struct state *l, int sym)
{
    int lo = 0;
    int hi = l->go_to_cnt - 1;

    while (lo <= hi) {
        int mid = lo + (hi - lo) / 2;
        if (l->go_to[mid].sym == sym)
            return l->go_to[mid].state;
        if (l->go_to[mid].sym < sym)
            lo = mid + 1;
        else
            hi = mid - 1;
    }
    return -1;
}

The state stack.

The core data structure for the parser is the stack. This tracks all the symbols that have been recognised or partially recognised.

The stack usually won't grow very large - maybe a few tens of entries. So we dynamically resize the array as required but never bother to shrink it down again.

We keep the stack as two separate allocations. One, asn_stack stores the "abstract syntax nodes" which are created by each reduction. When we call do_reduce we need to pass an array of the asns of the body of the production, and by keeping a separate asn stack, we can just pass a pointer into this stack.

The other allocation stores all other stack fields of which there are two. The state is the most important one and guides the parsing process. The sym is nearly unnecessary. However when we want to free entries from the asn_stack, it helps to know what type they are so we can call the right freeing function. The symbol leads us to the right free function through do_free.

parser functions
struct parser {
    int state;              // current automaton state
    struct frame {
        short state;        // state in force before this entry was shifted
        short sym;          // symbol shifted here; selects the free function
    } *stack;
    void **asn_stack;       // abstract syntax nodes, parallel to 'stack'
    int stack_size;         // entries allocated in both stacks
    int tos;                // top of stack: number of entries in use
};

Shift and pop

Two operations are needed on the stack - shift (which is like push) and pop.

Shift applies not only to terminals but also to non-terminals. When we reduce a production we will pop off entries corresponding to the body symbols, then push on an item for the head of the production. This last is exactly the same process as shifting in a terminal so we use the same function for both.

To simplify other code we arrange for shift to fail if there is no goto state for the symbol. This is useful in basic parsing due to our design that we shift when we can, and reduce when we cannot. So the shift function reports if it could.

So shift finds the next state. If that succeeds it extends the allocations if needed and pushes all the information onto the stacks.

parser functions
/* Push 'sym' with its asn onto the stack, if the current state
 * has a goto entry for it.  Returns 1 on success, 0 when there
 * is no goto entry (so the caller must reduce or recover). */
static int shift(struct parser *p,
                 int sym, void *asn,
                 const struct state states[])
{
    int next = search(&states[p->state], sym);

    if (next < 0)
        return 0;
    if (p->tos >= p->stack_size) {
        /* Grow both stacks in step; they are never shrunk. */
        int newsize = p->stack_size + 10;

        p->stack = realloc(p->stack, newsize
                               * sizeof(p->stack[0]));
        p->asn_stack = realloc(p->asn_stack, newsize
                               * sizeof(p->asn_stack[0]));
        p->stack_size = newsize;
    }
    p->stack[p->tos].state = p->state;
    p->stack[p->tos].sym = sym;
    p->asn_stack[p->tos] = asn;
    p->tos += 1;
    p->state = next;
    return 1;
}

pop simply moves the top of stack (tos) back down the required amount and frees any asn entries that need to be freed. It is called after we reduce a production, just before we shift the nonterminal in.

parser functions
/* Drop the top 'num' stack entries, handing each asn to
 * 'do_free' (chosen by its symbol), and restore the state
 * recorded in the new top frame. */
static void pop(struct parser *p, int num,
                void(*do_free)(short sym, void *asn))
{
    int base = p->tos - num;
    int i;

    for (i = base; i < p->tos; i++)
        do_free(p->stack[i].sym,
                p->asn_stack[i]);
    p->tos = base;
    p->state = p->stack[base].state;
}

Memory allocation

The scanner returns tokens in a local variable - we want them in allocated memory so they can live in the asn_stack. Similarly the asn produced by a reduce is in a large buffer. Both of these require some allocation and copying, hence memdup and tokcopy.

parser includes
#include <memory.h>
parser functions
/* Duplicate 'len' bytes of 'm' into freshly allocated memory.
 * Returns NULL when len is 0 or when allocation fails.
 */
void *memdup(void *m, int len)
{
    void *ret;
    if (len == 0)
        return NULL;
    ret = malloc(len);
    if (!ret)               /* fix: memcpy to NULL is UB on OOM */
        return NULL;
    memcpy(ret, m, len);
    return ret;
}

/* Copy a token (returned by value from the scanner) into
 * allocated memory so it can live on the asn stack. */
static struct token *tok_copy(struct token tk)
{
    struct token *ret = malloc(sizeof(*ret));

    *ret = tk;
    return ret;
}

The heart of the parser.

Now we have the parser. If we can shift, we do. If not and we can reduce we do. If the production we reduced was production zero, then we have accepted the input and can finish.

If we can neither shift nor reduce we have an error to handle. We pop single entries off the stack until we can shift the TK_error symbol, then drop input tokens until we find one we can shift into the new error state.

We return whatever asn was returned by reducing production zero.

parser includes
#include "parser.h"
parser
/* Run the shift/reduce engine over a token stream.
 *
 * tokens    - scanner state producing the input tokens
 * states    - the LR state/goto tables produced by parsergen
 * do_reduce - runs the code fragment for a production; returns the
 *             size of the asn it built in the supplied buffer
 * do_free   - frees an asn, selected by symbol number
 * trace     - if non-NULL, the stack is dumped here each iteration
 * non_term, knowns - symbol naming information for tracing
 *
 * Returns the asn produced by reducing production zero, or NULL
 * if the input could not be parsed.
 */
void *parser_run(struct token_state *tokens,
                 const struct state states[],
                 int (*do_reduce)(int, int, void**, void*),
                 void (*do_free)(short, void*),
                 FILE *trace, const char *non_term[], int knowns)
{
    struct parser p = { 0 };
    struct token *tk;
    int accepted = 0;
    void *ret = NULL;   /* fix: was uninitialized when parsing failed */

    tk = tok_copy(token_next(tokens));
    while (!accepted) {
        if (trace)
            parser_trace(trace, &p, tk, states, non_term, knowns);

        /* Shift whenever we can; reduce only when we cannot. */
        if (shift(&p, tk->num, tk, states)) {
            tk = tok_copy(token_next(tokens));
            continue;
        }
        if (states[p.state].reduce_prod >= 0) {
            void **body;
            int prod = states[p.state].reduce_prod;
            int size = states[p.state].reduce_size;
            int sym = states[p.state].reduce_sym;
            int bufsize;
            static char buf[16*1024];

            /* The body asns are the top 'size' entries of the stack. */
            body = p.asn_stack +
                (p.tos - states[p.state].reduce_size);

            bufsize = do_reduce(prod, p.tos, body,
                                buf);

            pop(&p, size, do_free);
            shift(&p, sym, memdup(buf, bufsize), states);
            if (prod == 0)
                accepted = 1;
            continue;
        }
        /* Error. we walk up the stack until we
         * find a state which will accept TK_error.
         * We then shift in TK_error and see what state
         * that takes us to.
         * Then we discard input tokens until
         * we find one that is acceptable.
         */
        while (shift(&p, TK_error, tk, states) == 0
               && p.tos > 0)
            // discard this state
            pop(&p, 1, do_free);
        /* tk is now owned by the stack as the TK_error asn;
         * duplicate it so the look-ahead stays valid. */
        tk = tok_copy(*tk);
        while (search(&states[p.state], tk->num) < 0 &&
               tk->num != TK_eof) {
            free(tk);
            tk = tok_copy(token_next(tokens));
        }
        if (p.tos == 0 && tk->num == TK_eof)
            break;      /* no recovery possible - give up */
    }
    free(tk);
    if (accepted)
        ret = p.asn_stack[0];
    else
        pop(&p, p.tos, do_free);
    free(p.asn_stack);
    free(p.stack);
    return ret;
}
exported functions
/* Run the shift/reduce engine over 'tokens' using the generated
 * 'states' tables; returns the asn from reducing production zero.
 * See libparser.c for the full contract of the callbacks. */
void *parser_run(struct token_state *tokens,
                 const struct state states[],
                 int (*do_reduce)(int, int, void**, void*),
                 void (*do_free)(short, void*),
                 FILE *trace, const char *non_term[], int knowns);

Tracing

Being able to visualize the parser in action can be invaluable when debugging the parser code, or trying to understand how the parse of a particular grammar progresses. The stack contains all the important state, so just printing out the stack every time around the parse loop can make it possible to see exactly what is happening.

This doesn't explicitly show each SHIFT and REDUCE action. However they are easily deduced from the change between consecutive lines, and the details of each state can be found by cross referencing the states list in the stack with the "report" that parsergen can generate.

For terminal symbols, we just dump the token. For non-terminals we print the name of the symbol. The look ahead token is reported at the end inside square brackets.

parser functions
/* Dump the parse stack to 'trace': each entry as "(state) symbol",
 * with terminals shown as their token text and non-terminals by
 * name, then the current state and the look-ahead token in
 * square brackets. */
void parser_trace(FILE *trace, struct parser *p,
          struct token *tk, const struct state states[],
          const char *non_term[], int knowns)
{
    int i;

    for (i = 0; i < p->tos; i++) {
        int s = p->stack[i].sym;

        fprintf(trace, "(%d) ", p->stack[i].state);
        if (s >= TK_reserved + knowns)
            /* non-terminal: print its name */
            fputs(non_term[s - TK_reserved - knowns],
                  trace);
        else {
            /* terminal: the asn is the token itself */
            struct token *t = p->asn_stack[i];
            text_dump(trace, t->txt, 20);
        }
        fputs(" ", trace);
    }
    fprintf(trace, "(%d) [", p->state);
    text_dump(trace, tk->txt, 20);
    fputs("]\n", trace);
}

A Worked Example

The obvious example for a parser is a calculator.

As scanner provides a number parsing function using libgmp, it is not much work to perform arbitrary rational number calculations.

This calculator takes one expression, or an equality test, per line. The results are printed and if any equality test fails, the program exits with an error.

Embedding mdcode inside mdcode is rather horrible. I'd like to find a better approach, but as the grammar file must have 3 components I need something like this.

File: parsergen.mk
calc.c : parsergen calc.cgm
    ./parsergen -o calc calc.cgm
calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
    $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
demo grammar
# header
~~~~~

#include "number.h"
// what do we use for a demo-grammar?  A calculator of course.
struct number {
    mpq_t val;      // the value, as an arbitrary-precision rational
    char tail[2];   // trailing characters left over by number_parse
    int err;        // NOTE(review): not referenced in the visible code
};

~~~~~
# code
~~~~~

#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <stdio.h>
#include <malloc.h>
#include <gmp.h>
#include "mdcode.h"
#include "scanner.h"
#include "number.h"
#include "parser.h"

#include "calc.h"

/* Free a number asn: release the embedded GMP rational
 * before freeing the structure itself. */
static void free_number(struct number *n)
{
    mpq_clear(n->val);
    free(n);
}

int main(int argc, char *argv[])
{
    /* Map the file named by argv[1] into memory.
     * NOTE(review): fd, lseek and mmap results are unchecked - a
     * missing file will crash; acceptable for a demo, but confirm. */
    int fd = open(argv[1], O_RDONLY);
    int len = lseek(fd, 0, 2);
    char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
    struct section *s = code_extract(file, file+len, NULL);
    /* Configure the scanner: skip comments and indentation
     * tokens, and allow number-continuation characters. */
    struct token_config config = {
        .ignored = (1 << TK_line_comment)
                 | (1 << TK_block_comment)
                 | (1 << TK_indent)
                 | (1 << TK_undent),
        .number_chars = ".,_+-",
        .word_start = "",
        .word_cont = "",
    };
    /* Any second argument enables tracing to stderr. */
    parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
    exit(0);
}

~~~~~
# grammar
~~~~~

Session -> Session Line
    | Line

Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
                { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
                gmp_printf("  or as a decimal: %Fg\n", fl);
                mpf_clear(fl);
                }
                 }$
    | Expression = Expression NEWLINE ${
        if (mpq_equal($1.val, $3.val))
            gmp_printf("Both equal %Qd\n", $1.val);
        else {
            gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
                $1.val, $3.val);
            exit(1);
        }
    }$
    |  NEWLINE ${ printf("Blank line\n"); }$
    | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$

$number
Expression -> Expression +  Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
    | Expression -  Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
    | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$

Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
    | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
    | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$

Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
    | ( Expression  ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$