This parser generator takes a grammar description combined with code fragments, analyses it, and can report details about the analysis and write out C code files which can be compiled to make a parser.
There are several distinct sections.

- `grammar_read` will read a grammar from a literate-code file and store the productions, symbols, and code fragments.
- `grammar_analyse` will take that grammar and build LR parsing states and look-ahead tables.
- `grammar_report` will present the details of the analysis in a readable format and will report any conflicts.
- `parser_generate` will write out C code files with various tables and with the code fragments arranged in useful places.
- `parser_run` is a library function intended to be linked together with the generated parser tables to complete the implementation of a parser.
- `calc.cgm` is a test grammar for a simple calculator.

#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
## includes
## forward declarations
## declarations
## functions
## grammar_read
## grammar_analyse
## grammar_report
## parser_generate
## main
## exported types
## exported functions
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
## parser includes
## parser functions
## parser_run
## demo grammar
CFLAGS += -Wall -g
all :: parsergen calc
parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
./md2c parsergen.mdc
The grammar must be stored in a markdown literate code file as parsed
by "mdcode". It must have three top level (i.e. unreferenced)
sections: `header`, `code`, and `grammar`. The first two will be
literally copied into the generated `.c` and `.h` files. The last
contains the grammar. This is tokenised with "scanner".
#include "mdcode.h"
#include "scanner.h"
#include "mdcode.h"
#include "scanner.h"
The grammar contains production sets, precedence/associativity declarations, and data type declarations. These are all parsed with ad hoc parsing as we don't have a parser generator yet.
The precedence and associativity can be set for each production, but can be inherited from symbols. The data type is potentially defined for each non-terminal and describes what C structure is used to store information about each symbol.
// Associativity of one precedence level; also used per production.
enum assoc {Left, Right, Non};
char *assoc_names[] = {"Left","Right","Non"};
// One grammar symbol: terminal, non-terminal, or "virtual" (precedence only).
struct symbol {
struct text struct_name;	// C struct name for parse info ($type line); NULL text for none
enum assoc assoc;		// associativity of its precedence level, if any
unsigned short precedence;	// 0 means "no precedence assigned"
## symbol fields
};
// One production: head, body, and optional attributes/code (see fields).
struct production {
unsigned short precedence;	// inherited from body symbols or a $$virtual symbol
enum assoc assoc;
## production fields
};
// The whole grammar plus all analysis results (see field fragments).
struct grammar {
## grammar fields
};
The strings reported by mdcode
and scanner
are struct text
which have
length rather than being null terminated. To help with printing and
comparing we define text_is
and prtxt
, which should possibly go in
mdcode
. scanner
does provide text_dump
which is useful for strings
which might contain control characters.
/* Does counted text t exactly match the NUL-terminated string s? */
static int text_is(struct text t, char *s)
{
	if (strlen(s) != t.len)
		return 0;
	return strncmp(s, t.txt, t.len) == 0;
}
// Print a counted (not NUL-terminated) string to stdout.
// %.*s bounds the output to t.len characters.
static void prtxt(struct text t)
{
printf("%.*s", t.len, t.txt);
}
Productions are comprised primarily of symbols - terminal and non-terminal. We do not make a syntactic distinction between the two, though non-terminals must be identifiers. Non-terminal symbols are those which appear in the head of a production, terminal symbols are those which don't. There are also "virtual" symbols used for precedence marking discussed later, and sometimes we won't know what type a symbol is yet.
// Role of a symbol; Unknown until we see how it is used.
enum symtype { Unknown, Virtual, Terminal, Nonterminal };
char *symtypes = "UVTN";	// one-letter codes, indexed by enum symtype, for reports
enum symtype type;		// symbol field
Symbols can be either TK_ident
or TK_mark
. They are saved in a
table of known symbols and the resulting parser will report them as
TK_reserved + N
. A small set of identifiers are reserved for the
different token types that scanner
can report.
// Reserved identifiers, indexed by scanner token number, so each
// reserved symbol's number equals its token number (see symbols_init).
// Designated initializers leave unlisted token types as NULL gaps,
// which symbols_init skips.
static char *reserved_words[] = {
[TK_error] = "ERROR",
[TK_number] = "NUMBER",
[TK_ident] = "IDENTIFIER",
[TK_mark] = "MARK",
[TK_string] = "STRING",
[TK_multi_string] = "MULTI_STRING",
[TK_in] = "IN",
[TK_out] = "OUT",
[TK_newline] = "NEWLINE",
[TK_eof] = "$eof",
};
short num;
Note that TK_eof
and the two TK_*_comment
tokens cannot be
recognised. The former is automatically expected at the end of the text
being parsed. The latter are always ignored by the parser.
All of these symbols are stored in a simple symbol table. We use the
struct text
from mdcode
to store the name and link them together
into a sorted list using an insertion sort.
We don't have separate find
and insert
functions as any symbol we
find needs to be remembered. We simply expect find
to always return a
symbol, but its type might be Unknown
.
#include <string.h>
struct text name;	// symbol field: the symbol's (counted) name
struct symbol *next;	// symbol field: next in the sorted symbol list
struct symbol *syms;	// grammar field: head of sorted symbol list
int num_syms;		// grammar field: number of symbols so far
/* Order two counted strings like strcmp: compare the common prefix,
 * then fall back to comparing lengths. */
static int text_cmp(struct text a, struct text b)
{
	int common = a.len < b.len ? a.len : b.len;
	int cmp = strncmp(a.txt, b.txt, common);
	return cmp ? cmp : a.len - b.len;
}
/* Look up s in the grammar's sorted symbol list, inserting a fresh
 * Unknown symbol if absent.  Always returns a symbol; check ->type
 * to see whether it was already known. */
static struct symbol *sym_find(struct grammar *g, struct text s)
{
	struct symbol **link = &g->syms;
	int cmp = 1;

	/* Walk the sorted list; stop at the first entry >= s. */
	while (*link && (cmp = text_cmp((*link)->name, s)) < 0)
		link = &(*link)->next;
	if (cmp == 0)
		return *link;

	/* Not present: splice in a new, as-yet-unknown symbol. */
	struct symbol *sym = calloc(1, sizeof(*sym));
	sym->name = s;
	sym->type = Unknown;
	sym->num = -1;		/* no number assigned yet */
	sym->next = *link;
	*link = sym;
	return sym;
}
/* Pre-load the symbol table with every reserved token name so that
 * a reserved symbol's number equals its scanner token number. */
static void symbols_init(struct grammar *g)
{
	int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);

	for (int i = 0; i < entries; i++) {
		if (!reserved_words[i])
			continue;	/* gap: token type never reaches the parser */
		struct text t;
		t.txt = reserved_words[i];
		t.len = strlen(t.txt);
		struct symbol *s = sym_find(g, t);
		s->type = Terminal;
		s->num = i;
	}
}
Data type specification and precedence specification are both
introduced by a dollar sign at the start of the line. If the next
word is LEFT
, RIGHT
or NON
, then the line specifies a
precedence, otherwise it specifies a data type.
The data type name is simply stored and applied to the head of all
subsequent productions. It must be the name of a structure, so $expr
maps to struct expr
.
Any productions given before the first data type will have no data type
and can carry no information. In order to allow other non-terminals to
have no type, the data type $void
can be given. This does not mean
that struct void
will be used, but rather than no type will be
associated with future non-terminals.
The precedence line must contain a list of symbols - typically terminal symbols, but not necessarily. It can only contain symbols that have not been seen yet, so precedence declaration must precede the productions that they affect.
A precedence line may also contain a "virtual" symbol which is an
identifier preceded by $$
. Virtual terminals carry precedence
information but are not included in the grammar. A production can
declare that it inherits the precedence of a given virtual symbol.
This use for $$
precludes it from being used as a symbol in the
described language. Two other symbols: ${
and }$
are also
unavailable.
Each new precedence line introduces a new precedence level and declares how it associates. This level is stored in each symbol listed and may be inherited by any production which uses the symbol. A production inherits from the last symbol which has a precedence.
struct text current_type;	// grammar field: $type applied to new non-terminals
int prec_levels;		// grammar field: precedence levels declared so far
// Token numbers for the three reserved marks; scanner reports known
// marks as TK_reserved + index into 'known'.
enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
static const char *known[] = { "$$", "${", "}$" };
/* Parse a line that started with '$'.  "$LEFT", "$RIGHT" and "$NON"
 * introduce a new precedence/associativity level applied to the
 * symbols listed on the rest of the line; any other identifier sets
 * the data type for subsequent non-terminals ("$void" clears it).
 * Returns NULL on success or a static error message; on error the
 * rest of the line is consumed.
 */
static char *dollar_line(struct token_state *ts, struct grammar *g)
{
	struct token t = token_next(ts);
	char *err;
	enum assoc assoc;
	int found;

	if (t.num != TK_ident) {
		err = "type or assoc expected after '$'";
		goto abort;
	}
	if (text_is(t.txt, "LEFT"))
		assoc = Left;
	else if (text_is(t.txt, "RIGHT"))
		assoc = Right;
	else if (text_is(t.txt, "NON"))
		assoc = Non;
	else {
		/* Not an associativity keyword: this is a type line. */
		g->current_type = t.txt;
		if (text_is(t.txt, "void"))
			g->current_type.txt = NULL;
		t = token_next(ts);
		if (t.num != TK_newline) {
			err = "Extra tokens after type name";
			goto abort;
		}
		return NULL;
	}

	// This is a precedence line, need some symbols.
	found = 0;
	g->prec_levels += 1;
	/* BUGFIX: the loop previously never advanced 't' after handling
	 * a symbol, looping forever; advance on every iteration. */
	for (t = token_next(ts); t.num != TK_newline; t = token_next(ts)) {
		enum symtype type = Terminal;
		struct symbol *s;
		if (t.num == TK_virtual) {
			type = Virtual;
			t = token_next(ts);
			if (t.num != TK_ident) {
				err = "$$ must be followed by a word";
				goto abort;
			}
		} else if (t.num != TK_ident &&
			   t.num != TK_mark) {
			err = "Illegal token in precedence line";
			goto abort;
		}
		s = sym_find(g, t.txt);
		if (s->type != Unknown) {
			err = "Symbols in precedence line must not already be known.";
			goto abort;
		}
		s->type = type;
		s->precedence = g->prec_levels;
		s->assoc = assoc;
		found += 1;
	}
	if (found == 0) {
		/* BUGFIX: braces were missing here, so 'goto abort' ran
		 * unconditionally and 'err' could be used uninitialised. */
		err = "No symbols given on precedence line";
		goto abort;
	}
	return NULL;
abort:
	/* Discard the rest of the line before reporting the error. */
	while (t.num != TK_newline && t.num != TK_eof)
		t = token_next(ts);
	return err;
}
A production either starts with an identifier which is the head
non-terminal, or a vertical bar (|
) in which case this production
uses the same head as the previous one. The identifier must be
followed by a ->
mark. All productions for a given non-terminal must
be grouped together with the nonterminal ->
given only once.
After this a (possibly empty) sequence of identifiers and marks form
the body of the production. A virtual symbol may be given after the
body by preceding it with $$
. If a virtual symbol is given, the
precedence of the production is that for the virtual symbol. If none
is given, the precedence is inherited from the last symbol in the
production which has a precedence specified.
After the optional precedence may come the ${
mark. This indicates
the start of a code fragment. If present, this must be on the same
line as the start of the production.
All of the text from the ${
through to the matching }$
is
collected and forms the code-fragment for the production. It must all
be in one code_node
of the literate code. The }$
must be
at the end of a line.
Text in the code fragment will undergo substitutions where $N
for
some numeric N
will be replaced with a variable holding the parse
information for the particular symbol in the production. $0
is the
head of the production, $1
is the first symbol of the body, etc.
The type of $N
for a terminal symbol is struct token
. For
a non-terminal, it is whatever has been declared for that symbol.
While building productions we will need to add to an array which needs to grow dynamically.
/* Append 'new' to a dynamically grown array of pointers.
 * 'varray' points at the array pointer; '*cnt' is the element count.
 * Capacity is always *cnt rounded up to a multiple of 'step', so it
 * can be recomputed from the count alone — no capacity field needed. */
static void array_add(void *varray, int *cnt, void *new)
{
	void ***array = varray;
	const int step = 8;
	int cap = ((*cnt - 1) | (step - 1)) + 1;

	if (cap == *cnt)
		/* Exactly full: extend by one step-sized chunk. */
		*array = realloc(*array, (cap + step) * sizeof(void *));
	(*array)[*cnt] = new;
	*cnt += 1;
}
Collecting the code fragment simply involves reading tokens until we
hit the end token }$
, and noting the character position of the start and
the end.
// Collect the body of a ${ ... }$ code fragment.
// 'start' is the already-consumed TK_open ("${") token; the fragment
// runs from just past it to the start of the matching TK_close, and
// must all lie in the same code_node as the open token.
// On success the returned text covers the fragment body; on failure
// (no close in this node, or error/eof) the result has .txt == NULL.
static struct text collect_code(struct token_state *state,
struct token start)
{
struct text code;
struct token t;
// The fragment starts immediately after the "${" token's text.
code.txt = start.txt.txt + start.txt.len;
do
t = token_next(state);
while (t.node == start.node &&
t.num != TK_close && t.num != TK_error &&
t.num != TK_eof);
if (t.num == TK_close && t.node == start.node)
// Length runs up to the start of the "}$" token.
code.len = t.txt.txt - code.txt;
else
code.txt = NULL;	// fragment not closed properly in this node
return code;
}
Now we have all the bits we need to parse a full production.
#include <memory.h>
struct production **productions;	// grammar field: all productions, in order
int production_count;			// grammar field: number of productions
struct symbol *head;	// production field: the non-terminal being defined
struct symbol **body;	// production field: symbols in the body
int body_size;		// production field: number of body symbols
struct text code;	// production field: the ${..}$ fragment, if any
int first_production;	// symbol field: index of first production with this head
// Parse one production body (the head and "->" or "|" have already
// been consumed): a sequence of idents/marks, then an optional
// "$$ virtual" precedence marker, then an optional ${..}$ fragment.
// Appends the finished production to the grammar.
// Returns NULL on success or a static error message; on error the
// rest of the line is consumed.
static char *parse_production(struct grammar *g,
struct symbol *head,
struct token_state *state)
{
/* Head has already been parsed. */
struct token tk;
char *err;
struct production p, *pp;
memset(&p, 0, sizeof(p));
p.head = head;
tk = token_next(state);
// Body symbols: any ident or mark.  First use of a name makes it
// a Terminal; it becomes Nonterminal only if later seen as a head.
while (tk.num == TK_ident || tk.num == TK_mark) {
struct symbol *bs = sym_find(g, tk.txt);
if (bs->type == Unknown)
bs->type = Terminal;
if (bs->type == Virtual) {
err = "Virtual symbol not permitted in production";
goto abort;
}
// Production inherits precedence of the LAST body symbol that has one.
if (bs->precedence) {
p.precedence = bs->precedence;
p.assoc = bs->assoc;
}
array_add(&p.body, &p.body_size, bs);
tk = token_next(state);
}
// Optional "$$ name": override precedence from a virtual symbol.
if (tk.num == TK_virtual) {
struct symbol *vs;
tk = token_next(state);
if (tk.num != TK_ident) {
err = "word required after $$";
goto abort;
}
vs = sym_find(g, tk.txt);
if (vs->type != Virtual) {
err = "symbol after $$ must be virtual";
goto abort;
}
p.precedence = vs->precedence;
p.assoc = vs->assoc;
tk = token_next(state);
}
// Optional "${ ... }$" code fragment, starting on this line.
if (tk.num == TK_open) {
p.code = collect_code(state, tk);
if (p.code.txt == NULL) {
err = "code fragment not closed properly";
goto abort;
}
tk = token_next(state);
}
if (tk.num != TK_newline && tk.num != TK_eof) {
err = "stray tokens at end of line";
goto abort;
}
// Success: copy the stack-built production to the heap and record it.
pp = malloc(sizeof(*pp));
*pp = p;
array_add(&g->productions, &g->production_count, pp);
return NULL;
abort:
// Consume the rest of the line so the caller can resume cleanly.
while (tk.num != TK_newline && tk.num != TK_eof)
tk = token_next(state);
return err;
}
With the ability to parse production and dollar-lines, we have nearly all
that we need to parse a grammar from a code_node
.
The head of the first production will effectively be the start
symbol of
the grammar. However it won't actually be so. Processing the grammar is
greatly simplified if the real start symbol only has a single production,
and expects $eof
as the final terminal. So when we find the first
explicit production we insert an extra production as production zero which
looks like
$start -> START $eof
where START
is the first non-terminal given.
// Create production zero: "$start -> HEAD $eof", where HEAD is the
// first explicit non-terminal.  Giving the real start symbol exactly
// one production ending in $eof simplifies all later analysis.
struct production *p = calloc(1,sizeof(*p));
struct text start = {"$start",6};
struct text eof = {"$eof",4};
p->head = sym_find(g, start);
p->head->type = Nonterminal;
array_add(&p->body, &p->body_size, head);
array_add(&p->body, &p->body_size, sym_find(g, eof));
// Remember $start's symbol number; non-zero also marks "already done".
g->start = p->head->num;
p->head->first_production = g->production_count;
array_add(&g->productions, &g->production_count, p);
Now we are ready to read in the grammar.
int start; // the 'start' symbol.
/* Read a whole grammar from one code_node.  Each line is either a
 * new production head ("name -> ..."), a continuation ("| ..."), or
 * a dollar-line ("$...").  Returns a freshly allocated grammar, or
 * NULL after printing an error (with line number) to stderr.
 */
static struct grammar *grammar_read(struct code_node *code)
{
	struct token_config conf = {
		.word_start = "",
		.word_cont = "",
		.words_marks = known,
		.known_count = sizeof(known)/sizeof(known[0]),
		.number_chars = "",
		.ignored = (1 << TK_line_comment)
			 | (1 << TK_block_comment)
			 | (0 << TK_number)
			 | (1 << TK_string)
			 | (1 << TK_multi_string)
			 | (1 << TK_in)
			 | (1 << TK_out),
	};
	struct token_state *state = token_open(code, &conf);
	/* BUGFIX: tk was initialised with an extra token_next() here,
	 * which silently discarded the first token of the grammar; the
	 * for-loop below performs the first read. */
	struct token tk;
	struct symbol *head = NULL;
	struct grammar *g;
	char *err = NULL;

	g = calloc(1, sizeof(*g));
	symbols_init(g);
	for (tk = token_next(state); tk.num != TK_eof;
	     tk = token_next(state)) {
		if (tk.num == TK_newline)
			continue;
		if (tk.num == TK_ident) {
			// new non-terminal
			head = sym_find(g, tk.txt);
			if (head->type == Nonterminal)
				err = "This non-terminal has already been used.";
			else if (head->type == Virtual)
				err = "Virtual symbol not permitted in head of production";
			else {
				head->type = Nonterminal;
				head->struct_name = g->current_type;
				/* First head seen: insert production zero. */
				if (g->start == 0) {
					## create production zero
				}
				head->first_production = g->production_count;
				tk = token_next(state);
				if (tk.num == TK_mark &&
				    text_is(tk.txt, "->"))
					err = parse_production(g, head, state);
				else
					err = "'->' missing in production";
			}
		} else if (tk.num == TK_mark
			   && text_is(tk.txt, "|")) {
			// another production for same non-term
			if (head)
				err = parse_production(g, head, state);
			else
				err = "First production must have a head";
		} else if (tk.num == TK_mark
			   && text_is(tk.txt, "$")) {
			err = dollar_line(state, g);
		} else {
			err = "Unrecognised token at start of line.";
		}
		if (err)
			goto abort;
	}
	token_close(state);
	return g;
abort:
	fprintf(stderr, "Error at line %d: %s\n",
		tk.line, err);
	token_close(state);
	/* NOTE(review): symbols/productions hanging off g are not freed
	 * here — confirm callers treat this as a fatal error and exit. */
	free(g);
	return NULL;
}
The central task in analysing the grammar is to determine a set of states to drive the parsing process. This will require finding various sets of symbols and of "items". Some of these sets will need to attach information to each element in the set, so they are more like maps.
Each "item" may have a 'look-ahead' or LA
set associated with
it. Many of these will be the same as each other. To avoid memory
wastage, and to simplify some comparisons of sets, these sets will be
stored in a list of unique sets, each assigned a number.
Once we have the data structures in hand to manage these sets and lists, we can start setting the 'nullable' flag, build the 'FIRST' sets, and then create the item sets which define the various states.
Though we don't only store symbols in these sets, they are the main things we store, so they are called symbol sets or "symsets".
A symset has a size, an array of shorts, and an optional array of data
values, which are also shorts. If the array of data is not present,
we store an impossible pointer, as NULL
is used to indicate that no
memory has been allocated yet.
// A sorted set of 16-bit keys (symbol numbers or packed items), with
// an optional parallel array of 16-bit data values.
struct symset {
short cnt;	// number of entries in syms (and data, when present)
unsigned short *syms, *data;
};
// NO_DATA is an impossible pointer meaning "this set carries no data";
// NULL in data means "data wanted but nothing allocated yet".
#define NO_DATA ((unsigned short *)1)
const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
const struct symset INIT_DATASET = { 0, NULL, NULL };
The arrays of shorts are allocated in blocks of 8 and are kept sorted
by using an insertion sort. We don't explicitly record the amount of
allocated space as it can be derived directly from the current cnt
using
((cnt - 1) | 7) + 1
.
/* Insert key (and, when the set carries data, val) keeping 'syms'
 * sorted.  Storage grows in 8-entry chunks; the current capacity is
 * recomputed from cnt as ((cnt-1)|7)+1 rather than stored. */
static void symset_add(struct symset *s, unsigned short key, unsigned short val)
{
	int cap = ((s->cnt - 1) | 7) + 1;

	if (cap == s->cnt) {
		/* Exactly full: grow both arrays by one chunk. */
		cap += 8;
		s->syms = realloc(s->syms, sizeof(*s->syms) * cap);
		if (s->data != NO_DATA)
			s->data = realloc(s->data, sizeof(*s->data) * cap);
	}
	/* Insertion-sort step: shift larger keys up to make room. */
	int pos = s->cnt;
	while (pos > 0 && s->syms[pos - 1] > key) {
		s->syms[pos] = s->syms[pos - 1];
		if (s->data != NO_DATA)
			s->data[pos] = s->data[pos - 1];
		pos -= 1;
	}
	s->syms[pos] = key;
	if (s->data != NO_DATA)
		s->data[pos] = val;
	s->cnt += 1;
}
Finding a symbol (or item) in a symset
uses a simple binary search.
We return the index where the value was found (so data can be accessed),
or -1
to indicate failure.
/* Binary search for key in the sorted set.  Returns its index (so
 * callers can reach the matching data entry) or -1 when absent. */
static int symset_find(struct symset *ss, unsigned short key)
{
	int lo = 0, hi = ss->cnt;

	if (hi == 0)
		return -1;
	/* Invariant: if key is present, its index lies in [lo, hi). */
	while (hi - lo > 1) {
		int mid = lo + (hi - lo) / 2;
		if (key < ss->syms[mid])
			hi = mid;
		else
			lo = mid;
	}
	return ss->syms[lo] == key ? lo : -1;
}
We will often want to form the union of two symsets. When we do, we
will often want to know if anything changed (as they might mean there
is more work to do). So symset_union
reports whether anything was
added to the first set. We use a slow quadratic approach as these
sets don't really get very big. If profiling shows this to be a
problem it can be optimised later.
/* Merge set b into set a, copying data values where b has them.
 * Returns the number of symbols added — non-zero means a changed,
 * which callers use to drive fixed-point iterations. */
static int symset_union(struct symset *a, struct symset *b)
{
	int added = 0;

	for (int i = 0; i < b->cnt; i++) {
		if (symset_find(a, b->syms[i]) >= 0)
			continue;	/* already present */
		unsigned short data = 0;
		if (b->data != NO_DATA)
			data = b->data[i];
		symset_add(a, b->syms[i], data);
		added += 1;
	}
	return added;
}
And of course we must be able to free a symset.
static void symset_free(struct symset ss)
{
free(ss.syms);
if (ss.data != NO_DATA)
free(ss.data);
}
Some symsets are simply stored somewhere appropriate in the grammar
data structure, others needs a bit of indirection. This applies
particularly to "LA" sets. These will be paired with "items" in an
"itemset". As itemsets will be stored in a symset, the "LA" set needs to be
stored in the data
field, so we need a mapping from a small (short)
number to an LA symset
.
As mentioned earlier this involves creating a list of unique symsets.
For now, we just use a linear list sorted by insertion. A skiplist would be more efficient and may be added later.
// A numbered list of unique symsets, kept sorted by ss_cmp so that
// save_set can de-duplicate with one ordered walk.
struct setlist {
struct symset ss;
int num;		// small id used to refer to this set elsewhere
struct setlist *next;
};
struct setlist *sets;	// grammar field: head of the sorted list
int nextset;		// grammar field: next id to hand out
/* Total order on symsets: shorter sets first, then lexicographic on
 * the (already sorted) symbol arrays.  Data values are ignored. */
static int ss_cmp(struct symset a, struct symset b)
{
	if (a.cnt != b.cnt)
		return a.cnt - b.cnt;
	for (int i = 0; i < a.cnt; i++) {
		int diff = (int)a.syms[i] - (int)b.syms[i];
		if (diff)
			return diff;
	}
	return 0;
}
// Intern symset ss into the grammar's list of unique sets and return
// its number.  CONSUMES ss: if an equal set already exists, ss is
// freed and the existing number returned; otherwise ownership of ss
// passes to the list.
static int save_set(struct grammar *g, struct symset ss)
{
struct setlist **sl = &g->sets;
int cmp = 1;
struct setlist *s;
// Ordered walk: stop at the first entry >= ss.
while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
sl = & (*sl)->next;
if (cmp == 0) {
// Duplicate: drop ss, reuse the existing set's number.
symset_free(ss);
return (*sl)->num;
}
// New set: splice it in at the sorted position.
s = malloc(sizeof(*s));
s->ss = ss;
s->num = g->nextset;
g->nextset += 1;
s->next = *sl;
*sl = s;
return s->num;
}
Finding a set by number is currently performed by a simple linear search. If this turns out to hurt performance, we can store the sets in a dynamic array like the productions.
// Map a set number (as returned by save_set) back to its symset.
// Linear search over the list.
// NOTE(review): if num is not in the list this dereferences NULL —
// callers must only pass numbers previously returned by save_set.
static struct symset set_find(struct grammar *g, int num)
{
struct setlist *sl = g->sets;
while (sl && sl->num != num)
sl = sl->next;
return sl->ss;
}
nullable
We set nullable
on the head symbol for any production for which all
the body symbols (if any) are nullable. As this is a recursive
definition, any change in the nullable
setting means that we need to
re-evaluate where it needs to be set.
We simply loop around performing the same calculations until no more changes happen.
int nullable;	// symbol field: can this symbol derive the empty string?
/* Mark as nullable every non-terminal with a production whose body
 * symbols are all nullable (including the empty body).  Repeats full
 * passes until a pass makes no change (fixed point). */
static void set_nullable(struct grammar *g)
{
	int changed = 1;

	while (changed) {
		changed = 0;
		for (int p = 0; p < g->production_count; p++) {
			struct production *pr = g->productions[p];
			if (pr->head->nullable)
				continue;
			/* Scan past leading nullable body symbols. */
			int s = 0;
			while (s < pr->body_size && pr->body[s]->nullable)
				s += 1;
			if (s == pr->body_size) {
				pr->head->nullable = 1;
				changed = 1;
			}
		}
	}
}
can_eol
In order to be able to ignore newline tokens when not relevant, but still include them in the parse when needed, we will need to know which states can start a "line-like" section of code. We ignore newlines when there is an indent since the most recent start of a line-like section.
To know what is line-like, we first need to know which symbols can end a line-like section, which is precisely those which can end with a newline token. These symbols don't necessarily always end with a newline, but they can. Hence they are not described as "lines" but only "line-like".
Clearly the TK_newline
token can end with a newline. Any symbol
which is the head of a production that contains a line-ending symbol
followed only by nullable symbols is also a line-ending symbol. We
use a new field can_eol
to record this attribute of symbols, and
compute it in a repetitive manner similar to set_nullable
.
int can_eol;	// symbol field: symbol can end with a TK_newline
// Compute can_eol for every symbol by fixed-point iteration, like
// set_nullable: a head can end a line if some production body has a
// can_eol symbol followed only by nullable symbols.
static void set_can_eol(struct grammar *g)
{
int check_again = 1;
// Seed: the newline token itself trivially ends with a newline.
g->symtab[TK_newline]->can_eol = 1;
while (check_again) {
int p;
check_again = 0;
for (p = 0; p < g->production_count; p++) {
struct production *pr = g->productions[p];
int s;
if (pr->head->can_eol)
continue;
// Walk backwards while trailing symbols are nullable.
for (s = pr->body_size - 1; s >= 0; s--) {
if (pr->body[s]->can_eol) {
pr->head->can_eol = 1;
check_again = 1;
break;
}
if (!pr->body[s]->nullable)
break;
}
}
}
}
### first sets

When calculating what can follow a particular non-terminal, we will need to
know what the "first" terminal in any subsequent non-terminal might be. So
we calculate the first
set for every non-terminal and store them in an
array. We don't bother recording the "first" set for terminals as they are
trivial.
As the "first" for one symbol might depend on the "first" of another,
we repeat the calculations until no changes happen, just like with
set_nullable
. This makes use of the fact that symset_union
reports if any change happens.
The core of this which finds the "first" of part of a production body will be reused for computing the "follow" sets, so we split it out into a separate function.
struct symset *first;	// grammar field: FIRST set per symbol number
// Add to 'target' the FIRST set of pr's body starting at position
// 'start': terminals add themselves and stop; non-terminals add their
// FIRST set and continue only if nullable.  If the scan falls off the
// end of the body (everything nullable), *to_end is set to 1 so the
// caller knows FOLLOW/LA information also applies.
// Returns 1 when 'target' gained any symbol.
static int add_first(struct production *pr, int start,
struct symset *target, struct grammar *g,
int *to_end)
{
int s;
int changed = 0;
for (s = start; s < pr->body_size; s++) {
struct symbol *bs = pr->body[s];
if (bs->type == Terminal) {
// FIRST of a terminal is just itself.
if (symset_find(target, bs->num) < 0) {
symset_add(target, bs->num, 0);
changed = 1;
}
break;
} else if (symset_union(target, &g->first[bs->num]))
changed = 1;
if (!bs->nullable)
break;
}
if (to_end)
*to_end = (s == pr->body_size);
return changed;
}
/* Allocate and compute the FIRST set of every symbol, iterating over
 * all productions until a complete pass changes nothing. */
static void build_first(struct grammar *g)
{
	g->first = calloc(g->num_syms, sizeof(g->first[0]));
	for (int s = 0; s < g->num_syms; s++)
		g->first[s] = INIT_SYMSET;

	for (int changed = 1; changed; ) {
		changed = 0;
		for (int p = 0; p < g->production_count; p++) {
			struct production *pr = g->productions[p];
			struct symset *head = &g->first[pr->head->num];
			/* FIRST(head) ⊇ FIRST(body from position 0). */
			if (add_first(pr, 0, head, g, NULL))
				changed = 1;
		}
	}
}
### follow sets

There are two different situations when we will want to generate "follow" sets. If we are doing an SLR analysis, we want to generate a single "follow" set for each non-terminal in the grammar. That is what is happening here. If we are doing an LALR or LR analysis we will want to generate a separate "LA" set for each item. We do that later in state generation.
There are two parts to generating a "follow" set. Firstly we look at
every place that any non-terminal appears in the body of any
production, and we find the set of possible "first" symbols after
there. This is done using add_first
above and only needs to be done
once as the "first" sets are now stable and will not change.
// Pass 1: wherever a non-terminal appears in a body, add the FIRST
// of whatever follows it.  FIRST sets are already stable, so a single
// pass suffices.  (The last body position is skipped: nothing follows
// it within the production — that is handled by pass 2.)
for (p = 0; p < g->production_count; p++) {
struct production *pr = g->productions[p];
int b;
for (b = 0; b < pr->body_size - 1; b++) {
struct symbol *bs = pr->body[b];
if (bs->type == Terminal)
continue;
add_first(pr, b+1, &g->follow[bs->num], g, NULL);
}
}
The second part is to add the "follow" set of the head of a production
to the "follow" sets of the final symbol in the production, and any
other symbol which is followed only by nullable
symbols. As this
depends on "follow" itself we need to repeatedly perform the process
until no further changes happen.
// Pass 2: propagate FOLLOW(head) to the trailing symbols of each
// body.  The unusual increment restarts from production 0 whenever a
// full sweep made a change ('again'), giving fixed-point iteration
// without a separate outer loop.
for (again = 0, p = 0;
p < g->production_count;
p < g->production_count-1
? p++ : again ? (again = 0, p = 0)
: p++) {
struct production *pr = g->productions[p];
int b;
// Walk backwards while symbols are nullable; stop at a terminal.
for (b = pr->body_size - 1; b >= 0; b--) {
struct symbol *bs = pr->body[b];
if (bs->type == Terminal)
break;
if (symset_union(&g->follow[bs->num],
&g->follow[pr->head->num]))
again = 1;
if (!bs->nullable)
break;
}
}
We now just need to create and initialise the follow
list to get a
complete function.
struct symset *follow;	// grammar field: FOLLOW set per symbol (SLR only)
// Allocate empty FOLLOW sets for every symbol, then run the two
// passes (the literate "follow code" fragments) to fill them in.
static void build_follow(struct grammar *g)
{
int s, again, p;
g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
for (s = 0; s < g->num_syms; s++)
g->follow[s] = INIT_SYMSET;
## follow code
}
There are three different levels of detail that can be chosen for building the itemsets and states for the LR grammar. They are:
enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
We need to be able to look through existing states to see if a newly generated state already exists. For now we use a simple sorted linked list.
An item is a pair of numbers: the production number and the position of
"DOT", which is an index into the body of the production.
As the numbers are not enormous we can combine them into a single "short"
and store them in a symset
- 4 bits for "DOT" and 12 bits for the
production number (so 4000 productions with maximum of 15 symbols in the
body).
Comparing the itemsets will be a little different to comparing symsets as we want to do the lookup after generating the "kernel" of an itemset, so we need to ignore the offset=zero items which are added during completion.
To facilitate this, we modify the "DOT" number so that "0" sorts to the end of the list in the symset, and then only compare items before the first "0".
/* An item packs a production number into the low 11 bits and the
 * "DOT" position into the top 5 bits.  DOT is stored inverted
 * (31-index) so that DOT==0 yields the largest encoding and those
 * items sort to the end of a symset. */
static inline unsigned short item_num(int production, int index)
{
	unsigned short dot = (unsigned short)(31 - index);
	return (unsigned short)(production | (dot << 11));
}
static inline int item_prod(unsigned short item)
{
	/* Production number lives in the low 11 bits. */
	return item & 0x7ff;
}
static inline int item_index(unsigned short item)
{
	/* Undo the 31-index inversion to recover DOT. */
	return (31 - (item >> 11)) & 0x1f;
}
For LR(1) analysis we need to compare not just the itemset in a state but also the LA sets. As we assign each unique LA set a number, we can just compare the symset and the data values together.
// Compare two itemsets, considering only "kernel" items (DOT > 0) —
// completion-added DOT==0 items sort to the end and are ignored.
// For LR1 the LA-set numbers stored in .data must match as well.
// Returns <0, 0 or >0 like strcmp.
static int itemset_cmp(struct symset a, struct symset b,
enum grammar_type type)
{
int i;
int av, bv;
// Walk the common prefix of kernel items.
for (i = 0;
i < a.cnt && i < b.cnt &&
item_index(a.syms[i]) > 0 &&
item_index(b.syms[i]) > 0;
i++) {
int diff = a.syms[i] - b.syms[i];
if (diff)
return diff;
if (type == LR1) {
diff = a.data[i] - b.data[i];
if (diff)
return diff;
}
}
// Treat "ran out of kernel items" as a -1 sentinel so the shorter
// kernel orders first.
if (i == a.cnt || item_index(a.syms[i]) == 0)
av = -1;
else
av = a.syms[i];
if (i == b.cnt || item_index(b.syms[i]) == 0)
bv = -1;
else
bv = b.syms[i];
if (av - bv)
return av - bv;
if (type < LR1 || av == -1)
return 0;
// LR1 with both kernels continuing: break the tie on LA numbers.
return
a.data[i] - b.data[i];
}
And now we can build the list of itemsets. The lookup routine returns both a success flag and a pointer to where in the list an insert should happen, so we don't need to search a second time.
// One LR state: a sorted set of items (with LA-set numbers as data
// for LALR/LR1) plus the go_to map from symbol number to next state.
struct itemset {
struct itemset *next;
short state;		// this state's number
struct symset items;	// the items; data holds LA-set numbers
struct symset go_to;	// symbol -> successor state number
char completed;		// closure and goto generation finished
char starts_line;	// state can begin a line-like section
};
struct itemset *items;	// grammar field: all states, sorted by itemset_cmp
int states;		// grammar field: number of states created
/* Search the sorted itemset list for one matching 'kernel'.
 * On success returns 1 with *where pointing at the matching link;
 * on failure returns 0 with *where at the sorted insertion point,
 * so the caller never needs a second search. */
static int itemset_find(struct grammar *g, struct itemset ***where,
			struct symset kernel, enum grammar_type type)
{
	struct itemset **ip = &g->items;

	while (*ip) {
		int diff = itemset_cmp((*ip)->items, kernel, type);
		if (diff == 0) {
			/* found */
			*where = ip;
			return 1;
		}
		if (diff > 0)
			break;	/* passed the sorted position */
		ip = &(*ip)->next;
	}
	*where = ip;
	return 0;
}
Adding an itemset may require merging the LA sets if LALR analysis is
happening. If the new LA set adds any symbol that wasn't in the old LA
set, we clear the completed
flag so that the dependants of this itemset will be
recalculated and their LA sets updated.

add_itemset
must consume the symsets it is passed, either by adding
them to a data structure, or freeing them.
// Add itemset ss as a state (or merge it into an existing equal
// state) and return the state number.  Consumes ss.  For LALR, a
// matching state's per-item LA sets are unioned with ss's; if any LA
// set grows, 'completed' is cleared so the state is reprocessed.
static int add_itemset(struct grammar *g, struct symset ss,
enum grammar_type type, int starts_line)
{
struct itemset **where, *is;
int i;
int found = itemset_find(g, &where, ss, type);
if (!found) {
// New state: link it in at the sorted position; ss ownership
// transfers to the itemset.
is = calloc(1, sizeof(*is));
is->state = g->states;
g->states += 1;
is->items = ss;
is->next = *where;
is->go_to = INIT_DATASET;
is->starts_line = starts_line;
*where = is;
return is->state;
}
is = *where;
if (type != LALR) {
// Exact match (LR0/LR05/SLR compare items only; LR1 compared
// LA sets too) — just discard the duplicate.
symset_free(ss);
return is->state;
}
// LALR: merge each item's LA set from ss into the existing state.
for (i = 0; i < ss.cnt; i++) {
struct symset temp = INIT_SYMSET, s;
if (ss.data[i] == is->items.data[i])
continue;	// identical LA-set numbers: nothing to merge
s = set_find(g, is->items.data[i]);
symset_union(&temp, &s);
s = set_find(g, ss.data[i]);
if (symset_union(&temp, &s)) {
// LA set grew: intern it and force reprocessing.
is->items.data[i] = save_set(g, temp);
is->completed = 0;
} else
symset_free(temp);
}
symset_free(ss);
return is->state;
}
To build all the itemsets, we first insert the initial itemset made from the start symbol, complete each itemset, and then generate new itemsets from old until no new ones can be made.
Completing an itemset means finding all the items where "DOT" is followed by a nonterminal and adding "DOT=0" items for every production from that non-terminal - providing each item hasn't already been added.
If LA sets are needed, the LA set for each new item is found using
add_first
which was developed earlier for FIRST
and FOLLOW
. This may
be supplemented by the LA set for the item which produce the new item.
We also collect a set of all symbols which follow "DOT" (in done
) as this
is used in the next stage.
NOTE: precedence handling should happen here - I haven't written this yet though.
// Complete itemset 'is': for every item whose DOT precedes a
// non-terminal, add a DOT==0 item for each of that non-terminal's
// productions (with an LA set, for LALR/LR1).  Every symbol found
// after DOT is also collected in 'done' for the goto stage.
for (i = 0; i < is->items.cnt; i++) {
int p = item_prod(is->items.syms[i]);
int bs = item_index(is->items.syms[i]);
struct production *pr = g->productions[p];
int p2;
struct symbol *s;
struct symset LA = INIT_SYMSET;
unsigned short sn = 0;
if (bs == pr->body_size)
continue;	// DOT at end: nothing follows
s = pr->body[bs];
// Record the symbol after DOT for goto generation.
if (symset_find(&done, s->num) < 0)
symset_add(&done, s->num, 0);
if (s->type != Nonterminal)
continue;
again = 1;
if (type >= LALR) {
// Need the LA set.
int to_end;
add_first(pr, bs+1, &LA, g, &to_end);
if (to_end) {
// Rest of body nullable: inherit this item's own LA set.
struct symset ss = set_find(g, is->items.data[i]);
symset_union(&LA, &ss);
}
sn = save_set(g, LA);
LA = set_find(g, sn);
}
/* Add productions for this symbol */
for (p2 = s->first_production;
p2 < g->production_count &&
g->productions[p2]->head == s;
p2++) {
int itm = item_num(p2, 0);
int pos = symset_find(&is->items, itm);
if (pos < 0) {
symset_add(&is->items, itm, sn);
/* Will have re-ordered, so start
* from beginning again */
i = -1;
} else if (type >= LALR) {
// Item already present: merge the LA sets and, if that
// changed anything, rescan from the start.
struct symset ss = set_find(g, is->items.data[pos]);
struct symset tmp = INIT_SYMSET;
symset_union(&tmp, &ss);
if (symset_union(&tmp, &LA)) {
is->items.data[pos] = save_set(g, tmp);
i = -1;
}else
symset_free(tmp);
}
}
}
For each symbol we found (and placed in done
) we collect all the items for
which this symbol is next, and create a new itemset with "DOT" advanced over
the symbol. This is then added to the collection of itemsets (or merged
with a pre-existing itemset).
// Now we have a completed itemset, so we need to
// compute all the 'next' itemsets and create them
// if they don't exist.
for (i = 0; i < done.cnt; i++) {
int j;
unsigned short state;
int starts_line = 0;
struct symbol *sym = g->symtab[done.syms[i]];
struct symset newitemset = INIT_SYMSET;
if (type >= LALR)
newitemset = INIT_DATASET;
if (sym->can_eol ||
(sym->nullable && is->starts_line))
starts_line = 1;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
unsigned short la = 0;
int pos;
if (bp == pr->body_size)
continue;
if (pr->body[bp] != sym)
continue;
if (type >= LALR)
la = is->items.data[j];
pos = symset_find(&newitemset, pr->head->num);
if (pos < 0)
symset_add(&newitemset, item_num(p, bp+1), la);
else if (type >= LALR) {
// Need to merge la set.
int la2 = newitemset.data[pos];
if (la != la2) {
struct symset ss = set_find(g, la2);
struct symset LA = INIT_SYMSET;
symset_union(&LA, &ss);
ss = set_find(g, la);
if (symset_union(&LA, &ss))
newitemset.data[pos] = save_set(g, LA);
else
symset_free(LA);
}
}
}
state = add_itemset(g, newitemset, type, starts_line);
if (symset_find(&is->go_to, done.syms[i]) < 0)
symset_add(&is->go_to, done.syms[i], state);
}
All that is left is to create the initial itemset from production zero, and
with TK_eof
as the LA set.
static void build_itemsets(struct grammar *g, enum grammar_type type)
{
struct symset first = INIT_SYMSET;
struct itemset *is;
int again;
unsigned short la = 0;
if (type >= LALR) {
// LA set just has eof
struct symset eof = INIT_SYMSET;
symset_add(&eof, TK_eof, 0);
la = save_set(g, eof);
first = INIT_DATASET;
}
// production 0, offset 0 (with no data)
symset_add(&first, item_num(0, 0), la);
add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
for (again = 0, is = g->items;
is;
is = is->next ?: again ? (again = 0, g->items) : NULL) {
int i;
struct symset done = INIT_SYMSET;
if (is->completed)
continue;
is->completed = 1;
again = 1;
## complete itemset
## derive itemsets
symset_free(done);
}
}
The exact process of analysis depends on which level was requested. For
LR0
and LR05
we don't need first and follow sets at all. For LALR
and
LR1
we need first, but not follow. For SLR
we need both.
We don't build the "action" tables that you might expect as the parser doesn't use them. They will be calculated to some extent if needed for a report.
Once we have built everything we allocate arrays for the two lists:
symbols and itemsets. This allows more efficient access during reporting.
The symbols are grouped as terminals and non-terminals and we record the
changeover point in first_nonterm
.
struct symbol **symtab;
struct itemset **statetab;
int first_nonterm;
static void grammar_analyse(struct grammar *g, enum grammar_type type)
{
struct symbol *s;
struct itemset *is;
int snum = TK_reserved;
for (s = g->syms; s; s = s->next)
if (s->num < 0 && s->type == Terminal) {
s->num = snum;
snum++;
}
g->first_nonterm = snum;
for (s = g->syms; s; s = s->next)
if (s->num < 0) {
s->num = snum;
snum++;
}
g->num_syms = snum;
g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
for (s = g->syms; s; s = s->next)
g->symtab[s->num] = s;
set_nullable(g);
set_can_eol(g);
if (type >= SLR)
build_first(g);
if (type == SLR)
build_follow(g);
build_itemsets(g, type);
g->statetab = calloc(g->states, sizeof(g->statetab[0]));
for (is = g->items; is ; is = is->next)
g->statetab[is->state] = is;
}
The purpose of the report is to give the grammar developer insight into how the grammar parser will work. It is basically a structured dump of all the tables that have been generated, plus an description of any conflicts.
static int grammar_report(struct grammar *g, enum grammar_type type)
{
report_symbols(g);
if (g->follow)
report_follow(g);
report_itemsets(g);
return report_conflicts(g, type);
}
Firstly we have the complete list of symbols, together with the "FIRST" set if that was generated.
static void report_symbols(struct grammar *g)
{
int n;
if (g->first)
printf("SYMBOLS + FIRST:\n");
else
printf("SYMBOLS:\n");
for (n = 0; n < g->num_syms; n++) {
struct symbol *s = g->symtab[n];
if (!s)
continue;
printf(" %c%c%3d%c: ",
s->nullable ? '.':' ',
s->can_eol ? '>':' ',
s->num, symtypes[s->type]);
prtxt(s->name);
if (s->precedence)
printf(" (%d%s)", s->precedence,
assoc_names[s->assoc]);
if (g->first && s->type == Nonterminal) {
int j;
char c = ':';
for (j = 0; j < g->first[n].cnt; j++) {
printf("%c ", c);
c = ',';
prtxt(g->symtab[g->first[n].syms[j]]->name);
}
}
printf("\n");
}
}
Then we have to follow sets if they were computed.
static void report_follow(struct grammar *g)
{
int n;
printf("FOLLOW:\n");
for (n = 0; n < g->num_syms; n++)
if (g->follow[n].cnt) {
int j;
char c = ':';
printf(" ");
prtxt(g->symtab[n]->name);
for (j = 0; j < g->follow[n].cnt; j++) {
printf("%c ", c);
c = ',';
prtxt(g->symtab[g->follow[n].syms[j]]->name);
}
printf("\n");
}
}
And finally the item sets. These include the GO TO tables and, for LALR and LR1, the LA set for each item. Lots of stuff, so we break it up a bit. First the items, with production number and associativity.
static void report_item(struct grammar *g, int itm)
{
int p = item_prod(itm);
int dot = item_index(itm);
struct production *pr = g->productions[p];
int i;
printf(" ");
prtxt(pr->head->name);
printf(" ->");
for (i = 0; i < pr->body_size; i++) {
printf(" %s", (dot == i ? ". ": ""));
prtxt(pr->body[i]->name);
}
if (dot == pr->body_size)
printf(" .");
printf(" [%d]", p);
if (pr->precedence)
printf(" (%d%s)", pr->precedence,
assoc_names[pr->assoc]);
printf("\n");
}
The LA sets which are (possibly) reported with each item:
static void report_la(struct grammar *g, int lanum)
{
struct symset la = set_find(g, lanum);
int i;
char c = ':';
printf(" LOOK AHEAD(%d)", lanum);
for (i = 0; i < la.cnt; i++) {
printf("%c ", c);
c = ',';
prtxt(g->symtab[la.syms[i]]->name);
}
printf("\n");
}
Then the go to sets:
static void report_goto(struct grammar *g, struct symset gt)
{
int i;
printf(" GOTO:\n");
for (i = 0; i < gt.cnt; i++) {
printf(" ");
prtxt(g->symtab[gt.syms[i]]->name);
printf(" -> %d\n", gt.data[i]);
}
}
Now we can report all the item sets complete with items, LA sets, and GO TO.
static void report_itemsets(struct grammar *g)
{
int s;
printf("ITEM SETS(%d)\n", g->states);
for (s = 0; s < g->states; s++) {
int j;
struct itemset *is = g->statetab[s];
printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
for (j = 0; j < is->items.cnt; j++) {
report_item(g, is->items.syms[j]);
if (is->items.data != NO_DATA)
report_la(g, is->items.data[j]);
}
report_goto(g, is->go_to);
}
}
Conflict detection varies a lot among different analysis levels. However LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and LR1 are also similar as they have FOLLOW or LA sets.
## conflict functions
static int report_conflicts(struct grammar *g, enum grammar_type type)
{
int cnt = 0;
printf("Conflicts:\n");
if (type < SLR)
cnt = conflicts_lr0(g, type);
else
cnt = conflicts_slr(g, type);
if (cnt == 0)
printf(" - no conflicts\n");
return cnt;
}
LR0 conflicts are any state which have both a reducible item and a shiftable item.
LR05 conflicts only occur if two possible reductions exist, as shifts always over-ride reductions.
static int conflicts_lr0(struct grammar *g, enum grammar_type type)
{
int i;
int cnt = 0;
for (i = 0; i < g->states; i++) {
struct itemset *is = g->statetab[i];
int last_reduce = -1;
int prev_reduce = -1;
int last_shift = -1;
int j;
if (!is)
continue;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
if (bp == pr->body_size) {
prev_reduce = last_reduce;
last_reduce = j;
continue;
}
if (pr->body[bp]->type == Terminal)
last_shift = j;
}
if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
printf(" State %d has both SHIFT and REDUCE:\n", i);
report_item(g, is->items.syms[last_shift]);
report_item(g, is->items.syms[last_reduce]);
cnt++;
}
if (prev_reduce >= 0) {
printf(" State %d has 2 (or more) reducible items\n", i);
report_item(g, is->items.syms[prev_reduce]);
report_item(g, is->items.syms[last_reduce]);
cnt++;
}
}
return cnt;
}
SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping look ahead, or if a symbol in a look-ahead can be shifted. They differ only in the source of the look-ahead set.
We build a dataset mapping terminal to item for possible SHIFTs and then another for possible REDUCE operations. We report when we get conflicts between the two.
static int conflicts_slr(struct grammar *g, enum grammar_type type)
{
int i;
int cnt = 0;
for (i = 0; i < g->states; i++) {
struct itemset *is = g->statetab[i];
struct symset shifts = INIT_DATASET;
struct symset reduce = INIT_DATASET;
int j;
if (!is)
continue;
/* First collect the shifts */
for (j = 0; j < is->items.cnt; j++) {
unsigned short itm = is->items.syms[j];
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
if (bp < pr->body_size &&
pr->body[bp]->type == Terminal) {
/* shiftable */
int sym = pr->body[bp]->num;
if (symset_find(&shifts, sym) < 0)
symset_add(&shifts, sym, itm);
}
}
/* Now look for reduction and conflicts */
for (j = 0; j < is->items.cnt; j++) {
unsigned short itm = is->items.syms[j];
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
if (bp < pr->body_size)
continue;
/* reducible */
struct symset la;
if (type == SLR)
la = g->follow[pr->head->num];
else
la = set_find(g, is->items.data[j]);
int k;
for (k = 0; k < la.cnt; k++) {
int pos = symset_find(&shifts, la.syms[k]);
if (pos >= 0) {
printf(" State %d has SHIFT/REDUCE conflict on ", i);
prtxt(g->symtab[la.syms[k]]->name);
printf(":\n");
report_item(g, shifts.data[pos]);
report_item(g, itm);
cnt++;
}
pos = symset_find(&reduce, la.syms[k]);
if (pos < 0) {
symset_add(&reduce, la.syms[k], itm);
continue;
}
printf(" State %d has REDUCE/REDUCE conflict on ", i);
prtxt(g->symtab[la.syms[k]]->name);
printf(":\n");
report_item(g, itm);
report_item(g, reduce.data[pos]);
cnt++;
}
}
symset_free(shifts);
symset_free(reduce);
}
return cnt;
}
The export part of the parser is the parse_XX
function, where the name
XX
is based on the name of the parser files.
This takes a code_node
, a partially initialized token_config
, and an
optional FILE
to send tracing to. The token_config
gets the list of
known words added and then is used with the code_node
to initialize the
scanner.
parse_XX
then call the library function parser_run
to actually complete
the parse. This needs the states
table and function to call the various
pieces of code provided in the grammar file, so they are generated first.
static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
{
gen_known(f, g);
gen_non_term(f, g);
gen_goto(f, g);
gen_states(f, g);
gen_reduce(f, g, file);
gen_free(f, g);
fprintf(f, "#line 0 \"gen_parser\"\n");
fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
name);
fprintf(f, "{\n");
fprintf(f, "\tstruct token_state *tokens;\n");
fprintf(f, "\tconfig->words_marks = known;\n");
fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
fprintf(f, "\ttokens = token_open(code, config);\n");
fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
fprintf(f, "\ttoken_close(tokens);\n");
fprintf(f, "\treturn rv;\n");
fprintf(f, "}\n\n");
}
The known words are simply an array of terminal symbols. The table of nonterminals used for tracing is a similar array.
static void gen_known(FILE *f, struct grammar *g)
{
int i;
fprintf(f, "#line 0 \"gen_known\"\n");
fprintf(f, "static const char *known[] = {\n");
for (i = TK_reserved;
i < g->num_syms && g->symtab[i]->type == Terminal;
i++)
fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
g->symtab[i]->name.txt);
fprintf(f, "};\n\n");
}
static void gen_non_term(FILE *f, struct grammar *g)
{
int i;
fprintf(f, "#line 0 \"gen_non_term\"\n");
fprintf(f, "static const char *non_term[] = {\n");
for (i = TK_reserved;
i < g->num_syms;
i++)
if (g->symtab[i]->type == Nonterminal)
fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
g->symtab[i]->name.txt);
fprintf(f, "};\n\n");
}
For each state we record the goto table, the reducible production if
there is one, or a symbol to shift for error recovery.
Some of the details of the reducible production are stored in the
do_reduce
function to come later. Here we store the production number,
the body size (useful for stack management) and the resulting symbol (useful
for knowing how to free data later).
The go to table is stored in a simple array of sym
and corresponding
state
.
struct lookup {
short sym;
short state;
};
struct state {
short go_to_cnt;
const struct lookup * go_to;
short reduce_prod;
short reduce_size;
short reduce_sym;
short shift_sym;
short starts_line;
};
static void gen_goto(FILE *f, struct grammar *g)
{
int i;
fprintf(f, "#line 0 \"gen_goto\"\n");
for (i = 0; i < g->states; i++) {
int j;
fprintf(f, "static const struct lookup goto_%d[] = {\n",
i);
struct symset gt = g->statetab[i]->go_to;
for (j = 0; j < gt.cnt; j++)
fprintf(f, "\t{ %d, %d },\n",
gt.syms[j], gt.data[j]);
fprintf(f, "};\n");
}
}
static void gen_states(FILE *f, struct grammar *g)
{
int i;
fprintf(f, "#line 0 \"gen_states\"\n");
fprintf(f, "static const struct state states[] = {\n");
for (i = 0; i < g->states; i++) {
struct itemset *is = g->statetab[i];
int j, prod = -1, prod_len;
int shift_sym = -1;
int shift_len = 0, shift_remain = 0;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
int bp = item_index(itm);
struct production *pr = g->productions[p];
if (bp < pr->body_size) {
if (shift_sym < 0 ||
(shift_len == bp && shift_remain > pr->body_size - bp)) {
shift_sym = pr->body[bp]->num;
shift_len = bp;
shift_remain = pr->body_size - bp;
}
continue;
}
/* This is what we reduce */
if (prod < 0 || prod_len < pr->body_size) {
prod = p;
prod_len = pr->body_size;
}
}
if (prod >= 0)
fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
i, is->go_to.cnt, i, prod,
g->productions[prod]->body_size,
g->productions[prod]->head->num,
is->starts_line);
else
fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
i, is->go_to.cnt, i, shift_sym,
is->starts_line);
}
fprintf(f, "};\n\n");
}
do_reduce
function and the codeWhen the parser engine decides to reduce a production, it calls do_reduce
.
This has two functions.
Firstly, if a non-NULL trace
file is passed, it prints out details of the
production being reduced. Secondly it runs the code that was included with
the production if any.
This code needs to be able to store data somewhere. Rather than requiring
do_reduce
to malloc
that "somewhere", we pass in a large buffer and have
do_reduce
return the size to be saved.
The code fragment requires translation when written out. Any $N
needs to
be converted to a reference either to that buffer (if $0
) or to the
structure returned by a previous reduction. These pointers need to be cast
to the appropriate type for each access. All this is handled in
gen_code
.
static void gen_code(struct production *p, FILE *f, struct grammar *g)
{
char *c;
fprintf(f, "\t\t\t");
for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
int n;
if (*c != '$') {
fputc(*c, f);
if (*c == '\n')
fputs("\t\t\t", f);
continue;
}
c++;
if (*c < '0' || *c > '9') {
fputc(*c, f);
continue;
}
n = *c - '0';
while (c[1] >= '0' && c[1] <= '9') {
c += 1;
n = n * 10 + *c - '0';
}
if (n == 0)
fprintf(f, "(*(struct %.*s*)ret)",
p->head->struct_name.len,
p->head->struct_name.txt);
else if (n > p->body_size)
fprintf(f, "$%d", n);
else if (p->body[n-1]->type == Terminal)
fprintf(f, "(*(struct token *)body[%d])",
n-1);
else if (p->body[n-1]->struct_name.txt == NULL)
fprintf(f, "$%d", n);
else
fprintf(f, "(*(struct %.*s*)body[%d])",
p->body[n-1]->struct_name.len,
p->body[n-1]->struct_name.txt, n-1);
}
fputs("\n", f);
}
static void gen_reduce(FILE *f, struct grammar *g, char *file)
{
int i;
fprintf(f, "#line 0 \"gen_reduce\"\n");
fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
fprintf(f, "{\n");
fprintf(f, "\tint ret_size = 0;\n");
fprintf(f, "\tswitch(prod) {\n");
for (i = 0; i < g->production_count; i++) {
struct production *p = g->productions[i];
fprintf(f, "\tcase %d:\n", i);
if (p->code.txt)
gen_code(p, f, g);
if (p->head->struct_name.txt)
fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
p->head->struct_name.len,
p->head->struct_name.txt);
fprintf(f, "\t\tbreak;\n");
}
fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
}
do_free
As each non-terminal can potentially cause a different type of data structure to be allocated and filled in, we need to be able to free it when done.
It is particularly important to have fine control over freeing during error recovery where individual stack frames might need to be freed.
For this, the grammar author is required to define a free_XX
function for
each structure that is used by a non-terminal. do_free
will call whichever
is appropriate given a symbol number, and will call free
(as is
appropriate for tokens) on any terminal symbol.
static void gen_free(FILE *f, struct grammar *g)
{
int i;
fprintf(f, "#line 0 \"gen_free\"\n");
fprintf(f, "static void do_free(short sym, void *asn)\n");
fprintf(f, "{\n");
fprintf(f, "\tif (!asn) return;\n");
fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
fprintf(f, "\tswitch(sym) {\n");
for (i = 0; i < g->num_syms; i++) {
struct symbol *s = g->symtab[i];
if (!s ||
s->type != Nonterminal ||
s->struct_name.txt == NULL)
continue;
fprintf(f, "\tcase %d:\n", s->num);
fprintf(f, "\t\tfree_%.*s(asn);\n",
s->struct_name.len,
s->struct_name.txt);
fprintf(f, "\t\tbreak;\n");
}
fprintf(f, "\t}\n}\n\n");
}
There are three key parts to the "main" function of parsergen: processing the arguments, loading the grammar file, and dealing with the grammar.
Command line options allow the selection of analysis mode, name of output file, and whether or not a report should be generated. By default we create a report only if no code output was requested.
The parse_XX
function name uses the basename of the output file which
should not have a suffix (.c
). .c
is added to the given name for the
code, and .h
is added for the header (if header text is specifed in the
grammar file).
#include <getopt.h>
static const struct option long_options[] = {
{ "LR0", 0, NULL, '0' },
{ "LR05", 0, NULL, '5' },
{ "SLR", 0, NULL, 'S' },
{ "LALR", 0, NULL, 'L' },
{ "LR1", 0, NULL, '1' },
{ "report", 0, NULL, 'R' },
{ "output", 1, NULL, 'o' },
{ NULL, 0, NULL, 0 }
};
const char *options = "05SL1Ro:";
int opt;
char *outfile = NULL;
char *infile;
char *name;
int report = 1;
enum grammar_type type = LR05;
while ((opt = getopt_long(argc, argv, options,
long_options, NULL)) != -1) {
switch(opt) {
case '0':
type = LR0; break;
case '5':
type = LR05; break;
case 'S':
type = SLR; break;
case 'L':
type = LALR; break;
case '1':
type = LR1; break;
case 'R':
report = 2; break;
case 'o':
outfile = optarg; break;
default:
fprintf(stderr, "Usage: parsergen ...\n");
exit(1);
}
}
if (optind < argc)
infile = argv[optind++];
else {
fprintf(stderr, "No input file given\n");
exit(1);
}
if (outfile && report == 1)
report = 0;
name = outfile;
if (name && strchr(name, '/'))
name = strrchr(name, '/')+1;
if (optind < argc) {
fprintf(stderr, "Excess command line arguments\n");
exit(1);
}
To be able to run mdcode
and scanner
on the grammar we need to memory
map it.
Once we have extracted the code (with mdcode
) we expect to find three
sections: header, code, and grammar. Anything else is an error.
"header" and "code" are optional, though it is hard to build a working parser with neither. "grammar" must be provided.
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
static int errs;
static void pr_err(char *msg)
{
errs++;
fprintf(stderr, "%s\n", msg);
}
struct section *table;
int fd;
int len;
char *file;
fd = open(infile, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "parsergen: cannot open %s: %s\n",
infile, strerror(errno));
exit(1);
}
len = lseek(fd, 0, 2);
file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
table = code_extract(file, file+len, pr_err);
struct code_node *hdr = NULL;
struct code_node *code = NULL;
struct code_node *gram = NULL;
for (s = table; s; s = s->next) {
if (text_is(s->section, "header"))
hdr = s->code;
else if (text_is(s->section, "code"))
code = s->code;
else if (text_is(s->section, "grammar"))
gram = s->code;
else {
fprintf(stderr, "Unknown content section: %.*s\n",
s->section.len, s->section.txt);
rv |= 2;
}
}
As we need to append an extension to a filename and then open it for writing, and we need to do this twice, it helps to have a separate function.
static FILE *open_ext(char *base, char *ext)
{
char *fn = malloc(strlen(base) + strlen(ext) + 1);
FILE *f;
strcat(strcpy(fn, base), ext);
f = fopen(fn, "w");
free(fn);
return f;
}
If we can read the grammar, then we analyse and optionally report on it. If the report finds conflicts we will exit with an error status.
struct grammar *g = NULL;
if (gram == NULL) {
fprintf(stderr, "No grammar section provided\n");
rv |= 2;
} else {
g = grammar_read(gram);
if (!g) {
fprintf(stderr, "Failure to parse grammar\n");
rv |= 2;
}
}
if (g) {
grammar_analyse(g, type);
if (report)
if (grammar_report(g, type))
rv |= 1;
}
If a headers section is defined, we write it out and include a declaration
for the parse_XX
function so it can be used from separate code.
if (rv == 0 && hdr && outfile) {
FILE *f = open_ext(outfile, ".h");
if (f) {
code_node_print(f, hdr, infile);
fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
name);
fclose(f);
} else {
fprintf(stderr, "Cannot create %s.h\n",
outfile);
rv |= 4;
}
}
And if all goes well and an output file was provided, we create the .c
file with the code section (if any) and the parser tables and function.
if (rv == 0 && outfile) {
FILE *f = open_ext(outfile, ".c");
if (f) {
if (code)
code_node_print(f, code, infile);
gen_parser(f, g, infile, name);
fclose(f);
} else {
fprintf(stderr, "Cannot create %s.c\n",
outfile);
rv |= 4;
}
}
And that about wraps it up. We need to set the locale so that UTF-8 is
recognised properly, and link with libicuuc
as libmdcode
requires that.
parsergen : parsergen.o libscanner.o libmdcode.o
$(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
#include <locale.h>
int main(int argc, char *argv[])
{
struct section *s;
int rv = 0;
setlocale(LC_ALL,"");
## process arguments
## load file
## process input
return rv;
}
Having analysed the grammar and generated all the table, we only need the shift/reduce engine to bring it all together.
The parser generator has nicely provided us with goto tables sorted by symbol number. We need a binary search function to find a symbol in the table.
static int search(const struct state *l, int sym)
{
int lo = 0;
int hi = l->go_to_cnt;
if (hi == 0)
return -1;
while (lo + 1 < hi) {
int mid = (lo + hi) / 2;
if (l->go_to[mid].sym <= sym)
lo = mid;
else
hi = mid;
}
if (l->go_to[lo].sym == sym)
return l->go_to[lo].state;
else
return -1;
}
The core data structure for the parser is the stack. This tracks all the symbols that have been recognised or partially recognised.
The stack usually won't grow very large - maybe a few tens of entries. So we dynamically resize the array as required but never bother to shrink it down again.
We keep the stack as two separate allocations. One, asn_stack
stores the
"abstract syntax nodes" which are created by each reduction. When we call
do_reduce
we need to pass an array of the asn
s of the body of the
production, and by keeping a separate asn
stack, we can just pass a
pointer into this stack.
The other allocation stores all other stack fields of which there are four.
The state
is the most important one and guides the parsing process. The
sym
is nearly unnecessary. However when we want to free entries from the
asn_stack
, it helps to know what type they are so we can call the right
freeing function. The symbol leads us to the right free function through
do_free
.
The indents
count and the starts_indented
flag track the line
indents in the symbol. These are used to allow indent information to
guide parsing and error recovery.
As well as the stack of frames we have a next
frame which is
assembled from the incoming token and other information prior to
pushing it onto the stack.
struct parser {
struct frame {
short state;
short sym;
short starts_indented;
short indents;
short newline_permitted;
} *stack, next;
void **asn_stack;
int stack_size;
int tos;
};
Two operations are needed on the stack - shift (which is like push) and pop.
Shift applies not only to terminals but also to non-terminals. When we reduce a production we will pop off entries corresponding to the body symbols, then push on an item for the head of the production. This last is exactly the same process as shifting in a terminal so we use the same function for both.
To simplify other code we arrange for shift
to fail if there is no goto
state for the symbol. This is useful in basic parsing due to our design
that we shift when we can, and reduce when we cannot. So the shift
function reports if it could.
So shift
finds the next state. If that succeed it extends the allocations
if needed and pushes all the information onto the stacks.
static int shift(struct parser *p,
void *asn,
const struct state states[])
{
// Push an entry onto the stack
int newstate = search(&states[p->next.state], p->next.sym);
if (newstate < 0)
return 0;
if (p->tos >= p->stack_size) {
p->stack_size += 10;
p->stack = realloc(p->stack, p->stack_size
* sizeof(p->stack[0]));
p->asn_stack = realloc(p->asn_stack, p->stack_size
* sizeof(p->asn_stack[0]));
}
p->stack[p->tos] = p->next;
p->asn_stack[p->tos] = asn;
p->tos++;
p->next.state = newstate;
p->next.indents = 0;
p->next.starts_indented = 0;
// if new state doesn't start a line, we inherit newline_permitted status
if (states[newstate].starts_line)
p->next.newline_permitted = 1;
return 1;
}
pop
simply moves the top of stack (tos
) back down the required amount
and frees any asn
entries that need to be freed. It is called after we
reduce a production, just before we shift
the nonterminal in.
static void pop(struct parser *p, int num,
void(*do_free)(short sym, void *asn))
{
int i;
p->tos -= num;
for (i = 0; i < num; i++) {
p->next.indents += p->stack[p->tos+i].indents;
do_free(p->stack[p->tos+i].sym,
p->asn_stack[p->tos+i]);
}
if (num) {
p->next.state = p->stack[p->tos].state;
p->next.starts_indented = p->stack[p->tos].starts_indented;
p->next.newline_permitted = p->stack[p->tos].newline_permitted;
if (p->next.indents > p->next.starts_indented)
p->next.newline_permitted = 0;
}
}
The scanner
returns tokens in a local variable - we want them in allocated
memory so they can live in the asn_stack
. Similarly the asn
produced by
a reduce is in a large buffer. Both of these require some allocation and
copying, hence memdup
and tokcopy
.
#include <memory.h>
void *memdup(void *m, int len)
{
void *ret;
if (len == 0)
return NULL;
ret = malloc(len);
memcpy(ret, m, len);
return ret;
}
static struct token *tok_copy(struct token tk)
{
struct token *new = malloc(sizeof(*new));
*new = tk;
return new;
}
Now we have the parser. If we can shift, we do. If not and we can reduce we do. If the production we reduced was production zero, then we have accepted the input and can finish.
We return whatever asn
was returned by reducing production zero.
If we can neither shift nor reduce we have an error to handle. We pop
single entries off the stack until we can shift the TK_error
symbol, then
drop input tokens until we find one we can shift into the new error state.
When we find TK_in
and TK_out
tokens which report indents we need
to handle them directly as the grammar cannot express what we want to
do with them.
TK_in
tokens are easy: we simply update the next
stack frame to
record how many indents there are and that the next token started with
an indent.
TK_out
tokens must either be counted off against any pending indent,
or must force reductions until there is a pending indent which isn't
at the start of a production.
#include "parser.h"
void *parser_run(struct token_state *tokens,
const struct state states[],
int (*do_reduce)(int, void**, void*),
void (*do_free)(short, void*),
FILE *trace, const char *non_term[], int knowns)
{
struct parser p = { 0 };
struct token *tk = NULL;
int accepted = 0;
void *ret;
while (!accepted) {
struct token *err_tk;
if (!tk)
tk = tok_copy(token_next(tokens));
p.next.sym = tk->num;
if (trace)
parser_trace(trace, &p, tk, states, non_term, knowns);
if (p.next.sym == TK_in) {
p.next.starts_indented = 1;
p.next.indents = 1;
free(tk);
tk = NULL;
continue;
}
if (p.next.sym == TK_out) {
if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
(p.stack[p.tos-1].indents == 1 &&
states[p.next.state].reduce_size > 1)) {
p.stack[p.tos-1].indents -= 1;
if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
// no internal indent any more, reassess 'newline_permitted'
if (states[p.stack[p.tos-1].state].starts_line)
p.stack[p.tos-1].newline_permitted = 1;
else if (p.tos > 1)
p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
}
free(tk);
tk = NULL;
continue;
}
// fall through and force a REDUCE (as 'shift'
// will fail).
}
if (p.next.sym == TK_newline) {
if (! p.stack[p.tos-1].newline_permitted) {
free(tk);
tk = NULL;
continue;
}
}
if (shift(&p, tk, states)) {
tk = NULL;
continue;
}
if (states[p.next.state].reduce_prod >= 0) {
void **body;
int prod = states[p.next.state].reduce_prod;
int size = states[p.next.state].reduce_size;
int bufsize;
static char buf[16*1024];
p.next.sym = states[p.next.state].reduce_sym;
body = p.asn_stack +
(p.tos - states[p.next.state].reduce_size);
bufsize = do_reduce(prod, body, buf);
pop(&p, size, do_free);
shift(&p, memdup(buf, bufsize), states);
if (prod == 0)
accepted = 1;
continue;
}
if (tk->num == TK_out) {
// Indent problem - synthesise tokens to get us
// out of here.
fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
p.next.sym = states[p.next.state].shift_sym;
shift(&p, tok_copy(*tk), states);
// FIXME need to report this error somehow
continue;
}
/* Error. We walk up the stack until we
* find a state which will accept TK_error.
* We then shift in TK_error and see what state
* that takes us too.
* Then we discard input tokens until
* we find one that is acceptable.
*/
err_tk = tok_copy(*tk);
p.next.sym = TK_error;
while (shift(&p, err_tk, states) == 0
&& p.tos > 0)
// discard this state
pop(&p, 1, do_free);
if (p.tos == 0) {
free(err_tk);
// no state accepted TK_error
break;
}
while (search(&states[p.next.state], tk->num) < 0 &&
tk->num != TK_eof) {
free(tk);
tk = tok_copy(token_next(tokens));
if (tk->num == TK_in)
p.next.indents += 1;
if (tk->num == TK_out) {
if (p.next.indents == 0)
break;
p.next.indents -= 1;
}
}
if (p.tos == 0 && tk->num == TK_eof)
break;
}
free(tk);
if (accepted)
ret = p.asn_stack[0];
else
pop(&p, p.tos, do_free);
free(p.asn_stack);
free(p.stack);
return ret;
}
void *parser_run(struct token_state *tokens,
const struct state states[],
int (*do_reduce)(int, void**, void*),
void (*do_free)(short, void*),
FILE *trace, const char *non_term[], int knowns);
Being able to visualize the parser in action can be invaluable when debugging the parser code, or trying to understand how the parse of a particular grammar progresses. The stack contains all the important state, so just printing out the stack every time around the parse loop can make it possible to see exactly what is happening.
This doesn't explicitly show each SHIFT and REDUCE action. However they are easily deduced from the change between consecutive lines, and the details of each state can be found by cross referencing the states list in the stack with the "report" that parsergen can generate.
For terminal symbols, we just dump the token. For non-terminals we print the name of the symbol. The look ahead token is reported at the end inside square brackets.
/*
 * Printable names for the scanner's reserved token numbers, used when
 * tracing the parser stack.  Indexed directly by token number; any
 * reserved token not listed here is left NULL by the designated
 * initializer, and callers must check for that before printing.
 */
static char *reserved_words[] = {
	[TK_error] = "ERROR",
	[TK_in] = "IN",
	[TK_out] = "OUT",
	[TK_newline] = "NEWLINE",
	[TK_eof] = "$eof",
};
/*
 * Print a compact summary of one stack frame: "(STATE[:|.]INDENTS[s][n]) ".
 * ':' marks a frame that started indented, '.' one that did not; the
 * indent count is shown only when non-zero.  's' flags a state that
 * starts a line, 'n' a frame where newlines are permitted.
 */
static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
{
	fprintf(trace, "(%d", f->state);
	if (f->indents) {
		char sep = f->starts_indented ? ':' : '.';
		fprintf(trace, "%c%d", sep, f->indents);
	}
	if (states[f->state].starts_line)
		fputc('s', trace);
	if (f->newline_permitted)
		fputc('n', trace);
	fputs(") ", trace);
}
void parser_trace(FILE *trace, struct parser *p,
struct token *tk, const struct state states[],
const char *non_term[], int knowns)
{
int i;
for (i = 0; i < p->tos; i++) {
int sym = p->stack[i].sym;
parser_trace_state(trace, &p->stack[i], states);
if (sym < TK_reserved &&
reserved_words[sym] != NULL)
fputs(reserved_words[sym], trace);
else if (sym < TK_reserved + knowns) {
struct token *t = p->asn_stack[i];
text_dump(trace, t->txt, 20);
} else
fputs(non_term[sym - TK_reserved - knowns],
trace);
fputs(" ", trace);
}
parser_trace_state(trace, &p->next, states);
fprintf(trace, " [");
if (tk->num < TK_reserved &&
reserved_words[tk->num] != NULL)
fputs(reserved_words[tk->num], trace);
else
text_dump(trace, tk->txt, 20);
fputs("]\n", trace);
}
The obvious example for a parser is a calculator.
As scanner
provides a number parsing function using libgmp,
it is not much
work to perform arbitrary rational number calculations.
This calculator takes one expression, or one equality test, per line. The results are printed and if any equality test fails, the program exits with an error.
Embedding mdcode inside mdcode is rather horrible. I'd like to find a better approach, but as the grammar file must have 3 components I need something like this.
# Generate the calculator's parser source from its grammar file.
calc.c : parsergen calc.cgm
	./parsergen -o calc calc.cgm
# Link the calculator against the parser, scanner, mdcode and number
# support objects; libicuuc and libgmp are needed by scanner/number.
calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
	$(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
# header
~~~~~
#include "number.h"
// what do we use for a demo-grammar? A calculator of course.
struct number {
	mpq_t val;	/* the parsed value, as a GMP rational */
	char tail[2];	/* trailing characters left by number_parse — TODO confirm meaning against number.h */
	int err;	/* non-zero on parse error — NOTE(review): never read in this demo */
};
~~~~~
# code
~~~~~
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <stdio.h>
#include <malloc.h>
#include <gmp.h>
#include "mdcode.h"
#include "scanner.h"
#include "number.h"
#include "parser.h"
#include "calc.h"
/* Release a parsed number: clear the embedded GMP rational before
 * freeing the structure itself. */
static void free_number(struct number *n)
{
	mpq_clear(n->val);
	free(n);
}
/*
 * Demo driver: map the file named in argv[1] into memory, extract its
 * mdcode sections, and run the generated calculator parser over the
 * code.  Tracing is sent to stderr when any second argument is given.
 *
 * Fixes over the original: argv[1] is no longer used without checking
 * argc; open() and mmap() failures are reported instead of proceeding
 * into undefined behaviour; the magic whence value 2 is spelled
 * SEEK_END.
 */
int main(int argc, char *argv[])
{
	int fd;
	int len;
	char *file;
	struct section *s;
	struct token_config config = {
		.ignored = (1 << TK_line_comment)
			 | (1 << TK_block_comment)
			 | (1 << TK_in)
			 | (1 << TK_out),
		.number_chars = ".,_+-",
		.word_start = "",
		.word_cont = "",
	};

	if (argc < 2) {
		fprintf(stderr, "Usage: %s file [trace]\n", argv[0]);
		exit(1);
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror(argv[1]);
		exit(1);
	}
	len = lseek(fd, 0, SEEK_END);
	file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
	if (file == MAP_FAILED) {
		perror("mmap");
		exit(1);
	}
	s = code_extract(file, file+len, NULL);
	parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
	exit(0);
}
~~~~~
# grammar
~~~~~
Session -> Session Line
| Line
Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
{ mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
gmp_printf(" or as a decimal: %Fg\n", fl);
mpf_clear(fl);
}
}$
| Expression = Expression NEWLINE ${
if (mpq_equal($1.val, $3.val))
gmp_printf("Both equal %Qd\n", $1.val);
else {
gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
$1.val, $3.val);
exit(1);
}
}$
| NEWLINE ${ printf("Blank line\n"); }$
| ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
$number
Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
| Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
| Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
| Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
| Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$