1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h`. files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
69 [scanner]: scanner.html
75 ###### parser includes
79 The grammar contains production sets, precedence/associativity
80 declarations, and data type declarations. These are all parsed with
81 _ad hoc_ parsing as we don't have a parser generator yet.
83 The precedence and associativity can be set for each production, but
84 can be inherited from symbols. The data type is potentially defined
85 for each non-terminal and describes what C structure is used to store
86 information about each symbol.
89 enum assoc {Left, Right, Non};
90 char *assoc_names[] = {"Left","Right","Non"};
93 struct text struct_name;
95 unsigned short precedence;
99 unsigned short precedence;
107 The strings reported by `mdcode` and `scanner` are `struct text` which have
108 length rather than being null terminated. To help with printing and
109 comparing we define `text_is` and `prtxt`, which should possibly go in
110 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
111 which might contain control characters.
113 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
114 because that is what we need to detect tags.
117 static int text_is(struct text t, char *s)
119 return (strlen(s) == t.len &&
120 strncmp(s, t.txt, t.len) == 0);
122 static void prtxt(struct text t)
124 printf("%.*s", t.len, t.txt);
127 static int strip_tag(struct text *t, char *tag)
129 int skip = strlen(tag) + 1;
130 if (skip >= t->len ||
131 strncmp(t->txt, tag, skip-1) != 0 ||
132 t->txt[skip-1] != ':')
134 while (skip < t->len && t->txt[skip] == ' ')
143 Productions are comprised primarily of symbols - terminal and
144 non-terminal. We do not make a syntactic distinction between the two,
145 though non-terminals must be identifiers. Non-terminal symbols are
146 those which appear in the head of a production, terminal symbols are
147 those which don't. There are also "virtual" symbols used for precedence
148 marking discussed later, and sometimes we won't know what type a symbol
151 ###### forward declarations
152 enum symtype { Unknown, Virtual, Terminal, Nonterminal };	// a symbol is Unknown until a defining use classifies it
153 char *symtypes = "UVTN";	// one-character tag per symtype, indexed by the enum value
157 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
158 table of known symbols and the resulting parser will report them as
159 `TK_reserved + N`. A small set of identifiers are reserved for the
160 different token types that `scanner` can report.
163 static char *reserved_words[] = {
164 [TK_error] = "ERROR",
165 [TK_number] = "NUMBER",
166 [TK_ident] = "IDENTIFIER",
168 [TK_string] = "STRING",
169 [TK_multi_string] = "MULTI_STRING",
172 [TK_newline] = "NEWLINE",
178 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
179 recognised. The former is automatically expected at the end of the text
180 being parsed. The latter are always ignored by the parser.
182 All of these symbols are stored in a simple symbol table. We use the
183 `struct text` from `mdcode` to store the name and link them together
184 into a sorted list using an insertion sort.
186 We don't have separate `find` and `insert` functions as any symbol we
187 find needs to be remembered. We simply expect `find` to always return a
188 symbol, but its type might be `Unknown`.
197 ###### grammar fields
202 static int text_cmp(struct text a, struct text b)
207 int cmp = strncmp(a.txt, b.txt, len);
211 return a.len - b.len;
214 static struct symbol *sym_find(struct grammar *g, struct text s)
216 struct symbol **l = &g->syms;
221 (cmp = text_cmp((*l)->name, s)) < 0)
225 n = calloc(1, sizeof(*n));
234 static void symbols_init(struct grammar *g)
236 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
238 for (i = 0; i < entries; i++) {
241 t.txt = reserved_words[i];
244 t.len = strlen(t.txt);
251 ### Data types and precedence.
253 Data type specification and precedence specification are both
254 introduced by a dollar sign at the start of the line. If the next
255 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
256 precedence, otherwise it specifies a data type.
258 The data type name is simply stored and applied to the head of all
259 subsequent productions. It must be the name of a structure, so `$expr`
260 maps to `struct expr`.
262 Any productions given before the first data type will have no data type
263 and can carry no information. In order to allow other non-terminals to
264 have no type, the data type `$void` can be given. This does *not* mean
265 that `struct void` will be used, but rather that no type will be
266 associated with future non-terminals.
268 The precedence line must contain a list of symbols - typically
269 terminal symbols, but not necessarily. It can only contain symbols
270 that have not been seen yet, so precedence declaration must precede
271 the productions that they affect.
273 A precedence line may also contain a "virtual" symbol which is an
274 identifier preceded by `$$`. Virtual terminals carry precedence
275 information but are not included in the grammar. A production can
276 declare that it inherits the precedence of a given virtual symbol.
278 This use for `$$` precludes it from being used as a symbol in the
279 described language. Two other symbols: `${` and `}$` are also
282 Each new precedence line introduces a new precedence level and
283 declares how it associates. This level is stored in each symbol
284 listed and may be inherited by any production which uses the symbol. A
285 production inherits from the last symbol which has a precedence.
287 ###### grammar fields
288 struct text current_type;
292 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };	// token numbers for the three marks below, in matching order
293 static const char *known[] = { "$$", "${", "}$" };	// "$$" -> TK_virtual, "${" -> TK_open, "}$" -> TK_close
296 static char *dollar_line(struct token_state *ts, struct grammar *g)
298 struct token t = token_next(ts);
303 if (t.num != TK_ident) {
304 err = "type or assoc expected after '$'";
307 if (text_is(t.txt, "LEFT"))
309 else if (text_is(t.txt, "RIGHT"))
311 else if (text_is(t.txt, "NON"))
314 g->current_type = t.txt;
315 if (text_is(t.txt, "void"))
316 g->current_type.txt = NULL;
318 if (t.num != TK_newline) {
319 err = "Extra tokens after type name";
325 // This is a precedence line, need some symbols.
329 while (t.num != TK_newline) {
330 enum symtype type = Terminal;
332 if (t.num == TK_virtual) {
335 if (t.num != TK_ident) {
336 err = "$$ must be followed by a word";
339 } else if (t.num != TK_ident &&
341 err = "Illegal token in precedence line";
344 s = sym_find(g, t.txt);
345 if (s->type != Unknown) {
346 err = "Symbols in precedence line must not already be known.";
350 s->precedence = g->prec_levels;
355 err = "No symbols given on precedence line";
359 while (t.num != TK_newline && t.num != TK_eof)
366 A production either starts with an identifier which is the head
367 non-terminal, or a vertical bar (`|`) in which case this production
368 uses the same head as the previous one. The identifier must be
369 followed by a `->` mark. All productions for a given non-terminal must
370 be grouped together with the `nonterminal ->` given only once.
372 After this a (possibly empty) sequence of identifiers and marks form
373 the body of the production. A virtual symbol may be given after the
374 body by preceding it with `$$`. If a virtual symbol is given, the
375 precedence of the production is that for the virtual symbol. If none
376 is given, the precedence is inherited from the last symbol in the
377 production which has a precedence specified.
379 After the optional precedence may come the `${` mark. This indicates
380 the start of a code fragment. If present, this must be on the same
381 line as the start of the production.
383 All of the text from the `${` through to the matching `}$` is
384 collected and forms the code-fragment for the production. It must all
385 be in one `code_node` of the literate code. The `}$` must be
386 at the end of a line.
388 Text in the code fragment will undergo substitutions where `$N` for
389 some numeric `N` will be replaced with a variable holding the parse
390 information for the particular symbol in the production. `$0` is the
391 head of the production, `$1` is the first symbol of the body, etc.
392 The type of `$N` for a terminal symbol is `struct token`. For
393 a non-terminal, it is whatever has been declared for that symbol.
395 While building productions we will need to add to an array which needs to
399 static void array_add(void *varray, int *cnt, void *new)
401 void ***array = varray;
404 current = ((*cnt-1) | (step-1))+1;
405 if (*cnt == current) {
408 *array = realloc(*array, current * sizeof(void*));
410 (*array)[*cnt] = new;
414 Collecting the code fragment simply involves reading tokens until we
415 hit the end token `}$`, and noting the character position of the start and
419 static struct text collect_code(struct token_state *state,
424 code.txt = start.txt.txt + start.txt.len;
426 t = token_next(state);
427 while (t.node == start.node &&
428 t.num != TK_close && t.num != TK_error &&
430 if (t.num == TK_close && t.node == start.node)
431 code.len = t.txt.txt - code.txt;
437 Now we have all the bits we need to parse a full production.
442 ###### grammar fields
443 struct production **productions;
444 int production_count;
446 ###### production fields
448 struct symbol **body;
453 int first_production;
456 static char *parse_production(struct grammar *g,
458 struct token_state *state)
460 /* Head has already been parsed. */
463 struct production p, *pp;
465 memset(&p, 0, sizeof(p));
467 tk = token_next(state);
468 while (tk.num == TK_ident || tk.num == TK_mark) {
469 struct symbol *bs = sym_find(g, tk.txt);
470 if (bs->type == Unknown)
472 if (bs->type == Virtual) {
473 err = "Virtual symbol not permitted in production";
476 if (bs->precedence) {
477 p.precedence = bs->precedence;
480 array_add(&p.body, &p.body_size, bs);
481 tk = token_next(state);
483 if (tk.num == TK_virtual) {
485 tk = token_next(state);
486 if (tk.num != TK_ident) {
487 err = "word required after $$";
490 vs = sym_find(g, tk.txt);
491 if (vs->type != Virtual) {
492 err = "symbol after $$ must be virtual";
495 p.precedence = vs->precedence;
497 tk = token_next(state);
499 if (tk.num == TK_open) {
500 p.code = collect_code(state, tk);
501 if (p.code.txt == NULL) {
502 err = "code fragment not closed properly";
505 tk = token_next(state);
507 if (tk.num != TK_newline && tk.num != TK_eof) {
508 err = "stray tokens at end of line";
511 pp = malloc(sizeof(*pp));
513 array_add(&g->productions, &g->production_count, pp);
516 while (tk.num != TK_newline && tk.num != TK_eof)
517 tk = token_next(state);
521 With the ability to parse production and dollar-lines, we have nearly all
522 that we need to parse a grammar from a `code_node`.
524 The head of the first production will effectively be the `start` symbol of
525 the grammar. However it won't _actually_ be so. Processing the grammar is
526 greatly simplified if the real start symbol only has a single production,
527 and expects `$eof` as the final terminal. So when we find the first
528 explicit production we insert an extra production as production zero which
531 ###### Example: production 0
534 where `START` is the first non-terminal given.
536 ###### create production zero
537 struct production *p = calloc(1,sizeof(*p));	// production zero: "$start -> START $eof"
538 struct text start = {"$start",6};	// name of the synthetic start symbol (length given explicitly)
539 struct text eof = {"$eof",4};	// name of the synthetic end-of-input terminal
540 p->head = sym_find(g, start);	// sym_find creates the symbol if it is not yet known
541 p->head->type = Nonterminal;
542 array_add(&p->body, &p->body_size, head);	// 'head' is the first explicitly-given non-terminal
543 array_add(&p->body, &p->body_size, sym_find(g, eof));	// body ends with "$eof"
544 p->head->first_production = g->production_count;	// "$start" owns production number zero
545 array_add(&g->productions, &g->production_count, p);
547 Now we are ready to read in the grammar.
550 static struct grammar *grammar_read(struct code_node *code)
552 struct token_config conf = {
555 .words_marks = known,
556 .known_count = sizeof(known)/sizeof(known[0]),
558 .ignored = (1 << TK_line_comment)
559 | (1 << TK_block_comment)
562 | (1 << TK_multi_string)
567 struct token_state *state = token_open(code, &conf);
569 struct symbol *head = NULL;
573 g = calloc(1, sizeof(*g));
576 for (tk = token_next(state); tk.num != TK_eof;
577 tk = token_next(state)) {
578 if (tk.num == TK_newline)
580 if (tk.num == TK_ident) {
582 head = sym_find(g, tk.txt);
583 if (head->type == Nonterminal)
584 err = "This non-terminal has already be used.";
585 else if (head->type == Virtual)
586 err = "Virtual symbol not permitted in head of production";
588 head->type = Nonterminal;
589 head->struct_name = g->current_type;
590 if (g->production_count == 0) {
591 ## create production zero
593 head->first_production = g->production_count;
594 tk = token_next(state);
595 if (tk.num == TK_mark &&
596 text_is(tk.txt, "->"))
597 err = parse_production(g, head, state);
599 err = "'->' missing in production";
601 } else if (tk.num == TK_mark
602 && text_is(tk.txt, "|")) {
603 // another production for same non-term
605 err = parse_production(g, head, state);
607 err = "First production must have a head";
608 } else if (tk.num == TK_mark
609 && text_is(tk.txt, "$")) {
610 err = dollar_line(state, g);
612 err = "Unrecognised token at start of line.";
620 fprintf(stderr, "Error at line %d: %s\n",
627 ## Analysing the grammar
629 The central task in analysing the grammar is to determine a set of
630 states to drive the parsing process. This will require finding
631 various sets of symbols and of "items". Some of these sets will need
632 to attach information to each element in the set, so they are more
635 Each "item" may have a 'look-ahead' or `LA` set associated with
636 it. Many of these will be the same as each other. To avoid memory
637 wastage, and to simplify some comparisons of sets, these sets will be
638 stored in a list of unique sets, each assigned a number.
640 Once we have the data structures in hand to manage these sets and
641 lists, we can start setting the 'nullable' flag, build the 'FIRST'
642 sets, and then create the item sets which define the various states.
646 Though we don't only store symbols in these sets, they are the main
647 things we store, so they are called symbol sets or "symsets".
649 A symset has a size, an array of shorts, and an optional array of data
650 values, which are also shorts. If the array of data is not present,
651 we store an impossible pointer, as `NULL` is used to indicate that no
652 memory has been allocated yet.
657 unsigned short *syms, *data;
659 #define NO_DATA ((unsigned short *)1)
660 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
661 const struct symset INIT_DATASET = { 0, NULL, NULL };
663 The arrays of shorts are allocated in blocks of 8 and are kept sorted
664 by using an insertion sort. We don't explicitly record the amount of
665 allocated space as it can be derived directly from the current `cnt` using
666 `((cnt - 1) | 7) + 1`.
669 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
672 int current = ((s->cnt-1) | 7) + 1;
673 if (current == s->cnt) {
675 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
676 if (s->data != NO_DATA)
677 s->data = realloc(s->data, sizeof(*s->data) * current);
680 while (i > 0 && s->syms[i-1] > key) {
681 s->syms[i] = s->syms[i-1];
682 if (s->data != NO_DATA)
683 s->data[i] = s->data[i-1];
687 if (s->data != NO_DATA)
692 Finding a symbol (or item) in a `symset` uses a simple binary search.
693 We return the index where the value was found (so data can be accessed),
694 or `-1` to indicate failure.
696 static int symset_find(struct symset *ss, unsigned short key)
703 while (lo + 1 < hi) {
704 int mid = (lo + hi) / 2;
705 if (ss->syms[mid] <= key)
710 if (ss->syms[lo] == key)
715 We will often want to form the union of two symsets. When we do, we
716 will often want to know if anything changed (as they might mean there
717 is more work to do). So `symset_union` reports whether anything was
718 added to the first set. We use a slow quadratic approach as these
719 sets don't really get very big. If profiling shows this to be a problem it
720 can be optimised later.
722 static int symset_union(struct symset *a, struct symset *b)
726 for (i = 0; i < b->cnt; i++)
727 if (symset_find(a, b->syms[i]) < 0) {
728 unsigned short data = 0;
729 if (b->data != NO_DATA)
731 symset_add(a, b->syms[i], data);
737 And of course we must be able to free a symset.
739 static void symset_free(struct symset ss)
742 if (ss.data != NO_DATA)
748 Some symsets are simply stored somewhere appropriate in the `grammar`
749 data structure, others need a bit of indirection. This applies
750 particularly to "LA" sets. These will be paired with "items" in an
751 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
752 stored in the `data` field, so we need a mapping from a small (short)
753 number to an LA `symset`.
755 As mentioned earlier this involves creating a list of unique symsets.
757 For now, we just use a linear list sorted by insertion. A skiplist
758 would be more efficient and may be added later.
765 struct setlist *next;
768 ###### grammar fields
769 struct setlist *sets;
774 static int ss_cmp(struct symset a, struct symset b)
777 int diff = a.cnt - b.cnt;
780 for (i = 0; i < a.cnt; i++) {
781 diff = (int)a.syms[i] - (int)b.syms[i];
788 static int save_set(struct grammar *g, struct symset ss)
790 struct setlist **sl = &g->sets;
794 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
801 s = malloc(sizeof(*s));
810 Finding a set by number is currently performed by a simple linear search.
811 If this turns out to hurt performance, we can store the sets in a dynamic
812 array like the productions.
814 static struct symset set_find(struct grammar *g, int num)
816 struct setlist *sl = g->sets;
817 while (sl && sl->num != num)
823 ### Setting `nullable`
825 We set `nullable` on the head symbol for any production for which all
826 the body symbols (if any) are nullable. As this is a recursive
827 definition, any change in the `nullable` setting means that we need to
828 re-evaluate where it needs to be set.
830 We simply loop around performing the same calculations until no more
837 static void set_nullable(struct grammar *g)
840 while (check_again) {
843 for (p = 0; p < g->production_count; p++) {
844 struct production *pr = g->productions[p];
847 if (pr->head->nullable)
849 for (s = 0; s < pr->body_size; s++)
850 if (! pr->body[s]->nullable)
852 if (s == pr->body_size) {
853 pr->head->nullable = 1;
860 ### Setting `can_eol`
862 In order to be able to ignore newline tokens when not relevant, but
863 still include them in the parse when needed, we will need to know
864 which states can start a "line-like" section of code. We ignore
865 newlines when there is an indent since the most recent start of a
868 To know what is line-like, we first need to know which symbols can end
869 a line-like section, which is precisely those which can end with a
870 newline token. These symbols don't necessarily always end with a
871 newline, but they can. Hence they are not described as "lines" but
874 Clearly the `TK_newline` token can end with a newline. Any symbol
875 which is the head of a production that contains a line-ending symbol
876 followed only by nullable symbols is also a line-ending symbol. We
877 use a new field `can_eol` to record this attribute of symbols, and
878 compute it in a repetitive manner similar to `set_nullable`.
884 static void set_can_eol(struct grammar *g)
887 g->symtab[TK_newline]->can_eol = 1;
888 while (check_again) {
891 for (p = 0; p < g->production_count; p++) {
892 struct production *pr = g->productions[p];
895 if (pr->head->can_eol)
898 for (s = pr->body_size - 1; s >= 0; s--) {
899 if (pr->body[s]->can_eol) {
900 pr->head->can_eol = 1;
904 if (!pr->body[s]->nullable)
911 ### Building the `first` sets
913 When calculating what can follow a particular non-terminal, we will need to
914 know what the "first" terminal in any subsequent non-terminal might be. So
915 we calculate the `first` set for every non-terminal and store them in an
916 array. We don't bother recording the "first" set for terminals as they are
919 As the "first" for one symbol might depend on the "first" of another,
920 we repeat the calculations until no changes happen, just like with
921 `set_nullable`. This makes use of the fact that `symset_union`
922 reports if any change happens.
924 The core of this which finds the "first" of part of a production body
925 will be reused for computing the "follow" sets, so we split it out
926 into a separate function.
928 ###### grammar fields
929 struct symset *first;
933 static int add_first(struct production *pr, int start,
934 struct symset *target, struct grammar *g,
939 for (s = start; s < pr->body_size; s++) {
940 struct symbol *bs = pr->body[s];
941 if (bs->type == Terminal) {
942 if (symset_find(target, bs->num) < 0) {
943 symset_add(target, bs->num, 0);
947 } else if (symset_union(target, &g->first[bs->num]))
953 *to_end = (s == pr->body_size);
957 static void build_first(struct grammar *g)
961 g->first = calloc(g->num_syms, sizeof(g->first[0]));
962 for (s = 0; s < g->num_syms; s++)
963 g->first[s] = INIT_SYMSET;
965 while (check_again) {
968 for (p = 0; p < g->production_count; p++) {
969 struct production *pr = g->productions[p];
970 struct symset *head = &g->first[pr->head->num];
972 if (add_first(pr, 0, head, g, NULL))
978 ### Building the `follow` sets.
980 There are two different situations when we will want to generate "follow"
981 sets. If we are doing an SLR analysis, we want to generate a single
982 "follow" set for each non-terminal in the grammar. That is what is
983 happening here. If we are doing an LALR or LR analysis we will want
984 to generate a separate "LA" set for each item. We do that later
987 There are two parts to generating a "follow" set. Firstly we look at
988 every place that any non-terminal appears in the body of any
989 production, and we find the set of possible "first" symbols after
990 there. This is done using `add_first` above and only needs to be done
991 once as the "first" sets are now stable and will not change.
995 for (p = 0; p < g->production_count; p++) {
996 struct production *pr = g->productions[p];
999 for (b = 0; b < pr->body_size - 1; b++) {
1000 struct symbol *bs = pr->body[b];
1001 if (bs->type == Terminal)
1003 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1007 The second part is to add the "follow" set of the head of a production
1008 to the "follow" sets of the final symbol in the production, and any
1009 other symbol which is followed only by `nullable` symbols. As this
1010 depends on "follow" itself we need to repeatedly perform the process
1011 until no further changes happen.
1015 for (again = 0, p = 0;
1016 p < g->production_count;
1017 p < g->production_count-1
1018 ? p++ : again ? (again = 0, p = 0)
1020 struct production *pr = g->productions[p];
1023 for (b = pr->body_size - 1; b >= 0; b--) {
1024 struct symbol *bs = pr->body[b];
1025 if (bs->type == Terminal)
1027 if (symset_union(&g->follow[bs->num],
1028 &g->follow[pr->head->num]))
1035 We now just need to create and initialise the `follow` list to get a
1038 ###### grammar fields
1039 struct symset *follow;
1042 static void build_follow(struct grammar *g)
1045 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1046 for (s = 0; s < g->num_syms; s++)
1047 g->follow[s] = INIT_SYMSET;
1051 ### Building itemsets and states
1053 There are three different levels of detail that can be chosen for
1054 building the itemsets and states for the LR grammar. They are:
1056 1. LR(0) or SLR(1), where no look-ahead is considered.
1057 2. LALR(1) where we build look-ahead sets with each item and merge
1058 the LA sets when we find two paths to the same "kernel" set of items.
1059 3. LR(1) where different look-ahead for any item in the set means
1060 a different state must be created.
1062 ###### forward declarations
1063 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1065 We need to be able to look through existing states to see if a newly
1066 generated state already exists. For now we use a simple sorted linked
1069 An item is a pair of numbers: the production number and the position of
1070 "DOT", which is an index into the body of the production.
1071 As the numbers are not enormous we can combine them into a single "short"
1072 and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1073 production number (so 2000 productions with a maximum of 31 symbols in the
1076 Comparing the itemsets will be a little different to comparing symsets
1077 as we want to do the lookup after generating the "kernel" of an
1078 itemset, so we need to ignore the offset=zero items which are added during
1081 To facilitate this, we modify the "DOT" number so that "0" sorts to
1082 the end of the list in the symset, and then only compare items before
1086 static inline unsigned short item_num(int production, int index)
1088 return production | ((31-index) << 11);
1090 static inline int item_prod(unsigned short item)
1092 return item & 0x7ff;
1094 static inline int item_index(unsigned short item)
1096 return (31-(item >> 11)) & 0x1f;
1099 For LR(1) analysis we need to compare not just the itemset in a state
1100 but also the LA sets. As we assign each unique LA set a number, we
1101 can just compare the symset and the data values together.
1104 static int itemset_cmp(struct symset a, struct symset b,
1105 enum grammar_type type)
1111 i < a.cnt && i < b.cnt &&
1112 item_index(a.syms[i]) > 0 &&
1113 item_index(b.syms[i]) > 0;
1115 int diff = a.syms[i] - b.syms[i];
1119 diff = a.data[i] - b.data[i];
1124 if (i == a.cnt || item_index(a.syms[i]) == 0)
1128 if (i == b.cnt || item_index(b.syms[i]) == 0)
1134 if (type < LR1 || av == -1)
1137 a.data[i] - b.data[i];
1140 And now we can build the list of itemsets. The lookup routine returns
1141 both a success flag and a pointer to where in the list an insert
1142 should happen, so we don't need to search a second time.
1146 struct itemset *next;
1148 struct symset items;
1149 struct symset go_to;
1154 ###### grammar fields
1155 struct itemset *items;
1159 static int itemset_find(struct grammar *g, struct itemset ***where,
1160 struct symset kernel, enum grammar_type type)
1162 struct itemset **ip;
1164 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1165 struct itemset *i = *ip;
1167 diff = itemset_cmp(i->items, kernel, type);
1180 Adding an itemset may require merging the LA sets if LALR analysis is
1181 happening. If any new LA set adds symbols that weren't in the old LA set, we
1182 clear the `completed` flag so that the dependants of this itemset will be
1183 recalculated and their LA sets updated.
1185 `add_itemset` must consume the symsets it is passed, either by adding
1186 them to a data structure, or freeing them.
1188 static int add_itemset(struct grammar *g, struct symset ss,
1189 enum grammar_type type, int starts_line)
1191 struct itemset **where, *is;
1193 int found = itemset_find(g, &where, ss, type);
1195 is = calloc(1, sizeof(*is));
1196 is->state = g->states;
1200 is->go_to = INIT_DATASET;
1201 is->starts_line = starts_line;
1210 for (i = 0; i < ss.cnt; i++) {
1211 struct symset temp = INIT_SYMSET, s;
1212 if (ss.data[i] == is->items.data[i])
1214 s = set_find(g, is->items.data[i]);
1215 symset_union(&temp, &s);
1216 s = set_find(g, ss.data[i]);
1217 if (symset_union(&temp, &s)) {
1218 is->items.data[i] = save_set(g, temp);
1229 To build all the itemsets, we first insert the initial itemset made from the
1230 start symbol, complete each itemset, and then generate new itemsets from old
1231 until no new ones can be made.
1233 Completing an itemset means finding all the items where "DOT" is followed by
1234 a nonterminal and adding "DOT=0" items for every production from that
1235 non-terminal - providing each item hasn't already been added.
1237 If LA sets are needed, the LA set for each new item is found using
1238 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1239 be supplemented by the LA set for the item which produces the new item.
1241 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1242 is used in the next stage.
1244 NOTE: precedence handling should happen here - I haven't written this yet
1247 ###### complete itemset
1248 for (i = 0; i < is->items.cnt; i++) {
1249 int p = item_prod(is->items.syms[i]);
1250 int bs = item_index(is->items.syms[i]);
1251 struct production *pr = g->productions[p];
1254 struct symset LA = INIT_SYMSET;
1255 unsigned short sn = 0;
1257 if (bs == pr->body_size)
1260 if (symset_find(&done, s->num) < 0)
1261 symset_add(&done, s->num, 0);
1262 if (s->type != Nonterminal)
1268 add_first(pr, bs+1, &LA, g, &to_end);
1270 struct symset ss = set_find(g, is->items.data[i]);
1271 symset_union(&LA, &ss);
1273 sn = save_set(g, LA);
1274 LA = set_find(g, sn);
1277 /* Add productions for this symbol */
1278 for (p2 = s->first_production;
1279 p2 < g->production_count &&
1280 g->productions[p2]->head == s;
1282 int itm = item_num(p2, 0);
1283 int pos = symset_find(&is->items, itm);
1285 symset_add(&is->items, itm, sn);
1286 /* Will have re-ordered, so start
1287 * from beginning again */
1289 } else if (type >= LALR) {
1290 struct symset ss = set_find(g, is->items.data[pos]);
1291 struct symset tmp = INIT_SYMSET;
1293 symset_union(&tmp, &ss);
1294 if (symset_union(&tmp, &LA)) {
1295 is->items.data[pos] = save_set(g, tmp);
1303 For each symbol we found (and placed in `done`) we collect all the items for
1304 which this symbol is next, and create a new itemset with "DOT" advanced over
1305 the symbol. This is then added to the collection of itemsets (or merged
1306 with a pre-existing itemset).
1308 ###### derive itemsets
1309 // Now we have a completed itemset, so we need to
1310 // compute all the 'next' itemsets and create them
1311 // if they don't exist.
1312 for (i = 0; i < done.cnt; i++) {
1314 unsigned short state;
1315 int starts_line = 0;
1316 struct symbol *sym = g->symtab[done.syms[i]];
1317 struct symset newitemset = INIT_SYMSET;
1319 newitemset = INIT_DATASET;
1322 (sym->nullable && is->starts_line))
1324 for (j = 0; j < is->items.cnt; j++) {
1325 int itm = is->items.syms[j];
1326 int p = item_prod(itm);
1327 int bp = item_index(itm);
1328 struct production *pr = g->productions[p];
1329 unsigned short la = 0;
1332 if (bp == pr->body_size)
1334 if (pr->body[bp] != sym)
1337 la = is->items.data[j];
1338 pos = symset_find(&newitemset, pr->head->num);
1340 symset_add(&newitemset, item_num(p, bp+1), la);
1341 else if (type >= LALR) {
1342 // Need to merge la set.
1343 int la2 = newitemset.data[pos];
1345 struct symset ss = set_find(g, la2);
1346 struct symset LA = INIT_SYMSET;
1347 symset_union(&LA, &ss);
1348 ss = set_find(g, la);
1349 if (symset_union(&LA, &ss))
1350 newitemset.data[pos] = save_set(g, LA);
1356 state = add_itemset(g, newitemset, type, starts_line);
1357 if (symset_find(&is->go_to, done.syms[i]) < 0)
1358 symset_add(&is->go_to, done.syms[i], state);
1361 All that is left is to create the initial itemset from production zero, and
1362 with `TK_eof` as the LA set.
1365 static void build_itemsets(struct grammar *g, enum grammar_type type)
1367 struct symset first = INIT_SYMSET;
1370 unsigned short la = 0;
1372 // LA set just has eof
1373 struct symset eof = INIT_SYMSET;
1374 symset_add(&eof, TK_eof, 0);
1375 la = save_set(g, eof);
1376 first = INIT_DATASET;
1378 // production 0, offset 0 (with no data)
1379 symset_add(&first, item_num(0, 0), la);
1380 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1381 for (again = 0, is = g->items;
1383 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1385 struct symset done = INIT_SYMSET;
1396 ### Completing the analysis.
1398 The exact process of analysis depends on which level was requested. For
1399 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1400 `LR1` we need first, but not follow. For `SLR` we need both.
1402 We don't build the "action" tables that you might expect as the parser
1403 doesn't use them. They will be calculated to some extent if needed for
1406 Once we have built everything we allocate arrays for the two lists:
1407 symbols and itemsets. This allows more efficient access during reporting.
1408 The symbols are grouped as terminals and non-terminals and we record the
1409 changeover point in `first_nonterm`.
1411 ###### grammar fields
1412 struct symbol **symtab;
1413 struct itemset **statetab;
1416 ###### grammar_analyse
1418 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1422 int snum = TK_reserved;
1423 for (s = g->syms; s; s = s->next)
1424 if (s->num < 0 && s->type == Terminal) {
1428 g->first_nonterm = snum;
1429 for (s = g->syms; s; s = s->next)
1435 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1436 for (s = g->syms; s; s = s->next)
1437 g->symtab[s->num] = s;
1447 build_itemsets(g, type);
1449 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1450 for (is = g->items; is ; is = is->next)
1451 g->statetab[is->state] = is;
1454 ## Reporting on the Grammar
1456 The purpose of the report is to give the grammar developer insight into
1457 how the grammar parser will work. It is basically a structured dump of
1458 all the tables that have been generated, plus a description of any conflicts.
1460 ###### grammar_report
1461 static int grammar_report(struct grammar *g, enum grammar_type type)
1467 return report_conflicts(g, type);
1470 Firstly we have the complete list of symbols, together with the "FIRST"
1471 set if that was generated.
1475 static void report_symbols(struct grammar *g)
1479 printf("SYMBOLS + FIRST:\n");
1481 printf("SYMBOLS:\n");
1483 for (n = 0; n < g->num_syms; n++) {
1484 struct symbol *s = g->symtab[n];
1488 printf(" %c%c%3d%c: ",
1489 s->nullable ? '.':' ',
1490 s->can_eol ? '>':' ',
1491 s->num, symtypes[s->type]);
1494 printf(" (%d%s)", s->precedence,
1495 assoc_names[s->assoc]);
1497 if (g->first && s->type == Nonterminal) {
1500 for (j = 0; j < g->first[n].cnt; j++) {
1503 prtxt(g->symtab[g->first[n].syms[j]]->name);
1510 Then we have the follow sets if they were computed.
1512 static void report_follow(struct grammar *g)
1515 printf("FOLLOW:\n");
1516 for (n = 0; n < g->num_syms; n++)
1517 if (g->follow[n].cnt) {
1521 prtxt(g->symtab[n]->name);
1522 for (j = 0; j < g->follow[n].cnt; j++) {
1525 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1531 And finally the item sets. These include the GO TO tables and, for
1532 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1533 it up a bit. First the items, with production number and associativity.
1535 static void report_item(struct grammar *g, int itm)
1537 int p = item_prod(itm);
1538 int dot = item_index(itm);
1539 struct production *pr = g->productions[p];
1543 prtxt(pr->head->name);
1545 for (i = 0; i < pr->body_size; i++) {
1546 printf(" %s", (dot == i ? ". ": ""));
1547 prtxt(pr->body[i]->name);
1549 if (dot == pr->body_size)
1553 printf(" (%d%s)", pr->precedence,
1554 assoc_names[pr->assoc]);
1558 The LA sets which are (possibly) reported with each item:
1560 static void report_la(struct grammar *g, int lanum)
1562 struct symset la = set_find(g, lanum);
1566 printf(" LOOK AHEAD(%d)", lanum);
1567 for (i = 0; i < la.cnt; i++) {
1570 prtxt(g->symtab[la.syms[i]]->name);
1575 Then the go to sets:
1578 static void report_goto(struct grammar *g, struct symset gt)
1583 for (i = 0; i < gt.cnt; i++) {
1585 prtxt(g->symtab[gt.syms[i]]->name);
1586 printf(" -> %d\n", gt.data[i]);
1590 Now we can report all the item sets complete with items, LA sets, and GO TO.
1592 static void report_itemsets(struct grammar *g)
1595 printf("ITEM SETS(%d)\n", g->states);
1596 for (s = 0; s < g->states; s++) {
1598 struct itemset *is = g->statetab[s];
1599 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1600 for (j = 0; j < is->items.cnt; j++) {
1601 report_item(g, is->items.syms[j]);
1602 if (is->items.data != NO_DATA)
1603 report_la(g, is->items.data[j]);
1605 report_goto(g, is->go_to);
1609 ### Reporting conflicts
1611 Conflict detection varies a lot among different analysis levels. However
1612 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1613 LR1 are also similar as they have FOLLOW or LA sets.
1617 ## conflict functions
1619 static int report_conflicts(struct grammar *g, enum grammar_type type)
1622 printf("Conflicts:\n");
1624 cnt = conflicts_lr0(g, type);
1626 cnt = conflicts_slr(g, type);
1628 printf(" - no conflicts\n");
1632 LR0 conflicts are any state which has both a reducible item and
1635 LR05 conflicts only occur if two possible reductions exist,
1636 as shifts always over-ride reductions.
1638 ###### conflict functions
1639 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1643 for (i = 0; i < g->states; i++) {
1644 struct itemset *is = g->statetab[i];
1645 int last_reduce = -1;
1646 int prev_reduce = -1;
1647 int last_shift = -1;
1651 for (j = 0; j < is->items.cnt; j++) {
1652 int itm = is->items.syms[j];
1653 int p = item_prod(itm);
1654 int bp = item_index(itm);
1655 struct production *pr = g->productions[p];
1657 if (bp == pr->body_size) {
1658 prev_reduce = last_reduce;
1662 if (pr->body[bp]->type == Terminal)
1665 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1666 printf(" State %d has both SHIFT and REDUCE:\n", i);
1667 report_item(g, is->items.syms[last_shift]);
1668 report_item(g, is->items.syms[last_reduce]);
1671 if (prev_reduce >= 0) {
1672 printf(" State %d has 2 (or more) reducible items\n", i);
1673 report_item(g, is->items.syms[prev_reduce]);
1674 report_item(g, is->items.syms[last_reduce]);
1681 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1682 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1683 in the source of the look ahead set.
1685 We build a dataset mapping terminal to item for possible SHIFTs and then
1686 another for possible REDUCE operations. We report when we get conflicts
1689 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1694 for (i = 0; i < g->states; i++) {
1695 struct itemset *is = g->statetab[i];
1696 struct symset shifts = INIT_DATASET;
1697 struct symset reduce = INIT_DATASET;
1701 /* First collect the shifts */
1702 for (j = 0; j < is->items.cnt; j++) {
1703 unsigned short itm = is->items.syms[j];
1704 int p = item_prod(itm);
1705 int bp = item_index(itm);
1706 struct production *pr = g->productions[p];
1708 if (bp < pr->body_size &&
1709 pr->body[bp]->type == Terminal) {
1711 int sym = pr->body[bp]->num;
1712 if (symset_find(&shifts, sym) < 0)
1713 symset_add(&shifts, sym, itm);
1716 /* Now look for reduction and conflicts */
1717 for (j = 0; j < is->items.cnt; j++) {
1718 unsigned short itm = is->items.syms[j];
1719 int p = item_prod(itm);
1720 int bp = item_index(itm);
1721 struct production *pr = g->productions[p];
1723 if (bp < pr->body_size)
1728 la = g->follow[pr->head->num];
1730 la = set_find(g, is->items.data[j]);
1732 for (k = 0; k < la.cnt; k++) {
1733 int pos = symset_find(&shifts, la.syms[k]);
1735 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1736 prtxt(g->symtab[la.syms[k]]->name);
1738 report_item(g, shifts.data[pos]);
1739 report_item(g, itm);
1742 pos = symset_find(&reduce, la.syms[k]);
1744 symset_add(&reduce, la.syms[k], itm);
1747 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1748 prtxt(g->symtab[la.syms[k]]->name);
1750 report_item(g, itm);
1751 report_item(g, reduce.data[pos]);
1755 symset_free(shifts);
1756 symset_free(reduce);
1762 ## Generating the parser
1764 The exported part of the parser is the `parse_XX` function, where the name
1765 `XX` is based on the name of the parser files.
1767 This takes a `code_node`, a partially initialized `token_config`, and an
1768 optional `FILE` to send tracing to. The `token_config` gets the list of
1769 known words added and then is used with the `code_node` to initialize the
1772 `parse_XX` then calls the library function `parser_run` to actually complete
1773 the parse. This needs the `states` table and function to call the various
1774 pieces of code provided in the grammar file, so they are generated first.
1776 ###### parser_generate
1778 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1784 gen_reduce(f, g, file);
1787 fprintf(f, "#line 0 \"gen_parser\"\n");
1788 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1791 fprintf(f, "\tstruct token_state *tokens;\n");
1792 fprintf(f, "\tconfig->words_marks = known;\n");
1793 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1794 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1795 fprintf(f, "\ttokens = token_open(code, config);\n");
1796 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1797 fprintf(f, "\ttoken_close(tokens);\n");
1798 fprintf(f, "\treturn rv;\n");
1799 fprintf(f, "}\n\n");
1802 ### Table words table
1804 The known words table is simply an array of terminal symbols.
1805 The table of nonterminals used for tracing is a similar array.
// Emit the `known` table into the generated parser: one quoted name per
// terminal symbol, in symbol-number order starting at TK_reserved.
// grammar_analyse numbers terminals before non-terminals, so the loop can
// stop at the first non-terminal it sees.
// NOTE(review): this extract is missing interior lines (e.g. the
// declaration of `i` and the loop increment); comments cover only what is
// visible here.
1809 static void gen_known(FILE *f, struct grammar *g)
1812 fprintf(f, "#line 0 \"gen_known\"\n");
1813 fprintf(f, "static const char *known[] = {\n");
1814 for (i = TK_reserved;
1815 i < g->num_syms && g->symtab[i]->type == Terminal;
1817 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1818 g->symtab[i]->name.txt);
1819 fprintf(f, "};\n\n");
// Emit the `non_term` table: the name of every non-terminal symbol, used
// by the generated parser for tracing. Unlike gen_known, the loop scans
// all symbols from TK_reserved up and filters on type, since
// non-terminals follow the terminals in symtab.
// NOTE(review): interior lines are missing from this extract (the loop
// bound and increment); comments cover only what is visible.
1822 static void gen_non_term(FILE *f, struct grammar *g)
1825 fprintf(f, "#line 0 \"gen_non_term\"\n");
1826 fprintf(f, "static const char *non_term[] = {\n");
1827 for (i = TK_reserved;
1830 if (g->symtab[i]->type == Nonterminal)
1831 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1832 g->symtab[i]->name.txt);
1833 fprintf(f, "};\n\n");
1836 ### States and the goto tables.
1838 For each state we record the goto table, the reducible production if
1839 there is one, or a symbol to shift for error recovery.
1840 Some of the details of the reducible production are stored in the
1841 `do_reduce` function to come later. Here we store the production number,
1842 the body size (useful for stack management) and the resulting symbol (useful
1843 for knowing how to free data later).
1845 The go to table is stored in a simple array of `sym` and corresponding
1848 ###### exported types
1856 const struct lookup * go_to;
1867 static void gen_goto(FILE *f, struct grammar *g)
1870 fprintf(f, "#line 0 \"gen_goto\"\n");
1871 for (i = 0; i < g->states; i++) {
1873 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1875 struct symset gt = g->statetab[i]->go_to;
1876 for (j = 0; j < gt.cnt; j++)
1877 fprintf(f, "\t{ %d, %d },\n",
1878 gt.syms[j], gt.data[j]);
1885 static void gen_states(FILE *f, struct grammar *g)
1888 fprintf(f, "#line 0 \"gen_states\"\n");
1889 fprintf(f, "static const struct state states[] = {\n");
1890 for (i = 0; i < g->states; i++) {
1891 struct itemset *is = g->statetab[i];
1892 int j, prod = -1, prod_len;
1894 int shift_len = 0, shift_remain = 0;
1895 for (j = 0; j < is->items.cnt; j++) {
1896 int itm = is->items.syms[j];
1897 int p = item_prod(itm);
1898 int bp = item_index(itm);
1899 struct production *pr = g->productions[p];
1901 if (bp < pr->body_size) {
1902 if (shift_sym < 0 ||
1903 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1904 shift_sym = pr->body[bp]->num;
1906 shift_remain = pr->body_size - bp;
1910 /* This is what we reduce */
1911 if (prod < 0 || prod_len < pr->body_size) {
1913 prod_len = pr->body_size;
1918 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1919 i, is->go_to.cnt, i, prod,
1920 g->productions[prod]->body_size,
1921 g->productions[prod]->head->num,
1924 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1925 i, is->go_to.cnt, i, shift_sym,
1928 fprintf(f, "};\n\n");
1931 ### The `do_reduce` function and the code
1933 When the parser engine decides to reduce a production, it calls `do_reduce`.
1934 This has two functions.
1936 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1937 production being reduced. Secondly it runs the code that was included with
1938 the production if any.
1940 This code needs to be able to store data somewhere. Rather than requiring
1941 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1942 `do_reduce` return the size to be saved.
1944 The code fragment requires translation when written out. Any `$N` needs to
1945 be converted to a reference either to that buffer (if `$0`) or to the
1946 structure returned by a previous reduction. These pointers need to be cast
1947 to the appropriate type for each access. All this is handled in
1953 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1956 fprintf(f, "\t\t\t");
1957 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1966 if (*c < '0' || *c > '9') {
1971 while (c[1] >= '0' && c[1] <= '9') {
1973 n = n * 10 + *c - '0';
1976 fprintf(f, "(*(struct %.*s*)ret)",
1977 p->head->struct_name.len,
1978 p->head->struct_name.txt);
1979 else if (n > p->body_size)
1980 fprintf(f, "$%d", n);
1981 else if (p->body[n-1]->type == Terminal)
1982 fprintf(f, "(*(struct token *)body[%d])",
1984 else if (p->body[n-1]->struct_name.txt == NULL)
1985 fprintf(f, "$%d", n);
1987 fprintf(f, "(*(struct %.*s*)body[%d])",
1988 p->body[n-1]->struct_name.len,
1989 p->body[n-1]->struct_name.txt, n-1);
1996 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1999 fprintf(f, "#line 0 \"gen_reduce\"\n");
2000 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2002 fprintf(f, "\tint ret_size = 0;\n");
2004 fprintf(f, "\tswitch(prod) {\n");
2005 for (i = 0; i < g->production_count; i++) {
2006 struct production *p = g->productions[i];
2007 fprintf(f, "\tcase %d:\n", i);
2012 if (p->head->struct_name.txt)
2013 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
2014 p->head->struct_name.len,
2015 p->head->struct_name.txt);
2017 fprintf(f, "\t\tbreak;\n");
2019 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2024 As each non-terminal can potentially cause a different type of data
2025 structure to be allocated and filled in, we need to be able to free it when
2028 It is particularly important to have fine control over freeing during error
2029 recovery where individual stack frames might need to be freed.
2031 For this, the grammar author is required to define a `free_XX` function for
2032 each structure that is used by a non-terminal. `do_free` will call whichever
2033 is appropriate given a symbol number, and will call `free` (as is
2034 appropriate for tokens) on any terminal symbol.
2038 static void gen_free(FILE *f, struct grammar *g)
2042 fprintf(f, "#line 0 \"gen_free\"\n");
2043 fprintf(f, "static void do_free(short sym, void *asn)\n");
2045 fprintf(f, "\tif (!asn) return;\n");
2046 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2047 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2048 fprintf(f, "\tswitch(sym) {\n");
2050 for (i = 0; i < g->num_syms; i++) {
2051 struct symbol *s = g->symtab[i];
2053 s->type != Nonterminal ||
2054 s->struct_name.txt == NULL)
2057 fprintf(f, "\tcase %d:\n", s->num);
2058 fprintf(f, "\t\tfree_%.*s(asn);\n",
2060 s->struct_name.txt);
2061 fprintf(f, "\t\tbreak;\n");
2063 fprintf(f, "\t}\n}\n\n");
2066 ## The main routine.
2068 There are three key parts to the "main" function of parsergen: processing
2069 the arguments, loading the grammar file, and dealing with the grammar.
2071 ### Argument processing.
2073 Command line options allow the selection of analysis mode, name of output
2074 file, and whether or not a report should be generated. By default we create
2075 a report only if no code output was requested.
2077 The `parse_XX` function name uses the basename of the output file which
2078 should not have a suffix (`.c`). `.c` is added to the given name for the
2079 code, and `.h` is added for the header (if header text is specified in the
// Option table for getopt_long(). Each long option maps onto the single
// character also listed in the short-option string `options` below; the
// analysis modes (--LR0/--LR05/--SLR/--LALR/--LR1) take no argument,
// while --tag and --output require one (the trailing ':' in `options`).
// NOTE(review): a line is missing from this extract between the table
// and the `options` string (presumably the closing "};").
2086 static const struct option long_options[] = {
2087 { "LR0", 0, NULL, '0' },
2088 { "LR05", 0, NULL, '5' },
2089 { "SLR", 0, NULL, 'S' },
2090 { "LALR", 0, NULL, 'L' },
2091 { "LR1", 0, NULL, '1' },
2092 { "tag", 1, NULL, 't' },
2093 { "report", 0, NULL, 'R' },
2094 { "output", 1, NULL, 'o' },
2095 { NULL, 0, NULL, 0 }
2097 const char *options = "05SL1t:Ro:";
2099 ###### process arguments
2101 char *outfile = NULL;
2106 enum grammar_type type = LR05;
2107 while ((opt = getopt_long(argc, argv, options,
2108 long_options, NULL)) != -1) {
2123 outfile = optarg; break;
2125 tag = optarg; break;
2127 fprintf(stderr, "Usage: parsergen ...\n");
2132 infile = argv[optind++];
2134 fprintf(stderr, "No input file given\n");
2137 if (outfile && report == 1)
2140 if (name && strchr(name, '/'))
2141 name = strrchr(name, '/')+1;
2143 if (optind < argc) {
2144 fprintf(stderr, "Excess command line arguments\n");
2148 ### Loading the grammar file
2150 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2153 Once we have extracted the code (with `mdcode`) we expect to find three
2154 sections: header, code, and grammar. Anything else is an error.
2156 "header" and "code" are optional, though it is hard to build a working
2157 parser with neither. "grammar" must be provided.
2161 #include <sys/mman.h>
// Error callback handed to code_extract(): report an extraction problem
// on stderr, one message per line.
2166 static void pr_err(char *msg)
2169 fprintf(stderr, "%s\n", msg);
2173 struct section *table;
2177 fd = open(infile, O_RDONLY);
2179 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2180 infile, strerror(errno));
2183 len = lseek(fd, 0, 2);
2184 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2185 table = code_extract(file, file+len, pr_err);
2187 struct code_node *hdr = NULL;
2188 struct code_node *code = NULL;
2189 struct code_node *gram = NULL;
2190 for (s = table; s; s = s->next) {
2191 struct text sec = s->section;
2192 if (tag && !strip_tag(&sec, tag))
2194 if (text_is(sec, "header"))
2196 else if (text_is(sec, "code"))
2198 else if (text_is(sec, "grammar"))
2201 fprintf(stderr, "Unknown content section: %.*s\n",
2202 s->section.len, s->section.txt);
2207 ### Processing the input
2209 As we need to append an extension to a filename and then open it for
2210 writing, and we need to do this twice, it helps to have a separate function.
// Build the filename "base" + "ext" (the +1 in the malloc is for the
// terminating NUL) and open it for writing.
// NOTE(review): the fopen call and the free/return of `fn` are on lines
// missing from this extract — confirm against the full source.
2214 static FILE *open_ext(char *base, char *ext)
2216 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2218 strcat(strcpy(fn, base), ext);
2224 If we can read the grammar, then we analyse and optionally report on it. If
2225 the report finds conflicts we will exit with an error status.
2227 ###### process input
2228 struct grammar *g = NULL;
2230 fprintf(stderr, "No grammar section provided\n");
2233 g = grammar_read(gram);
2235 fprintf(stderr, "Failure to parse grammar\n");
2240 grammar_analyse(g, type);
2242 if (grammar_report(g, type))
2246 If a headers section is defined, we write it out and include a declaration
2247 for the `parse_XX` function so it can be used from separate code.
2249 if (rv == 0 && hdr && outfile) {
2250 FILE *f = open_ext(outfile, ".h");
2252 code_node_print(f, hdr, infile);
2253 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2257 fprintf(stderr, "Cannot create %s.h\n",
2263 And if all goes well and an output file was provided, we create the `.c`
2264 file with the code section (if any) and the parser tables and function.
2266 if (rv == 0 && outfile) {
2267 FILE *f = open_ext(outfile, ".c");
2270 code_node_print(f, code, infile);
2271 gen_parser(f, g, infile, name);
2274 fprintf(stderr, "Cannot create %s.c\n",
2280 And that about wraps it up. We need to set the locale so that UTF-8 is
2281 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2283 ###### File: parsergen.mk
2284 parsergen : parsergen.o libscanner.o libmdcode.o
2285 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2292 int main(int argc, char *argv[])
2297 setlocale(LC_ALL,"");
2299 ## process arguments
2306 ## The SHIFT/REDUCE parser
2308 Having analysed the grammar and generated all the tables, we only need the
2309 shift/reduce engine to bring it all together.
2311 ### Goto table lookup
2313 The parser generator has nicely provided us with goto tables sorted by
2314 symbol number. We need a binary search function to find a symbol in the
2317 ###### parser functions
// Binary search the state's goto table (sorted by symbol number) for
// `sym`. Returns the target state on success; callers (e.g. the error
// recovery loop in parser_run) test for a negative result on failure.
// NOTE(review): interior lines are missing from this extract (the
// declaration/initialisation of `lo`, the hi/lo updates inside the loop,
// and the failure return); comments cover only what is visible.
2319 static int search(const struct state *l, int sym)
2322 int hi = l->go_to_cnt;
2326 while (lo + 1 < hi) {
2327 int mid = (lo + hi) / 2;
2328 if (l->go_to[mid].sym <= sym)
2333 if (l->go_to[lo].sym == sym)
2334 return l->go_to[lo].state;
2339 ### The state stack.
2341 The core data structure for the parser is the stack. This tracks all the
2342 symbols that have been recognised or partially recognised.
2344 The stack usually won't grow very large - maybe a few tens of entries. So
2345 we dynamically resize the array as required but never bother to shrink it
2348 We keep the stack as two separate allocations. One, `asn_stack` stores the
2349 "abstract syntax nodes" which are created by each reduction. When we call
2350 `do_reduce` we need to pass an array of the `asn`s of the body of the
2351 production, and by keeping a separate `asn` stack, we can just pass a
2352 pointer into this stack.
2354 The other allocation stores all other stack fields of which there are four.
2355 The `state` is the most important one and guides the parsing process. The
2356 `sym` is nearly unnecessary. However when we want to free entries from the
2357 `asn_stack`, it helps to know what type they are so we can call the right
2358 freeing function. The symbol leads us to the right free function through
2361 The `indents` count and the `starts_indented` flag track the line
2362 indents in the symbol. These are used to allow indent information to
2363 guide parsing and error recovery.
2365 As well as the stack of frames we have a `next` frame which is
2366 assembled from the incoming token and other information prior to
2367 pushing it onto the stack.
2369 ###### parser functions
2375 short starts_indented;
2377 short newline_permitted;
2386 Two operations are needed on the stack - shift (which is like push) and pop.
2388 Shift applies not only to terminals but also to non-terminals. When we
2389 reduce a production we will pop off entries corresponding to the body
2390 symbols, then push on an item for the head of the production. This last is
2391 exactly the same process as shifting in a terminal so we use the same
2394 To simplify other code we arrange for `shift` to fail if there is no `goto`
2395 state for the symbol. This is useful in basic parsing due to our design
2396 that we shift when we can, and reduce when we cannot. So the `shift`
2397 function reports if it could.
2399 So `shift` finds the next state. If that succeeds it extends the allocations
2400 if needed and pushes all the information onto the stacks.
2402 ###### parser functions
2404 static int shift(struct parser *p,
2406 const struct state states[])
2408 // Push an entry onto the stack
2409 int newstate = search(&states[p->next.state], p->next.sym);
2412 if (p->tos >= p->stack_size) {
2413 p->stack_size += 10;
2414 p->stack = realloc(p->stack, p->stack_size
2415 * sizeof(p->stack[0]));
2416 p->asn_stack = realloc(p->asn_stack, p->stack_size
2417 * sizeof(p->asn_stack[0]));
2419 p->stack[p->tos] = p->next;
2420 p->asn_stack[p->tos] = asn;
2422 p->next.state = newstate;
2423 p->next.indents = 0;
2424 p->next.starts_indented = 0;
2425 // if new state doesn't start a line, we inherit newline_permitted status
2426 if (states[newstate].starts_line)
2427 p->next.newline_permitted = 1;
2431 `pop` simply moves the top of stack (`tos`) back down the required amount
2432 and frees any `asn` entries that need to be freed. It is called _after_ we
2433 reduce a production, just before we `shift` the nonterminal in.
2435 ###### parser functions
// Pop `num` frames off the parser stack, freeing each frame's asn via
// the caller-supplied do_free() and accumulating the popped indents into
// the `next` frame. Afterwards `next` re-inherits state, indent origin
// and newline permission from the new top-of-stack frame; if indents
// remain beyond those the frame started with, newlines are disallowed.
// NOTE(review): the adjustment of p->tos (and the declaration of `i`)
// are on lines missing from this extract — the loop below relies on
// p->tos already pointing below the popped frames.
2437 static void pop(struct parser *p, int num,
2438 void(*do_free)(short sym, void *asn))
2442 for (i = 0; i < num; i++) {
2443 p->next.indents += p->stack[p->tos+i].indents;
2444 do_free(p->stack[p->tos+i].sym,
2445 p->asn_stack[p->tos+i]);
2449 p->next.state = p->stack[p->tos].state;
2450 p->next.starts_indented = p->stack[p->tos].starts_indented;
2451 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2452 if (p->next.indents > p->next.starts_indented)
2453 p->next.newline_permitted = 0;
2457 ### Memory allocation
2459 The `scanner` returns tokens in a local variable - we want them in allocated
2460 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2461 a reduce is in a large buffer. Both of these require some allocation and
2462 copying, hence `memdup` and `tokcopy`.
2464 ###### parser includes
2467 ###### parser functions
2469 void *memdup(void *m, int len)
2475 memcpy(ret, m, len);
2479 static struct token *tok_copy(struct token tk)
2481 struct token *new = malloc(sizeof(*new));
2486 ### The heart of the parser.
2488 Now we have the parser. If we can shift, we do. If not and we can reduce
2489 we do. If the production we reduced was production zero, then we have
2490 accepted the input and can finish.
2492 We return whatever `asn` was returned by reducing production zero.
2494 If we can neither shift nor reduce we have an error to handle. We pop
2495 single entries off the stack until we can shift the `TK_error` symbol, then
2496 drop input tokens until we find one we can shift into the new error state.
2498 When we find `TK_in` and `TK_out` tokens which report indents we need
2499 to handle them directly as the grammar cannot express what we want to
2502 `TK_in` tokens are easy: we simply update the `next` stack frame to
2503 record how many indents there are and that the next token started with
2506 `TK_out` tokens must either be counted off against any pending indent,
2507 or must force reductions until there is a pending indent which isn't
2508 at the start of a production.
2510 ###### parser includes
2513 void *parser_run(struct token_state *tokens,
2514 const struct state states[],
2515 int (*do_reduce)(int, void**, void*),
2516 void (*do_free)(short, void*),
2517 FILE *trace, const char *non_term[], int knowns)
2519 struct parser p = { 0 };
2520 struct token *tk = NULL;
2524 p.next.newline_permitted = states[0].starts_line;
2526 struct token *err_tk;
2528 tk = tok_copy(token_next(tokens));
2529 p.next.sym = tk->num;
2531 parser_trace(trace, &p, tk, states, non_term, knowns);
2533 if (p.next.sym == TK_in) {
2534 p.next.starts_indented = 1;
2540 if (p.next.sym == TK_out) {
2541 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2542 (p.stack[p.tos-1].indents == 1 &&
2543 states[p.next.state].reduce_size > 1)) {
2544 p.stack[p.tos-1].indents -= 1;
2545 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2546 // no internal indent any more, reassess 'newline_permitted'
2547 if (states[p.stack[p.tos-1].state].starts_line)
2548 p.stack[p.tos-1].newline_permitted = 1;
2550 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2556 // fall through and force a REDUCE (as 'shift'
2559 if (p.next.sym == TK_newline) {
2560 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2566 if (shift(&p, tk, states)) {
2570 if (states[p.next.state].reduce_prod >= 0) {
2572 int prod = states[p.next.state].reduce_prod;
2573 int size = states[p.next.state].reduce_size;
2575 static char buf[16*1024];
2576 p.next.sym = states[p.next.state].reduce_sym;
2578 body = p.asn_stack +
2579 (p.tos - states[p.next.state].reduce_size);
2581 bufsize = do_reduce(prod, body, buf);
2583 pop(&p, size, do_free);
2584 shift(&p, memdup(buf, bufsize), states);
2589 if (tk->num == TK_out) {
2590 // Indent problem - synthesise tokens to get us
2592 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2593 p.next.sym = states[p.next.state].shift_sym;
2594 shift(&p, tok_copy(*tk), states);
2595 // FIXME need to report this error somehow
2598 /* Error. We walk up the stack until we
2599 * find a state which will accept TK_error.
2600 * We then shift in TK_error and see what state
2601 * that takes us too.
2602 * Then we discard input tokens until
2603 * we find one that is acceptable.
2606 err_tk = tok_copy(*tk);
2607 p.next.sym = TK_error;
2608 while (shift(&p, err_tk, states) == 0
2610 // discard this state
2611 pop(&p, 1, do_free);
2614 // no state accepted TK_error
2617 while (search(&states[p.next.state], tk->num) < 0 &&
2618 tk->num != TK_eof) {
2620 tk = tok_copy(token_next(tokens));
2621 if (tk->num == TK_in)
2622 p.next.indents += 1;
2623 if (tk->num == TK_out) {
2624 if (p.next.indents == 0)
2626 p.next.indents -= 1;
2629 if (p.tos == 0 && tk->num == TK_eof)
2634 ret = p.asn_stack[0];
2636 pop(&p, p.tos, do_free);
2642 ###### exported functions
// Public entry point of the parser library: drive the shift/reduce
// engine over `tokens` using the generated `states` table, calling
// do_reduce()/do_free() for grammar actions and cleanup. `trace`, if
// non-NULL, receives a stack dump each iteration; `non_term` and
// `knowns` let the tracer name symbols. Returns the asn produced by
// reducing production zero (the accepted parse), or NULL-equivalent on
// failure (see parser_run's definition).
2643 void *parser_run(struct token_state *tokens,
2644 const struct state states[],
2645 int (*do_reduce)(int, void**, void*),
2646 void (*do_free)(short, void*),
2647 FILE *trace, const char *non_term[], int knowns);
2651 Being able to visualize the parser in action can be invaluable when
2652 debugging the parser code, or trying to understand how the parse of a
2653 particular grammar progresses. The stack contains all the important
2654 state, so just printing out the stack every time around the parse loop
2655 can make it possible to see exactly what is happening.
2657 This doesn't explicitly show each SHIFT and REDUCE action. However they
2658 are easily deduced from the change between consecutive lines, and the
2659 details of each state can be found by cross referencing the states list
2660 in the stack with the "report" that parsergen can generate.
2662 For terminal symbols, we just dump the token. For non-terminals we
2663 print the name of the symbol. The look ahead token is reported at the
2664 end inside square brackets.
2666 ###### parser functions
2668 static char *reserved_words[] = {
2669 [TK_error] = "ERROR",
2672 [TK_newline] = "NEWLINE",
2675 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2677 fprintf(trace, "(%d", f->state);
2679 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2681 if (states[f->state].starts_line)
2682 fprintf(trace, "s");
2683 if (f->newline_permitted)
2684 fprintf(trace, "n");
2685 fprintf(trace, ") ");
2688 void parser_trace(FILE *trace, struct parser *p,
2689 struct token *tk, const struct state states[],
2690 const char *non_term[], int knowns)
2693 for (i = 0; i < p->tos; i++) {
2694 int sym = p->stack[i].sym;
2695 parser_trace_state(trace, &p->stack[i], states);
2696 if (sym < TK_reserved &&
2697 reserved_words[sym] != NULL)
2698 fputs(reserved_words[sym], trace);
2699 else if (sym < TK_reserved + knowns) {
2700 struct token *t = p->asn_stack[i];
2701 text_dump(trace, t->txt, 20);
2703 fputs(non_term[sym - TK_reserved - knowns],
2707 parser_trace_state(trace, &p->next, states);
2708 fprintf(trace, " [");
2709 if (tk->num < TK_reserved &&
2710 reserved_words[tk->num] != NULL)
2711 fputs(reserved_words[tk->num], trace);
2713 text_dump(trace, tk->txt, 20);
2714 fputs("]\n", trace);
2719 The obvious example for a parser is a calculator.
2721 As `scanner` provides a number parsing function using `libgmp`, it is not
2722 much work to perform arbitrary rational number calculations.
2724 This calculator takes one expression, or an equality test, per line. The
2725 results are printed and, if any equality test fails, the program exits with
2728 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2729 better approach, but as the grammar file must have 3 components I need
2730 something like this.
2732 ###### File: parsergen.mk
2733 calc.c calc.h : parsergen parsergen.mdc
2734 ./parsergen --tag calc -o calc parsergen.mdc
2735 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2736 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2741 // what do we use for a demo-grammar? A calculator of course.
2753 #include <sys/mman.h>
2758 #include "scanner.h"
2764 static void free_number(struct number *n)
2770 int main(int argc, char *argv[])
2772 int fd = open(argv[1], O_RDONLY);
2773 int len = lseek(fd, 0, 2);
2774 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2775 struct section *s = code_extract(file, file+len, NULL);
2776 struct token_config config = {
2777 .ignored = (1 << TK_line_comment)
2778 | (1 << TK_block_comment)
2781 .number_chars = ".,_+-",
2785 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2791 Session -> Session Line
2794 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2795 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2796 gmp_printf(" or as a decimal: %Fg\n", fl);
2800 | Expression = Expression NEWLINE ${
2801 if (mpq_equal($1.val, $3.val))
2802 gmp_printf("Both equal %Qd\n", $1.val);
2804 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2809 | NEWLINE ${ printf("Blank line\n"); }$
2810 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2813 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2814 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2815 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2817 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2818 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2819 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2821 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2822 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$