1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of a parser.
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h`. files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
69 [scanner]: scanner.html
75 ###### parser includes
79 The grammar contains production sets, precedence/associativity
80 declarations, and data type declarations. These are all parsed with
81 _ad hoc_ parsing as we don't have a parser generator yet.
83 The precedence and associativity can be set for each production, but
84 can be inherited from symbols. The data type is potentially defined
85 for each non-terminal and describes what C structure is used to store
86 information about each symbol.
89 enum assoc {Left, Right, Non};
90 char *assoc_names[] = {"Left","Right","Non"};
93 struct text struct_name;
95 unsigned short precedence;
99 unsigned short precedence;
107 The strings reported by `mdcode` and `scanner` are `struct text` which have
108 length rather than being null terminated. To help with printing and
109 comparing we define `text_is` and `prtxt`, which should possibly go in
110 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
111 which might contain control characters.
113 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
114 because that is what we need to detect tags.
117 static int text_is(struct text t, char *s)
119 return (strlen(s) == t.len &&
120 strncmp(s, t.txt, t.len) == 0);
122 static void prtxt(struct text t)
124 printf("%.*s", t.len, t.txt);
127 static int strip_tag(struct text *t, char *tag)
129 int skip = strlen(tag) + 1;
130 if (skip >= t->len ||
131 strncmp(t->txt, tag, skip-1) != 0 ||
132 t->txt[skip-1] != ':')
134 while (skip < t->len && t->txt[skip] == ' ')
143 Productions are comprised primarily of symbols - terminal and
144 non-terminal. We do not make a syntactic distinction between the two,
145 though non-terminals must be identifiers. Non-terminal symbols are
146 those which appear in the head of a production, terminal symbols are
147 those which don't. There are also "virtual" symbols used for precedence
148 marking discussed later, and sometimes we won't know what type a symbol is until the whole grammar has been read.
151 ###### forward declarations
152 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
153 char *symtypes = "UVTN";
157 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
158 table of known symbols and the resulting parser will report them as
159 `TK_reserved + N`. A small set of identifiers are reserved for the
160 different token types that `scanner` can report.
163 static char *reserved_words[] = {
164 [TK_error] = "ERROR",
165 [TK_number] = "NUMBER",
166 [TK_ident] = "IDENTIFIER",
168 [TK_string] = "STRING",
169 [TK_multi_string] = "MULTI_STRING",
172 [TK_newline] = "NEWLINE",
178 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
179 recognised. The former is automatically expected at the end of the text
180 being parsed. The latter are always ignored by the parser.
182 All of these symbols are stored in a simple symbol table. We use the
183 `struct text` from `mdcode` to store the name and link them together
184 into a sorted list using an insertion sort.
186 We don't have separate `find` and `insert` functions as any symbol we
187 find needs to be remembered. We simply expect `find` to always return a
188 symbol, but its type might be `Unknown`.
197 ###### grammar fields
202 static int text_cmp(struct text a, struct text b)
207 int cmp = strncmp(a.txt, b.txt, len);
211 return a.len - b.len;
214 static struct symbol *sym_find(struct grammar *g, struct text s)
216 struct symbol **l = &g->syms;
221 (cmp = text_cmp((*l)->name, s)) < 0)
225 n = calloc(1, sizeof(*n));
234 static void symbols_init(struct grammar *g)
236 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
238 for (i = 0; i < entries; i++) {
241 t.txt = reserved_words[i];
244 t.len = strlen(t.txt);
251 ### Data types and precedence.
253 Data type specification and precedence specification are both
254 introduced by a dollar sign at the start of the line. If the next
255 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
256 precedence, otherwise it specifies a data type.
258 The data type name is simply stored and applied to the head of all
259 subsequent productions. It must be the name of a structure, so `$expr`
260 maps to `struct expr`.
262 Any productions given before the first data type will have no data type
263 and can carry no information. In order to allow other non-terminals to
264 have no type, the data type `$void` can be given. This does *not* mean
265 that `struct void` will be used, but rather that no type will be
266 associated with future non-terminals.
268 The precedence line must contain a list of symbols - typically
269 terminal symbols, but not necessarily. It can only contain symbols
270 that have not been seen yet, so precedence declaration must precede
271 the productions that they affect.
273 A precedence line may also contain a "virtual" symbol which is an
274 identifier preceded by `$$`. Virtual terminals carry precedence
275 information but are not included in the grammar. A production can
276 declare that it inherits the precedence of a given virtual symbol.
278 This use for `$$` precludes it from being used as a symbol in the
279 described language. Two other symbols: `${` and `}$` are also
282 Each new precedence line introduces a new precedence level and
283 declares how it associates. This level is stored in each symbol
284 listed and may be inherited by any production which uses the symbol. A
285 production inherits from the last symbol which has a precedence.
287 ###### grammar fields
288 struct text current_type;
292 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
293 static const char *known[] = { "$$", "${", "}$" };
296 static char *dollar_line(struct token_state *ts, struct grammar *g)
298 struct token t = token_next(ts);
303 if (t.num != TK_ident) {
304 err = "type or assoc expected after '$'";
307 if (text_is(t.txt, "LEFT"))
309 else if (text_is(t.txt, "RIGHT"))
311 else if (text_is(t.txt, "NON"))
314 g->current_type = t.txt;
315 if (text_is(t.txt, "void"))
316 g->current_type.txt = NULL;
318 if (t.num != TK_newline) {
319 err = "Extra tokens after type name";
325 // This is a precedence line, need some symbols.
329 while (t.num != TK_newline) {
330 enum symtype type = Terminal;
332 if (t.num == TK_virtual) {
335 if (t.num != TK_ident) {
336 err = "$$ must be followed by a word";
339 } else if (t.num != TK_ident &&
341 err = "Illegal token in precedence line";
344 s = sym_find(g, t.txt);
345 if (s->type != Unknown) {
346 err = "Symbols in precedence line must not already be known.";
350 s->precedence = g->prec_levels;
355 err = "No symbols given on precedence line";
359 while (t.num != TK_newline && t.num != TK_eof)
366 A production either starts with an identifier which is the head
367 non-terminal, or a vertical bar (`|`) in which case this production
368 uses the same head as the previous one. The identifier must be
369 followed by a `->` mark. All productions for a given non-terminal must
370 be grouped together with the `nonterminal ->` given only once.
372 After this a (possibly empty) sequence of identifiers and marks form
373 the body of the production. A virtual symbol may be given after the
374 body by preceding it with `$$`. If a virtual symbol is given, the
375 precedence of the production is that for the virtual symbol. If none
376 is given, the precedence is inherited from the last symbol in the
377 production which has a precedence specified.
379 After the optional precedence may come the `${` mark. This indicates
380 the start of a code fragment. If present, this must be on the same
381 line as the start of the production.
383 All of the text from the `${` through to the matching `}$` is
384 collected and forms the code-fragment for the production. It must all
385 be in one `code_node` of the literate code. The `}$` must be
386 at the end of a line.
388 Text in the code fragment will undergo substitutions where `$N` for
389 some numeric `N` will be replaced with a variable holding the parse
390 information for the particular symbol in the production. `$0` is the
391 head of the production, `$1` is the first symbol of the body, etc.
392 The type of `$N` for a terminal symbol is `struct token`. For
393 a non-terminal, it is whatever has been declared for that symbol.
395 While building productions we will need to add to an array which needs to
399 static void array_add(void *varray, int *cnt, void *new)
401 void ***array = varray;
404 current = ((*cnt-1) | (step-1))+1;
405 if (*cnt == current) {
408 *array = realloc(*array, current * sizeof(void*));
410 (*array)[*cnt] = new;
414 Collecting the code fragment simply involves reading tokens until we
415 hit the end token `}$`, and noting the character position of the start and
419 static struct text collect_code(struct token_state *state,
424 code.txt = start.txt.txt + start.txt.len;
426 t = token_next(state);
427 while (t.node == start.node &&
428 t.num != TK_close && t.num != TK_error &&
430 if (t.num == TK_close && t.node == start.node)
431 code.len = t.txt.txt - code.txt;
437 Now we have all the bits we need to parse a full production.
442 ###### grammar fields
443 struct production **productions;
444 int production_count;
446 ###### production fields
448 struct symbol **body;
453 int first_production;
456 static char *parse_production(struct grammar *g,
458 struct token_state *state)
460 /* Head has already been parsed. */
463 struct production p, *pp;
465 memset(&p, 0, sizeof(p));
467 tk = token_next(state);
468 while (tk.num == TK_ident || tk.num == TK_mark) {
469 struct symbol *bs = sym_find(g, tk.txt);
470 if (bs->type == Unknown)
472 if (bs->type == Virtual) {
473 err = "Virtual symbol not permitted in production";
476 if (bs->precedence) {
477 p.precedence = bs->precedence;
480 array_add(&p.body, &p.body_size, bs);
481 tk = token_next(state);
483 if (tk.num == TK_virtual) {
485 tk = token_next(state);
486 if (tk.num != TK_ident) {
487 err = "word required after $$";
490 vs = sym_find(g, tk.txt);
491 if (vs->type != Virtual) {
492 err = "symbol after $$ must be virtual";
495 p.precedence = vs->precedence;
497 tk = token_next(state);
499 if (tk.num == TK_open) {
500 p.code = collect_code(state, tk);
501 if (p.code.txt == NULL) {
502 err = "code fragment not closed properly";
505 tk = token_next(state);
507 if (tk.num != TK_newline && tk.num != TK_eof) {
508 err = "stray tokens at end of line";
511 pp = malloc(sizeof(*pp));
513 array_add(&g->productions, &g->production_count, pp);
516 while (tk.num != TK_newline && tk.num != TK_eof)
517 tk = token_next(state);
521 With the ability to parse production and dollar-lines, we have nearly all
522 that we need to parse a grammar from a `code_node`.
524 The head of the first production will effectively be the `start` symbol of
525 the grammar. However it won't _actually_ be so. Processing the grammar is
526 greatly simplified if the real start symbol only has a single production,
527 and expects `$eof` as the final terminal. So when we find the first
528 explicit production we insert an extra production as production zero which
531 ###### Example: production 0
534 where `START` is the first non-terminal given.
536 ###### create production zero
537 struct production *p = calloc(1,sizeof(*p));
538 struct text start = {"$start",6};
539 struct text eof = {"$eof",4};
540 p->head = sym_find(g, start);
541 p->head->type = Nonterminal;
542 array_add(&p->body, &p->body_size, head);
543 array_add(&p->body, &p->body_size, sym_find(g, eof));
544 g->start = p->head->num;
545 p->head->first_production = g->production_count;
546 array_add(&g->productions, &g->production_count, p);
548 Now we are ready to read in the grammar.
550 ###### grammar fields
551 int start; // the 'start' symbol.
554 static struct grammar *grammar_read(struct code_node *code)
556 struct token_config conf = {
559 .words_marks = known,
560 .known_count = sizeof(known)/sizeof(known[0]),
562 .ignored = (1 << TK_line_comment)
563 | (1 << TK_block_comment)
566 | (1 << TK_multi_string)
571 struct token_state *state = token_open(code, &conf);
573 struct symbol *head = NULL;
577 g = calloc(1, sizeof(*g));
580 for (tk = token_next(state); tk.num != TK_eof;
581 tk = token_next(state)) {
582 if (tk.num == TK_newline)
584 if (tk.num == TK_ident) {
586 head = sym_find(g, tk.txt);
587 if (head->type == Nonterminal)
588 err = "This non-terminal has already be used.";
589 else if (head->type == Virtual)
590 err = "Virtual symbol not permitted in head of production";
592 head->type = Nonterminal;
593 head->struct_name = g->current_type;
595 ## create production zero
597 head->first_production = g->production_count;
598 tk = token_next(state);
599 if (tk.num == TK_mark &&
600 text_is(tk.txt, "->"))
601 err = parse_production(g, head, state);
603 err = "'->' missing in production";
605 } else if (tk.num == TK_mark
606 && text_is(tk.txt, "|")) {
607 // another production for same non-term
609 err = parse_production(g, head, state);
611 err = "First production must have a head";
612 } else if (tk.num == TK_mark
613 && text_is(tk.txt, "$")) {
614 err = dollar_line(state, g);
616 err = "Unrecognised token at start of line.";
624 fprintf(stderr, "Error at line %d: %s\n",
631 ## Analysing the grammar
633 The central task in analysing the grammar is to determine a set of
634 states to drive the parsing process. This will require finding
635 various sets of symbols and of "items". Some of these sets will need
636 to attach information to each element in the set, so they are more
639 Each "item" may have a 'look-ahead' or `LA` set associated with
640 it. Many of these will be the same as each other. To avoid memory
641 wastage, and to simplify some comparisons of sets, these sets will be
642 stored in a list of unique sets, each assigned a number.
644 Once we have the data structures in hand to manage these sets and
645 lists, we can start setting the 'nullable' flag, build the 'FIRST'
646 sets, and then create the item sets which define the various states.
650 Though we don't only store symbols in these sets, they are the main
651 things we store, so they are called symbol sets or "symsets".
653 A symset has a size, an array of shorts, and an optional array of data
654 values, which are also shorts. If the array of data is not present,
655 we store an impossible pointer, as `NULL` is used to indicate that no
656 memory has been allocated yet.
661 unsigned short *syms, *data;
663 #define NO_DATA ((unsigned short *)1)
664 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
665 const struct symset INIT_DATASET = { 0, NULL, NULL };
667 The arrays of shorts are allocated in blocks of 8 and are kept sorted
668 by using an insertion sort. We don't explicitly record the amount of
669 allocated space as it can be derived directly from the current `cnt` using
670 `((cnt - 1) | 7) + 1`.
673 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
676 int current = ((s->cnt-1) | 7) + 1;
677 if (current == s->cnt) {
679 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
680 if (s->data != NO_DATA)
681 s->data = realloc(s->data, sizeof(*s->data) * current);
684 while (i > 0 && s->syms[i-1] > key) {
685 s->syms[i] = s->syms[i-1];
686 if (s->data != NO_DATA)
687 s->data[i] = s->data[i-1];
691 if (s->data != NO_DATA)
696 Finding a symbol (or item) in a `symset` uses a simple binary search.
697 We return the index where the value was found (so data can be accessed),
698 or `-1` to indicate failure.
700 static int symset_find(struct symset *ss, unsigned short key)
707 while (lo + 1 < hi) {
708 int mid = (lo + hi) / 2;
709 if (ss->syms[mid] <= key)
714 if (ss->syms[lo] == key)
719 We will often want to form the union of two symsets. When we do, we
720 will often want to know if anything changed (as they might mean there
721 is more work to do). So `symset_union` reports whether anything was
722 added to the first set. We use a slow quadratic approach as these
723 sets don't really get very big. If profiling shows this to be a problem it
724 can be optimised later.
726 static int symset_union(struct symset *a, struct symset *b)
730 for (i = 0; i < b->cnt; i++)
731 if (symset_find(a, b->syms[i]) < 0) {
732 unsigned short data = 0;
733 if (b->data != NO_DATA)
735 symset_add(a, b->syms[i], data);
741 And of course we must be able to free a symset.
743 static void symset_free(struct symset ss)
746 if (ss.data != NO_DATA)
752 Some symsets are simply stored somewhere appropriate in the `grammar`
753 data structure, others need a bit of indirection. This applies
754 particularly to "LA" sets. These will be paired with "items" in an
755 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
756 stored in the `data` field, so we need a mapping from a small (short)
757 number to an LA `symset`.
759 As mentioned earlier this involves creating a list of unique symsets.
761 For now, we just use a linear list sorted by insertion. A skiplist
762 would be more efficient and may be added later.
769 struct setlist *next;
772 ###### grammar fields
773 struct setlist *sets;
778 static int ss_cmp(struct symset a, struct symset b)
781 int diff = a.cnt - b.cnt;
784 for (i = 0; i < a.cnt; i++) {
785 diff = (int)a.syms[i] - (int)b.syms[i];
792 static int save_set(struct grammar *g, struct symset ss)
794 struct setlist **sl = &g->sets;
798 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
805 s = malloc(sizeof(*s));
814 Finding a set by number is currently performed by a simple linear search.
815 If this turns out to hurt performance, we can store the sets in a dynamic
816 array like the productions.
818 static struct symset set_find(struct grammar *g, int num)
820 struct setlist *sl = g->sets;
821 while (sl && sl->num != num)
827 ### Setting `nullable`
829 We set `nullable` on the head symbol for any production for which all
830 the body symbols (if any) are nullable. As this is a recursive
831 definition, any change in the `nullable` setting means that we need to
832 re-evaluate where it needs to be set.
834 We simply loop around performing the same calculations until no more changes happen.
841 static void set_nullable(struct grammar *g)
844 while (check_again) {
847 for (p = 0; p < g->production_count; p++) {
848 struct production *pr = g->productions[p];
851 if (pr->head->nullable)
853 for (s = 0; s < pr->body_size; s++)
854 if (! pr->body[s]->nullable)
856 if (s == pr->body_size) {
857 pr->head->nullable = 1;
864 ### Setting `can_eol`
866 In order to be able to ignore newline tokens when not relevant, but
867 still include them in the parse when needed, we will need to know
868 which states can start a "line-like" section of code. We ignore
869 newlines when there is an indent since the most recent start of a
872 To know what is line-like, we first need to know which symbols can end
873 a line-like section, which is precisely those which can end with a
874 newline token. These symbols don't necessarily always end with a
875 newline, but they can. Hence they are not described as "lines" but
878 Clearly the `TK_newline` token can end with a newline. Any symbol
879 which is the head of a production that contains a line-ending symbol
880 followed only by nullable symbols is also a line-ending symbol. We
881 use a new field `can_eol` to record this attribute of symbols, and
882 compute it in a repetitive manner similar to `set_nullable`.
888 static void set_can_eol(struct grammar *g)
891 g->symtab[TK_newline]->can_eol = 1;
892 while (check_again) {
895 for (p = 0; p < g->production_count; p++) {
896 struct production *pr = g->productions[p];
899 if (pr->head->can_eol)
902 for (s = pr->body_size - 1; s >= 0; s--) {
903 if (pr->body[s]->can_eol) {
904 pr->head->can_eol = 1;
908 if (!pr->body[s]->nullable)
915 ### Building the `first` sets
917 When calculating what can follow a particular non-terminal, we will need to
918 know what the "first" terminal in any subsequent non-terminal might be. So
919 we calculate the `first` set for every non-terminal and store them in an
920 array. We don't bother recording the "first" set for terminals as they are
923 As the "first" for one symbol might depend on the "first" of another,
924 we repeat the calculations until no changes happen, just like with
925 `set_nullable`. This makes use of the fact that `symset_union`
926 reports if any change happens.
928 The core of this which finds the "first" of part of a production body
929 will be reused for computing the "follow" sets, so we split it out
930 into a separate function.
932 ###### grammar fields
933 struct symset *first;
937 static int add_first(struct production *pr, int start,
938 struct symset *target, struct grammar *g,
943 for (s = start; s < pr->body_size; s++) {
944 struct symbol *bs = pr->body[s];
945 if (bs->type == Terminal) {
946 if (symset_find(target, bs->num) < 0) {
947 symset_add(target, bs->num, 0);
951 } else if (symset_union(target, &g->first[bs->num]))
957 *to_end = (s == pr->body_size);
961 static void build_first(struct grammar *g)
965 g->first = calloc(g->num_syms, sizeof(g->first[0]));
966 for (s = 0; s < g->num_syms; s++)
967 g->first[s] = INIT_SYMSET;
969 while (check_again) {
972 for (p = 0; p < g->production_count; p++) {
973 struct production *pr = g->productions[p];
974 struct symset *head = &g->first[pr->head->num];
976 if (add_first(pr, 0, head, g, NULL))
982 ### Building the `follow` sets.
984 There are two different situations when we will want to generate "follow"
985 sets. If we are doing an SLR analysis, we want to generate a single
986 "follow" set for each non-terminal in the grammar. That is what is
987 happening here. If we are doing an LALR or LR analysis we will want
988 to generate a separate "LA" set for each item. We do that later
991 There are two parts to generating a "follow" set. Firstly we look at
992 every place that any non-terminal appears in the body of any
993 production, and we find the set of possible "first" symbols after
994 there. This is done using `add_first` above and only needs to be done
995 once as the "first" sets are now stable and will not change.
999 for (p = 0; p < g->production_count; p++) {
1000 struct production *pr = g->productions[p];
1003 for (b = 0; b < pr->body_size - 1; b++) {
1004 struct symbol *bs = pr->body[b];
1005 if (bs->type == Terminal)
1007 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1011 The second part is to add the "follow" set of the head of a production
1012 to the "follow" sets of the final symbol in the production, and any
1013 other symbol which is followed only by `nullable` symbols. As this
1014 depends on "follow" itself we need to repeatedly perform the process
1015 until no further changes happen.
1019 for (again = 0, p = 0;
1020 p < g->production_count;
1021 p < g->production_count-1
1022 ? p++ : again ? (again = 0, p = 0)
1024 struct production *pr = g->productions[p];
1027 for (b = pr->body_size - 1; b >= 0; b--) {
1028 struct symbol *bs = pr->body[b];
1029 if (bs->type == Terminal)
1031 if (symset_union(&g->follow[bs->num],
1032 &g->follow[pr->head->num]))
1039 We now just need to create and initialise the `follow` list to get a
1042 ###### grammar fields
1043 struct symset *follow;
1046 static void build_follow(struct grammar *g)
1049 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1050 for (s = 0; s < g->num_syms; s++)
1051 g->follow[s] = INIT_SYMSET;
1055 ### Building itemsets and states
1057 There are three different levels of detail that can be chosen for
1058 building the itemsets and states for the LR grammar. They are:
1060 1. LR(0) or SLR(1), where no look-ahead is considered.
1061 2. LALR(1) where we build look-ahead sets with each item and merge
1062 the LA sets when we find two paths to the same "kernel" set of items.
1063 3. LR(1) where different look-ahead for any item in the set means
1064 a different state must be created.
1066 ###### forward declarations
1067 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1069 We need to be able to look through existing states to see if a newly
1070 generated state already exists. For now we use a simple sorted linked
1073 An item is a pair of numbers: the production number and the position of
1074 "DOT", which is an index into the body of the production.
1075 As the numbers are not enormous we can combine them into a single "short"
1076 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1077 production number (so 4000 productions with maximum of 15 symbols in the
1080 Comparing the itemsets will be a little different to comparing symsets
1081 as we want to do the lookup after generating the "kernel" of an
1082 itemset, so we need to ignore the offset=zero items which are added during
1085 To facilitate this, we modify the "DOT" number so that "0" sorts to
1086 the end of the list in the symset, and then only compare items before
1090 static inline unsigned short item_num(int production, int index)
1092 return production | ((31-index) << 11);
1094 static inline int item_prod(unsigned short item)
1096 return item & 0x7ff;
1098 static inline int item_index(unsigned short item)
1100 return (31-(item >> 11)) & 0x1f;
1103 For LR(1) analysis we need to compare not just the itemset in a state
1104 but also the LA sets. As we assign each unique LA set a number, we
1105 can just compare the symset and the data values together.
1108 static int itemset_cmp(struct symset a, struct symset b,
1109 enum grammar_type type)
1115 i < a.cnt && i < b.cnt &&
1116 item_index(a.syms[i]) > 0 &&
1117 item_index(b.syms[i]) > 0;
1119 int diff = a.syms[i] - b.syms[i];
1123 diff = a.data[i] - b.data[i];
1128 if (i == a.cnt || item_index(a.syms[i]) == 0)
1132 if (i == b.cnt || item_index(b.syms[i]) == 0)
1138 if (type < LR1 || av == -1)
1141 a.data[i] - b.data[i];
1144 And now we can build the list of itemsets. The lookup routine returns
1145 both a success flag and a pointer to where in the list an insert
1146 should happen, so we don't need to search a second time.
1150 struct itemset *next;
1152 struct symset items;
1153 struct symset go_to;
1158 ###### grammar fields
1159 struct itemset *items;
1163 static int itemset_find(struct grammar *g, struct itemset ***where,
1164 struct symset kernel, enum grammar_type type)
1166 struct itemset **ip;
1168 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1169 struct itemset *i = *ip;
1171 diff = itemset_cmp(i->items, kernel, type);
1184 Adding an itemset may require merging the LA sets if LALR analysis is
1185 happening. If any new LA set adds symbols that weren't in the old LA set, we
1186 clear the `completed` flag so that the dependants of this itemset will be
1187 recalculated and their LA sets updated.
1189 `add_itemset` must consume the symsets it is passed, either by adding
1190 them to a data structure, or freeing them.
1192 static int add_itemset(struct grammar *g, struct symset ss,
1193 enum grammar_type type, int starts_line)
1195 struct itemset **where, *is;
1197 int found = itemset_find(g, &where, ss, type);
1199 is = calloc(1, sizeof(*is));
1200 is->state = g->states;
1204 is->go_to = INIT_DATASET;
1205 is->starts_line = starts_line;
1214 for (i = 0; i < ss.cnt; i++) {
1215 struct symset temp = INIT_SYMSET, s;
1216 if (ss.data[i] == is->items.data[i])
1218 s = set_find(g, is->items.data[i]);
1219 symset_union(&temp, &s);
1220 s = set_find(g, ss.data[i]);
1221 if (symset_union(&temp, &s)) {
1222 is->items.data[i] = save_set(g, temp);
1233 To build all the itemsets, we first insert the initial itemset made from the
1234 start symbol, complete each itemset, and then generate new itemsets from old
1235 until no new ones can be made.
1237 Completing an itemset means finding all the items where "DOT" is followed by
1238 a nonterminal and adding "DOT=0" items for every production from that
1239 non-terminal - providing each item hasn't already been added.
1241 If LA sets are needed, the LA set for each new item is found using
1242 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1243 be supplemented by the LA set for the item which produces the new item.
1245 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1246 is used in the next stage.
1248 NOTE: precedence handling should happen here - I haven't written this yet
1251 ###### complete itemset
1252 for (i = 0; i < is->items.cnt; i++) {
1253 int p = item_prod(is->items.syms[i]);
1254 int bs = item_index(is->items.syms[i]);
1255 struct production *pr = g->productions[p];
1258 struct symset LA = INIT_SYMSET;
1259 unsigned short sn = 0;
1261 if (bs == pr->body_size)
1264 if (symset_find(&done, s->num) < 0)
1265 symset_add(&done, s->num, 0);
1266 if (s->type != Nonterminal)
1272 add_first(pr, bs+1, &LA, g, &to_end);
1274 struct symset ss = set_find(g, is->items.data[i]);
1275 symset_union(&LA, &ss);
1277 sn = save_set(g, LA);
1278 LA = set_find(g, sn);
1281 /* Add productions for this symbol */
1282 for (p2 = s->first_production;
1283 p2 < g->production_count &&
1284 g->productions[p2]->head == s;
1286 int itm = item_num(p2, 0);
1287 int pos = symset_find(&is->items, itm);
1289 symset_add(&is->items, itm, sn);
1290 /* Will have re-ordered, so start
1291 * from beginning again */
1293 } else if (type >= LALR) {
1294 struct symset ss = set_find(g, is->items.data[pos]);
1295 struct symset tmp = INIT_SYMSET;
1297 symset_union(&tmp, &ss);
1298 if (symset_union(&tmp, &LA)) {
1299 is->items.data[pos] = save_set(g, tmp);
1307 For each symbol we found (and placed in `done`) we collect all the items for
1308 which this symbol is next, and create a new itemset with "DOT" advanced over
1309 the symbol. This is then added to the collection of itemsets (or merged
1310 with a pre-existing itemset).
1312 ###### derive itemsets
1313 // Now we have a completed itemset, so we need to
1314 // compute all the 'next' itemsets and create them
1315 // if they don't exist.
1316 for (i = 0; i < done.cnt; i++) {
1318 unsigned short state;
1319 int starts_line = 0;
1320 struct symbol *sym = g->symtab[done.syms[i]];
1321 struct symset newitemset = INIT_SYMSET;
1323 newitemset = INIT_DATASET;
1326 (sym->nullable && is->starts_line))
1328 for (j = 0; j < is->items.cnt; j++) {
1329 int itm = is->items.syms[j];
1330 int p = item_prod(itm);
1331 int bp = item_index(itm);
1332 struct production *pr = g->productions[p];
1333 unsigned short la = 0;
1336 if (bp == pr->body_size)
1338 if (pr->body[bp] != sym)
1341 la = is->items.data[j];
1342 pos = symset_find(&newitemset, pr->head->num);
1344 symset_add(&newitemset, item_num(p, bp+1), la);
1345 else if (type >= LALR) {
1346 // Need to merge la set.
1347 int la2 = newitemset.data[pos];
1349 struct symset ss = set_find(g, la2);
1350 struct symset LA = INIT_SYMSET;
1351 symset_union(&LA, &ss);
1352 ss = set_find(g, la);
1353 if (symset_union(&LA, &ss))
1354 newitemset.data[pos] = save_set(g, LA);
1360 state = add_itemset(g, newitemset, type, starts_line);
1361 if (symset_find(&is->go_to, done.syms[i]) < 0)
1362 symset_add(&is->go_to, done.syms[i], state);
1365 All that is left is to create the initial itemset from production zero, and
1366 with `TK_eof` as the LA set.
1369 static void build_itemsets(struct grammar *g, enum grammar_type type)
1371 struct symset first = INIT_SYMSET;
1374 unsigned short la = 0;
1376 // LA set just has eof
1377 struct symset eof = INIT_SYMSET;
1378 symset_add(&eof, TK_eof, 0);
1379 la = save_set(g, eof);
1380 first = INIT_DATASET;
1382 // production 0, offset 0 (with no data)
1383 symset_add(&first, item_num(0, 0), la);
1384 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1385 for (again = 0, is = g->items;
1387 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1389 struct symset done = INIT_SYMSET;
1400 ### Completing the analysis.
1402 The exact process of analysis depends on which level was requested. For
1403 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1404 `LR1` we need first, but not follow. For `SLR` we need both.
1406 We don't build the "action" tables that you might expect as the parser
1407 doesn't use them. They will be calculated to some extent if needed for
1410 Once we have built everything we allocate arrays for the two lists:
1411 symbols and itemsets. This allows more efficient access during reporting.
1412 The symbols are grouped as terminals and non-terminals and we record the
1413 changeover point in `first_nonterm`.
1415 ###### grammar fields
1416 struct symbol **symtab;
1417 struct itemset **statetab;
1420 ###### grammar_analyse
1422 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1426 int snum = TK_reserved;
1427 for (s = g->syms; s; s = s->next)
1428 if (s->num < 0 && s->type == Terminal) {
1432 g->first_nonterm = snum;
1433 for (s = g->syms; s; s = s->next)
1439 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1440 for (s = g->syms; s; s = s->next)
1441 g->symtab[s->num] = s;
1451 build_itemsets(g, type);
1453 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1454 for (is = g->items; is ; is = is->next)
1455 g->statetab[is->state] = is;
1458 ## Reporting on the Grammar
1460 The purpose of the report is to give the grammar developer insight into
1461 how the grammar parser will work. It is basically a structured dump of
1462 all the tables that have been generated, plus a description of any conflicts.
1464 ###### grammar_report
1465 static int grammar_report(struct grammar *g, enum grammar_type type)
1471 return report_conflicts(g, type);
1474 Firstly we have the complete list of symbols, together with the "FIRST"
1475 set if that was generated.
1479 static void report_symbols(struct grammar *g)
1483 printf("SYMBOLS + FIRST:\n");
1485 printf("SYMBOLS:\n");
1487 for (n = 0; n < g->num_syms; n++) {
1488 struct symbol *s = g->symtab[n];
1492 printf(" %c%c%3d%c: ",
1493 s->nullable ? '.':' ',
1494 s->can_eol ? '>':' ',
1495 s->num, symtypes[s->type]);
1498 printf(" (%d%s)", s->precedence,
1499 assoc_names[s->assoc]);
1501 if (g->first && s->type == Nonterminal) {
1504 for (j = 0; j < g->first[n].cnt; j++) {
1507 prtxt(g->symtab[g->first[n].syms[j]]->name);
1514 Then we have the follow sets, if they were computed.
1516 static void report_follow(struct grammar *g)
1519 printf("FOLLOW:\n");
1520 for (n = 0; n < g->num_syms; n++)
1521 if (g->follow[n].cnt) {
1525 prtxt(g->symtab[n]->name);
1526 for (j = 0; j < g->follow[n].cnt; j++) {
1529 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1535 And finally the item sets. These include the GO TO tables and, for
1536 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1537 it up a bit. First the items, with production number and associativity.
1539 static void report_item(struct grammar *g, int itm)
1541 int p = item_prod(itm);
1542 int dot = item_index(itm);
1543 struct production *pr = g->productions[p];
1547 prtxt(pr->head->name);
1549 for (i = 0; i < pr->body_size; i++) {
1550 printf(" %s", (dot == i ? ". ": ""));
1551 prtxt(pr->body[i]->name);
1553 if (dot == pr->body_size)
1557 printf(" (%d%s)", pr->precedence,
1558 assoc_names[pr->assoc]);
1562 The LA sets which are (possibly) reported with each item:
1564 static void report_la(struct grammar *g, int lanum)
1566 struct symset la = set_find(g, lanum);
1570 printf(" LOOK AHEAD(%d)", lanum);
1571 for (i = 0; i < la.cnt; i++) {
1574 prtxt(g->symtab[la.syms[i]]->name);
1579 Then the go to sets:
1582 static void report_goto(struct grammar *g, struct symset gt)
1587 for (i = 0; i < gt.cnt; i++) {
1589 prtxt(g->symtab[gt.syms[i]]->name);
1590 printf(" -> %d\n", gt.data[i]);
1594 Now we can report all the item sets complete with items, LA sets, and GO TO.
1596 static void report_itemsets(struct grammar *g)
1599 printf("ITEM SETS(%d)\n", g->states);
1600 for (s = 0; s < g->states; s++) {
1602 struct itemset *is = g->statetab[s];
1603 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1604 for (j = 0; j < is->items.cnt; j++) {
1605 report_item(g, is->items.syms[j]);
1606 if (is->items.data != NO_DATA)
1607 report_la(g, is->items.data[j]);
1609 report_goto(g, is->go_to);
1613 ### Reporting conflicts
1615 Conflict detection varies a lot among different analysis levels. However
1616 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1617 LR1 are also similar as they have FOLLOW or LA sets.
1621 ## conflict functions
1623 static int report_conflicts(struct grammar *g, enum grammar_type type)
1626 printf("Conflicts:\n");
1628 cnt = conflicts_lr0(g, type);
1630 cnt = conflicts_slr(g, type);
1632 printf(" - no conflicts\n");
1636 LR0 conflicts are any state which has both a reducible item and
1639 LR05 conflicts only occur if two possible reductions exist,
1640 as shifts always over-ride reductions.
1642 ###### conflict functions
1643 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1647 for (i = 0; i < g->states; i++) {
1648 struct itemset *is = g->statetab[i];
1649 int last_reduce = -1;
1650 int prev_reduce = -1;
1651 int last_shift = -1;
1655 for (j = 0; j < is->items.cnt; j++) {
1656 int itm = is->items.syms[j];
1657 int p = item_prod(itm);
1658 int bp = item_index(itm);
1659 struct production *pr = g->productions[p];
1661 if (bp == pr->body_size) {
1662 prev_reduce = last_reduce;
1666 if (pr->body[bp]->type == Terminal)
1669 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1670 printf(" State %d has both SHIFT and REDUCE:\n", i);
1671 report_item(g, is->items.syms[last_shift]);
1672 report_item(g, is->items.syms[last_reduce]);
1675 if (prev_reduce >= 0) {
1676 printf(" State %d has 2 (or more) reducible items\n", i);
1677 report_item(g, is->items.syms[prev_reduce]);
1678 report_item(g, is->items.syms[last_reduce]);
1685 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1686 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1687 in the source of the look ahead set.
1689 We build a dataset mapping terminal to item for possible SHIFTs and then
1690 another for possible REDUCE operations. We report when we get conflicts
1693 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1698 for (i = 0; i < g->states; i++) {
1699 struct itemset *is = g->statetab[i];
1700 struct symset shifts = INIT_DATASET;
1701 struct symset reduce = INIT_DATASET;
1705 /* First collect the shifts */
1706 for (j = 0; j < is->items.cnt; j++) {
1707 unsigned short itm = is->items.syms[j];
1708 int p = item_prod(itm);
1709 int bp = item_index(itm);
1710 struct production *pr = g->productions[p];
1712 if (bp < pr->body_size &&
1713 pr->body[bp]->type == Terminal) {
1715 int sym = pr->body[bp]->num;
1716 if (symset_find(&shifts, sym) < 0)
1717 symset_add(&shifts, sym, itm);
1720 /* Now look for reduction and conflicts */
1721 for (j = 0; j < is->items.cnt; j++) {
1722 unsigned short itm = is->items.syms[j];
1723 int p = item_prod(itm);
1724 int bp = item_index(itm);
1725 struct production *pr = g->productions[p];
1727 if (bp < pr->body_size)
1732 la = g->follow[pr->head->num];
1734 la = set_find(g, is->items.data[j]);
1736 for (k = 0; k < la.cnt; k++) {
1737 int pos = symset_find(&shifts, la.syms[k]);
1739 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1740 prtxt(g->symtab[la.syms[k]]->name);
1742 report_item(g, shifts.data[pos]);
1743 report_item(g, itm);
1746 pos = symset_find(&reduce, la.syms[k]);
1748 symset_add(&reduce, la.syms[k], itm);
1751 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1752 prtxt(g->symtab[la.syms[k]]->name);
1754 report_item(g, itm);
1755 report_item(g, reduce.data[pos]);
1759 symset_free(shifts);
1760 symset_free(reduce);
1766 ## Generating the parser
1768 The exported part of the parser is the `parse_XX` function, where the name
1769 `XX` is based on the name of the parser files.
1771 This takes a `code_node`, a partially initialized `token_config`, and an
1772 optional `FILE` to send tracing to. The `token_config` gets the list of
1773 known words added and then is used with the `code_node` to initialize the
1776 `parse_XX` then call the library function `parser_run` to actually complete
1777 the parse. This needs the `states` table and function to call the various
1778 pieces of code provided in the grammar file, so they are generated first.
1780 ###### parser_generate
1782 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1788 gen_reduce(f, g, file);
1791 fprintf(f, "#line 0 \"gen_parser\"\n");
1792 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1795 fprintf(f, "\tstruct token_state *tokens;\n");
1796 fprintf(f, "\tconfig->words_marks = known;\n");
1797 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1798 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1799 fprintf(f, "\ttokens = token_open(code, config);\n");
1800 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1801 fprintf(f, "\ttoken_close(tokens);\n");
1802 fprintf(f, "\treturn rv;\n");
1803 fprintf(f, "}\n\n");
1806 ### Table words table
1808 The known words table is simply an array of terminal symbols.
1809 The table of nonterminals used for tracing is a similar array.
1813 static void gen_known(FILE *f, struct grammar *g)
1816 fprintf(f, "#line 0 \"gen_known\"\n");
1817 fprintf(f, "static const char *known[] = {\n");
1818 for (i = TK_reserved;
1819 i < g->num_syms && g->symtab[i]->type == Terminal;
1821 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1822 g->symtab[i]->name.txt);
1823 fprintf(f, "};\n\n");
1826 static void gen_non_term(FILE *f, struct grammar *g)
1829 fprintf(f, "#line 0 \"gen_non_term\"\n");
1830 fprintf(f, "static const char *non_term[] = {\n");
1831 for (i = TK_reserved;
1834 if (g->symtab[i]->type == Nonterminal)
1835 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1836 g->symtab[i]->name.txt);
1837 fprintf(f, "};\n\n");
1840 ### States and the goto tables.
1842 For each state we record the goto table, the reducible production if
1843 there is one, or a symbol to shift for error recovery.
1844 Some of the details of the reducible production are stored in the
1845 `do_reduce` function to come later. Here we store the production number,
1846 the body size (useful for stack management) and the resulting symbol (useful
1847 for knowing how to free data later).
1849 The go to table is stored in a simple array of `sym` and corresponding
1852 ###### exported types
1860 const struct lookup * go_to;
1871 static void gen_goto(FILE *f, struct grammar *g)
1874 fprintf(f, "#line 0 \"gen_goto\"\n");
1875 for (i = 0; i < g->states; i++) {
1877 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1879 struct symset gt = g->statetab[i]->go_to;
1880 for (j = 0; j < gt.cnt; j++)
1881 fprintf(f, "\t{ %d, %d },\n",
1882 gt.syms[j], gt.data[j]);
1889 static void gen_states(FILE *f, struct grammar *g)
1892 fprintf(f, "#line 0 \"gen_states\"\n");
1893 fprintf(f, "static const struct state states[] = {\n");
1894 for (i = 0; i < g->states; i++) {
1895 struct itemset *is = g->statetab[i];
1896 int j, prod = -1, prod_len;
1898 int shift_len = 0, shift_remain = 0;
1899 for (j = 0; j < is->items.cnt; j++) {
1900 int itm = is->items.syms[j];
1901 int p = item_prod(itm);
1902 int bp = item_index(itm);
1903 struct production *pr = g->productions[p];
1905 if (bp < pr->body_size) {
1906 if (shift_sym < 0 ||
1907 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1908 shift_sym = pr->body[bp]->num;
1910 shift_remain = pr->body_size - bp;
1914 /* This is what we reduce */
1915 if (prod < 0 || prod_len < pr->body_size) {
1917 prod_len = pr->body_size;
1922 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1923 i, is->go_to.cnt, i, prod,
1924 g->productions[prod]->body_size,
1925 g->productions[prod]->head->num,
1928 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1929 i, is->go_to.cnt, i, shift_sym,
1932 fprintf(f, "};\n\n");
1935 ### The `do_reduce` function and the code
1937 When the parser engine decides to reduce a production, it calls `do_reduce`.
1938 This has two functions.
1940 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1941 production being reduced. Secondly it runs the code that was included with
1942 the production if any.
1944 This code needs to be able to store data somewhere. Rather than requiring
1945 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1946 `do_reduce` return the size to be saved.
1948 The code fragment requires translation when written out. Any `$N` needs to
1949 be converted to a reference either to that buffer (if `$0`) or to the
1950 structure returned by a previous reduction. These pointers need to be cast
1951 to the appropriate type for each access. All this is handled in
1957 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1960 fprintf(f, "\t\t\t");
1961 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1970 if (*c < '0' || *c > '9') {
1975 while (c[1] >= '0' && c[1] <= '9') {
1977 n = n * 10 + *c - '0';
1980 fprintf(f, "(*(struct %.*s*)ret)",
1981 p->head->struct_name.len,
1982 p->head->struct_name.txt);
1983 else if (n > p->body_size)
1984 fprintf(f, "$%d", n);
1985 else if (p->body[n-1]->type == Terminal)
1986 fprintf(f, "(*(struct token *)body[%d])",
1988 else if (p->body[n-1]->struct_name.txt == NULL)
1989 fprintf(f, "$%d", n);
1991 fprintf(f, "(*(struct %.*s*)body[%d])",
1992 p->body[n-1]->struct_name.len,
1993 p->body[n-1]->struct_name.txt, n-1);
2000 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2003 fprintf(f, "#line 0 \"gen_reduce\"\n");
2004 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2006 fprintf(f, "\tint ret_size = 0;\n");
2008 fprintf(f, "\tswitch(prod) {\n");
2009 for (i = 0; i < g->production_count; i++) {
2010 struct production *p = g->productions[i];
2011 fprintf(f, "\tcase %d:\n", i);
2016 if (p->head->struct_name.txt)
2017 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
2018 p->head->struct_name.len,
2019 p->head->struct_name.txt);
2021 fprintf(f, "\t\tbreak;\n");
2023 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2028 As each non-terminal can potentially cause a different type of data
2029 structure to be allocated and filled in, we need to be able to free it when
2032 It is particularly important to have fine control over freeing during error
2033 recovery where individual stack frames might need to be freed.
2035 For this, the grammar author is required to define a `free_XX` function for
2036 each structure that is used by a non-terminal. `do_free` will call whichever
2037 is appropriate given a symbol number, and will call `free` (as is
2038 appropriate for tokens) on any terminal symbol.
2042 static void gen_free(FILE *f, struct grammar *g)
2046 fprintf(f, "#line 0 \"gen_free\"\n");
2047 fprintf(f, "static void do_free(short sym, void *asn)\n");
2049 fprintf(f, "\tif (!asn) return;\n");
2050 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2051 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2052 fprintf(f, "\tswitch(sym) {\n");
2054 for (i = 0; i < g->num_syms; i++) {
2055 struct symbol *s = g->symtab[i];
2057 s->type != Nonterminal ||
2058 s->struct_name.txt == NULL)
2061 fprintf(f, "\tcase %d:\n", s->num);
2062 fprintf(f, "\t\tfree_%.*s(asn);\n",
2064 s->struct_name.txt);
2065 fprintf(f, "\t\tbreak;\n");
2067 fprintf(f, "\t}\n}\n\n");
2070 ## The main routine.
2072 There are three key parts to the "main" function of parsergen: processing
2073 the arguments, loading the grammar file, and dealing with the grammar.
2075 ### Argument processing.
2077 Command line options allow the selection of analysis mode, name of output
2078 file, and whether or not a report should be generated. By default we create
2079 a report only if no code output was requested.
2081 The `parse_XX` function name uses the basename of the output file which
2082 should not have a suffix (`.c`). `.c` is added to the given name for the
2083 code, and `.h` is added for the header (if header text is specified in the
2090 static const struct option long_options[] = {
2091 { "LR0", 0, NULL, '0' },
2092 { "LR05", 0, NULL, '5' },
2093 { "SLR", 0, NULL, 'S' },
2094 { "LALR", 0, NULL, 'L' },
2095 { "LR1", 0, NULL, '1' },
2096 { "tag", 1, NULL, 't' },
2097 { "report", 0, NULL, 'R' },
2098 { "output", 1, NULL, 'o' },
2099 { NULL, 0, NULL, 0 }
2101 const char *options = "05SL1t:Ro:";
2103 ###### process arguments
2105 char *outfile = NULL;
2110 enum grammar_type type = LR05;
2111 while ((opt = getopt_long(argc, argv, options,
2112 long_options, NULL)) != -1) {
2127 outfile = optarg; break;
2129 tag = optarg; break;
2131 fprintf(stderr, "Usage: parsergen ...\n");
2136 infile = argv[optind++];
2138 fprintf(stderr, "No input file given\n");
2141 if (outfile && report == 1)
2144 if (name && strchr(name, '/'))
2145 name = strrchr(name, '/')+1;
2147 if (optind < argc) {
2148 fprintf(stderr, "Excess command line arguments\n");
2152 ### Loading the grammar file
2154 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2157 Once we have extracted the code (with `mdcode`) we expect to find three
2158 sections: header, code, and grammar. Anything else is an error.
2160 "header" and "code" are optional, though it is hard to build a working
2161 parser with neither. "grammar" must be provided.
2165 #include <sys/mman.h>
2170 static void pr_err(char *msg)
2173 fprintf(stderr, "%s\n", msg);
2177 struct section *table;
2181 fd = open(infile, O_RDONLY);
2183 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2184 infile, strerror(errno));
2187 len = lseek(fd, 0, 2);
2188 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2189 table = code_extract(file, file+len, pr_err);
2191 struct code_node *hdr = NULL;
2192 struct code_node *code = NULL;
2193 struct code_node *gram = NULL;
2194 for (s = table; s; s = s->next) {
2195 struct text sec = s->section;
2196 if (tag && !strip_tag(&sec, tag))
2198 if (text_is(sec, "header"))
2200 else if (text_is(sec, "code"))
2202 else if (text_is(sec, "grammar"))
2205 fprintf(stderr, "Unknown content section: %.*s\n",
2206 s->section.len, s->section.txt);
2211 ### Processing the input
2213 As we need to append an extension to a filename and then open it for
2214 writing, and we need to do this twice, it helps to have a separate function.
2218 static FILE *open_ext(char *base, char *ext)
2220 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2222 strcat(strcpy(fn, base), ext);
2228 If we can read the grammar, then we analyse and optionally report on it. If
2229 the report finds conflicts we will exit with an error status.
2231 ###### process input
2232 struct grammar *g = NULL;
2234 fprintf(stderr, "No grammar section provided\n");
2237 g = grammar_read(gram);
2239 fprintf(stderr, "Failure to parse grammar\n");
2244 grammar_analyse(g, type);
2246 if (grammar_report(g, type))
2250 If a headers section is defined, we write it out and include a declaration
2251 for the `parse_XX` function so it can be used from separate code.
2253 if (rv == 0 && hdr && outfile) {
2254 FILE *f = open_ext(outfile, ".h");
2256 code_node_print(f, hdr, infile);
2257 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2261 fprintf(stderr, "Cannot create %s.h\n",
2267 And if all goes well and an output file was provided, we create the `.c`
2268 file with the code section (if any) and the parser tables and function.
2270 if (rv == 0 && outfile) {
2271 FILE *f = open_ext(outfile, ".c");
2274 code_node_print(f, code, infile);
2275 gen_parser(f, g, infile, name);
2278 fprintf(stderr, "Cannot create %s.c\n",
2284 And that about wraps it up. We need to set the locale so that UTF-8 is
2285 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2287 ###### File: parsergen.mk
2288 parsergen : parsergen.o libscanner.o libmdcode.o
2289 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2296 int main(int argc, char *argv[])
2301 setlocale(LC_ALL,"");
2303 ## process arguments
2310 ## The SHIFT/REDUCE parser
2312 Having analysed the grammar and generated all the table, we only need the
2313 shift/reduce engine to bring it all together.
2315 ### Goto table lookup
2317 The parser generator has nicely provided us with goto tables sorted by
2318 symbol number. We need a binary search function to find a symbol in the
2321 ###### parser functions
2323 static int search(const struct state *l, int sym)
2326 int hi = l->go_to_cnt;
2330 while (lo + 1 < hi) {
2331 int mid = (lo + hi) / 2;
2332 if (l->go_to[mid].sym <= sym)
2337 if (l->go_to[lo].sym == sym)
2338 return l->go_to[lo].state;
2343 ### The state stack.
2345 The core data structure for the parser is the stack. This tracks all the
2346 symbols that have been recognised or partially recognised.
2348 The stack usually won't grow very large - maybe a few tens of entries. So
2349 we dynamically resize the array as required but never bother to shrink it
2352 We keep the stack as two separate allocations. One, `asn_stack` stores the
2353 "abstract syntax nodes" which are created by each reduction. When we call
2354 `do_reduce` we need to pass an array of the `asn`s of the body of the
2355 production, and by keeping a separate `asn` stack, we can just pass a
2356 pointer into this stack.
2358 The other allocation stores all other stack fields of which there are four.
2359 The `state` is the most important one and guides the parsing process. The
2360 `sym` is nearly unnecessary. However when we want to free entries from the
2361 `asn_stack`, it helps to know what type they are so we can call the right
2362 freeing function. The symbol leads us to the right free function through
2365 The `indents` count and the `starts_indented` flag track the line
2366 indents in the symbol. These are used to allow indent information to
2367 guide parsing and error recovery.
2369 As well as the stack of frames we have a `next` frame which is
2370 assembled from the incoming token and other information prior to
2371 pushing it onto the stack.
2373 ###### parser functions
2379 short starts_indented;
2381 short newline_permitted;
2390 Two operations are needed on the stack - shift (which is like push) and pop.
2392 Shift applies not only to terminals but also to non-terminals. When we
2393 reduce a production we will pop off entries corresponding to the body
2394 symbols, then push on an item for the head of the production. This last is
2395 exactly the same process as shifting in a terminal so we use the same
2398 To simplify other code we arrange for `shift` to fail if there is no `goto`
2399 state for the symbol. This is useful in basic parsing due to our design
2400 that we shift when we can, and reduce when we cannot. So the `shift`
2401 function reports if it could.
2403 So `shift` finds the next state. If that succeed it extends the allocations
2404 if needed and pushes all the information onto the stacks.
2406 ###### parser functions
2408 static int shift(struct parser *p,
2410 const struct state states[])
2412 // Push an entry onto the stack
2413 int newstate = search(&states[p->next.state], p->next.sym);
2416 if (p->tos >= p->stack_size) {
2417 p->stack_size += 10;
2418 p->stack = realloc(p->stack, p->stack_size
2419 * sizeof(p->stack[0]));
2420 p->asn_stack = realloc(p->asn_stack, p->stack_size
2421 * sizeof(p->asn_stack[0]));
2423 p->stack[p->tos] = p->next;
2424 p->asn_stack[p->tos] = asn;
2426 p->next.state = newstate;
2427 p->next.indents = 0;
2428 p->next.starts_indented = 0;
2429 // if new state doesn't start a line, we inherit newline_permitted status
2430 if (states[newstate].starts_line)
2431 p->next.newline_permitted = 1;
2435 `pop` simply moves the top of stack (`tos`) back down the required amount
2436 and frees any `asn` entries that need to be freed. It is called _after_ we
2437 reduce a production, just before we `shift` the nonterminal in.
2439 ###### parser functions
2441 static void pop(struct parser *p, int num,
2442 void(*do_free)(short sym, void *asn))
2446 for (i = 0; i < num; i++) {
2447 p->next.indents += p->stack[p->tos+i].indents;
2448 do_free(p->stack[p->tos+i].sym,
2449 p->asn_stack[p->tos+i]);
2453 p->next.state = p->stack[p->tos].state;
2454 p->next.starts_indented = p->stack[p->tos].starts_indented;
2455 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2456 if (p->next.indents > p->next.starts_indented)
2457 p->next.newline_permitted = 0;
2461 ### Memory allocation
2463 The `scanner` returns tokens in a local variable - we want them in allocated
2464 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2465 a reduce is in a large buffer. Both of these require some allocation and
2466 copying, hence `memdup` and `tokcopy`.
2468 ###### parser includes
2471 ###### parser functions
2473 void *memdup(void *m, int len)
2479 memcpy(ret, m, len);
2483 static struct token *tok_copy(struct token tk)
2485 struct token *new = malloc(sizeof(*new));
2490 ### The heart of the parser.
2492 Now we have the parser. If we can shift, we do. If not and we can reduce
2493 we do. If the production we reduced was production zero, then we have
2494 accepted the input and can finish.
2496 We return whatever `asn` was returned by reducing production zero.
2498 If we can neither shift nor reduce we have an error to handle. We pop
2499 single entries off the stack until we can shift the `TK_error` symbol, then
2500 drop input tokens until we find one we can shift into the new error state.
2502 When we find `TK_in` and `TK_out` tokens which report indents we need
2503 to handle them directly as the grammar cannot express what we want to
2506 `TK_in` tokens are easy: we simply update the `next` stack frame to
2507 record how many indents there are and that the next token started with
2510 `TK_out` tokens must either be counted off against any pending indent,
2511 or must force reductions until there is a pending indent which isn't
2512 at the start of a production.
2514 ###### parser includes
2517 void *parser_run(struct token_state *tokens,
2518 const struct state states[],
2519 int (*do_reduce)(int, void**, void*),
2520 void (*do_free)(short, void*),
2521 FILE *trace, const char *non_term[], int knowns)
2523 struct parser p = { 0 };
2524 struct token *tk = NULL;
2528 p.next.newline_permitted = states[0].starts_line;
2530 struct token *err_tk;
2532 tk = tok_copy(token_next(tokens));
2533 p.next.sym = tk->num;
2535 parser_trace(trace, &p, tk, states, non_term, knowns);
2537 if (p.next.sym == TK_in) {
2538 p.next.starts_indented = 1;
2544 if (p.next.sym == TK_out) {
2545 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2546 (p.stack[p.tos-1].indents == 1 &&
2547 states[p.next.state].reduce_size > 1)) {
2548 p.stack[p.tos-1].indents -= 1;
2549 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2550 // no internal indent any more, reassess 'newline_permitted'
2551 if (states[p.stack[p.tos-1].state].starts_line)
2552 p.stack[p.tos-1].newline_permitted = 1;
2554 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2560 // fall through and force a REDUCE (as 'shift'
2563 if (p.next.sym == TK_newline) {
2564 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2570 if (shift(&p, tk, states)) {
2574 if (states[p.next.state].reduce_prod >= 0) {
2576 int prod = states[p.next.state].reduce_prod;
2577 int size = states[p.next.state].reduce_size;
2579 static char buf[16*1024];
2580 p.next.sym = states[p.next.state].reduce_sym;
2582 body = p.asn_stack +
2583 (p.tos - states[p.next.state].reduce_size);
2585 bufsize = do_reduce(prod, body, buf);
2587 pop(&p, size, do_free);
2588 shift(&p, memdup(buf, bufsize), states);
2593 if (tk->num == TK_out) {
2594 // Indent problem - synthesise tokens to get us
2596 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2597 p.next.sym = states[p.next.state].shift_sym;
2598 shift(&p, tok_copy(*tk), states);
2599 // FIXME need to report this error somehow
2602 /* Error. We walk up the stack until we
2603 * find a state which will accept TK_error.
2604 * We then shift in TK_error and see what state
2605 * that takes us to.
2606 * Then we discard input tokens until
2607 * we find one that is acceptable.
2610 err_tk = tok_copy(*tk);
2611 p.next.sym = TK_error;
2612 while (shift(&p, err_tk, states) == 0
2614 // discard this state
2615 pop(&p, 1, do_free);
2618 // no state accepted TK_error
2621 while (search(&states[p.next.state], tk->num) < 0 &&
2622 tk->num != TK_eof) {
2624 tk = tok_copy(token_next(tokens));
2625 if (tk->num == TK_in)
2626 p.next.indents += 1;
2627 if (tk->num == TK_out) {
2628 if (p.next.indents == 0)
2630 p.next.indents -= 1;
2633 if (p.tos == 0 && tk->num == TK_eof)
2638 ret = p.asn_stack[0];
2640 pop(&p, p.tos, do_free);
2646 ###### exported functions
2647 void *parser_run(struct token_state *tokens,
2648 const struct state states[],
2649 int (*do_reduce)(int, void**, void*),
2650 void (*do_free)(short, void*),
2651 FILE *trace, const char *non_term[], int knowns);
2655 Being able to visualize the parser in action can be invaluable when
2656 debugging the parser code, or trying to understand how the parse of a
2657 particular grammar progresses. The stack contains all the important
2658 state, so just printing out the stack every time around the parse loop
2659 can make it possible to see exactly what is happening.
2661 This doesn't explicitly show each SHIFT and REDUCE action. However they
2662 are easily deduced from the change between consecutive lines, and the
2663 details of each state can be found by cross referencing the states list
2664 in the stack with the "report" that parsergen can generate.
2666 For terminal symbols, we just dump the token. For non-terminals we
2667 print the name of the symbol. The look ahead token is reported at the
2668 end inside square brackets.
2670 ###### parser functions
2672 static char *reserved_words[] = {
2673 [TK_error] = "ERROR",
2676 [TK_newline] = "NEWLINE",
/* Print one stack frame compactly: "(state:indents" (':' if the frame
 * starts indented, '.' otherwise), then flag characters -- 's' when
 * the state starts a line, 'n' when a newline is permitted -- and a
 * closing paren.  One such group is printed per frame by parser_trace. */
2679 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2681 fprintf(trace, "(%d", f->state);
2683 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2685 if (states[f->state].starts_line)
2686 fprintf(trace, "s");
2687 if (f->newline_permitted)
2688 fprintf(trace, "n");
2689 fprintf(trace, ") ");
/* Dump the entire parse stack to 'trace': for each frame, its state
 * (via parser_trace_state) followed by its symbol -- a reserved-word
 * name, the token text for known terminals, or the non-terminal name.
 * Ends with the pending state and the look-ahead token in brackets. */
2692 void parser_trace(FILE *trace, struct parser *p,
2693 struct token *tk, const struct state states[],
2694 const char *non_term[], int knowns)
2697 for (i = 0; i < p->tos; i++) {
2698 int sym = p->stack[i].sym;
2699 parser_trace_state(trace, &p->stack[i], states);
2700 if (sym < TK_reserved &&
2701 reserved_words[sym] != NULL)
2702 fputs(reserved_words[sym], trace);
2703 else if (sym < TK_reserved + knowns) {
// Known terminal: show up to 20 characters of the token's text.
2704 struct token *t = p->asn_stack[i];
2705 text_dump(trace, t->txt, 20);
// Otherwise a non-terminal: index past the terminals for its name.
2707 fputs(non_term[sym - TK_reserved - knowns],
// Finally the pending (next) state and the look-ahead in brackets.
2711 parser_trace_state(trace, &p->next, states);
2712 fprintf(trace, " [");
2713 if (tk->num < TK_reserved &&
2714 reserved_words[tk->num] != NULL)
2715 fputs(reserved_words[tk->num], trace);
2717 text_dump(trace, tk->txt, 20);
2718 fputs("]\n", trace);
2723 The obvious example for a parser is a calculator.
2725 As `scanner` provides a number parsing function using `libgmp`, it is not
2726 much work to perform arbitrary rational number calculations.
2728 This calculator takes one expression, or an equality test, per line. The
2729 results are printed and if any equality test fails, the program exits with
2732 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2733 better approach, but as the grammar file must have 3 components I need
2734 something like this.
2736 ###### File: parsergen.mk
# Build the calculator demo: extract the "calc"-tagged grammar from this
# document with parsergen, then link against the parser/scanner/mdcode/
# number libraries (ICU for unicode, GMP for rational arithmetic).
2737 calc.c calc.h : parsergen parsergen.mdc
2738 ./parsergen --tag calc -o calc parsergen.mdc
2739 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2740 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2745 // what do we use for a demo-grammar? A calculator of course.
2757 #include <sys/mman.h>
2762 #include "scanner.h"
2768 static void free_number(struct number *n)
/* Demo driver: map the file named in argv[1] and run the calculator
 * over its extracted code section.  Tracing goes to stderr when an
 * extra command-line argument is given.
 * NOTE(review): open/lseek/mmap/code_extract results are unchecked --
 * a missing or empty file will crash; presumably acceptable for a
 * test program, but worth confirming. */
2774 int main(int argc, char *argv[])
2776 int fd = open(argv[1], O_RDONLY);
2777 int len = lseek(fd, 0, 2); /* 2 == SEEK_END: use the offset as file length */
2778 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2779 struct section *s = code_extract(file, file+len, NULL);
2780 struct token_config config = {
2781 .ignored = (1 << TK_line_comment)
2782 | (1 << TK_block_comment)
2785 .number_chars = ".,_+-",
2789 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2795 Session -> Session Line
2798 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2799 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2800 gmp_printf(" or as a decimal: %Fg\n", fl);
2804 | Expression = Expression NEWLINE ${
2805 if (mpq_equal($1.val, $3.val))
2806 gmp_printf("Both equal %Qd\n", $1.val);
2808 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2813 | NEWLINE ${ printf("Blank line\n"); }$
2814 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2817 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2818 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2819 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2821 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2822 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2823 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2825 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2826 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$