1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h` files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
69 [scanner]: scanner.html
75 ###### parser includes
79 The grammar contains production sets, precedence/associativity
80 declarations, and data type declarations. These are all parsed with
81 _ad hoc_ parsing as we don't have a parser generator yet.
83 The precedence and associativity can be set for each production, but
84 can be inherited from symbols. The data type (either structure or a
85 reference to a structure) is potentially defined for each non-terminal
86 and describes what C structure is used to store information about each
90 enum assoc {Left, Right, Non};
91 char *assoc_names[] = {"Left","Right","Non"};
94 struct text struct_name;
97 unsigned short precedence;
101 unsigned short precedence;
109 The strings reported by `mdcode` and `scanner` are `struct text` which have
110 length rather than being null terminated. To help with printing and
111 comparing we define `text_is` and `prtxt`, which should possibly go in
112 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
113 which might contain control characters.
115 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
116 because that is what we need to detect tags.
119 static int text_is(struct text t, char *s)
121 return (strlen(s) == t.len &&
122 strncmp(s, t.txt, t.len) == 0);
124 static void prtxt(struct text t)
126 printf("%.*s", t.len, t.txt);
129 static int strip_tag(struct text *t, char *tag)
131 int skip = strlen(tag) + 1;
132 if (skip >= t->len ||
133 strncmp(t->txt, tag, skip-1) != 0 ||
134 t->txt[skip-1] != ':')
136 while (skip < t->len && t->txt[skip] == ' ')
145 Productions are comprised primarily of symbols - terminal and
146 non-terminal. We do not make a syntactic distinction between the two,
147 though non-terminals must be identifiers. Non-terminal symbols are
148 those which appear in the head of a production, terminal symbols are
149 those which don't. There are also "virtual" symbols used for precedence
150 marking discussed later, and sometimes we won't know what type a symbol
153 ###### forward declarations
154 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
155 char *symtypes = "UVTN";
159 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
160 table of known symbols and the resulting parser will report them as
161 `TK_reserved + N`. A small set of identifiers are reserved for the
162 different token types that `scanner` can report.
165 static char *reserved_words[] = {
166 [TK_error] = "ERROR",
167 [TK_number] = "NUMBER",
168 [TK_ident] = "IDENTIFIER",
170 [TK_string] = "STRING",
171 [TK_multi_string] = "MULTI_STRING",
174 [TK_newline] = "NEWLINE",
180 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
181 recognised. The former is automatically expected at the end of the text
182 being parsed. The latter are always ignored by the parser.
184 All of these symbols are stored in a simple symbol table. We use the
185 `struct text` from `mdcode` to store the name and link them together
186 into a sorted list using an insertion sort.
188 We don't have separate `find` and `insert` functions as any symbol we
189 find needs to be remembered. We simply expect `find` to always return a
190 symbol, but its type might be `Unknown`.
199 ###### grammar fields
204 static struct symbol *sym_find(struct grammar *g, struct text s)
206 struct symbol **l = &g->syms;
211 (cmp = text_cmp((*l)->name, s)) < 0)
215 n = calloc(1, sizeof(*n));
224 static void symbols_init(struct grammar *g)
226 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
228 for (i = 0; i < entries; i++) {
231 t.txt = reserved_words[i];
234 t.len = strlen(t.txt);
241 ### Data types and precedence.
243 Data type specification and precedence specification are both
244 introduced by a dollar sign at the start of the line. If the next
245 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
246 precedence, otherwise it specifies a data type.
248 The data type name is simply stored and applied to the head of all
249 subsequent productions. It must be the name of a structure optionally
250 preceded by an asterisk which means a reference or "pointer". So
251 `$expression` maps to `struct expression` and `$*statement` maps to
252 `struct statement *`.
254 Any productions given before the first data type declaration will have
255 no data type associated with them and can carry no information. In
256 order to allow other non-terminals to have no type, the data type
257 `$void` can be given. This does *not* mean that `struct void` will be
258 used, but rather that no type will be associated with future
261 The precedence line must contain a list of symbols - typically
262 terminal symbols, but not necessarily. It can only contain symbols
263 that have not been seen yet, so precedence declaration must precede
264 the productions that they affect.
266 A precedence line may also contain a "virtual" symbol which is an
267 identifier preceded by `$$`. Virtual terminals carry precedence
268 information but are not included in the grammar. A production can
269 declare that it inherits the precedence of a given virtual symbol.
271 This use for `$$` precludes it from being used as a symbol in the
272 described language. Two other symbols: `${` and `}$` are also
275 Each new precedence line introduces a new precedence level and
276 declares how it associates. This level is stored in each symbol
277 listed and may be inherited by any production which uses the symbol. A
278 production inherits from the last symbol which has a precedence.
280 ###### grammar fields
281 struct text current_type;
286 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
287 static const char *known[] = { "$$", "${", "}$" };
290 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
292 struct token t = token_next(ts);
297 if (t.num != TK_ident) {
298 err = "type or assoc expected after '$'";
301 if (text_is(t.txt, "LEFT"))
303 else if (text_is(t.txt, "RIGHT"))
305 else if (text_is(t.txt, "NON"))
308 g->current_type = t.txt;
309 g->type_isref = isref;
310 if (text_is(t.txt, "void"))
311 g->current_type.txt = NULL;
313 if (t.num != TK_newline) {
314 err = "Extra tokens after type name";
321 err = "$* cannot be followed by a precedence";
325 // This is a precedence line, need some symbols.
329 while (t.num != TK_newline) {
330 enum symtype type = Terminal;
332 if (t.num == TK_virtual) {
335 if (t.num != TK_ident) {
336 err = "$$ must be followed by a word";
339 } else if (t.num != TK_ident &&
341 err = "Illegal token in precedence line";
344 s = sym_find(g, t.txt);
345 if (s->type != Unknown) {
346 err = "Symbols in precedence line must not already be known.";
350 s->precedence = g->prec_levels;
355 err = "No symbols given on precedence line";
359 while (t.num != TK_newline && t.num != TK_eof)
366 A production either starts with an identifier which is the head
367 non-terminal, or a vertical bar (`|`) in which case this production
368 uses the same head as the previous one. The identifier must be
369 followed by a `->` mark. All productions for a given non-terminal must
370 be grouped together with the `nonterminal ->` given only once.
372 After this a (possibly empty) sequence of identifiers and marks form
373 the body of the production. A virtual symbol may be given after the
374 body by preceding it with `$$`. If a virtual symbol is given, the
375 precedence of the production is that for the virtual symbol. If none
376 is given, the precedence is inherited from the last symbol in the
377 production which has a precedence specified.
379 After the optional precedence may come the `${` mark. This indicates
380 the start of a code fragment. If present, this must be on the same
381 line as the start of the production.
383 All of the text from the `${` through to the matching `}$` is
384 collected and forms the code-fragment for the production. It must all
385 be in one `code_node` of the literate code. The `}$` must be
386 at the end of a line.
388 Text in the code fragment will undergo substitutions where `$N` or
389 `$<N`, for some numeric `N`, will be replaced with a variable holding
390 the parse information for the particular symbol in the production.
391 `$0` is the head of the production, `$1` is the first symbol of the
392 body, etc. The type of `$N` for a terminal symbol is `struct token`.
393 For a non-terminal, it is whatever has been declared for that symbol.
394 The `<` may be included for symbols declared as storing a reference
395 (not a structure) and means that the reference is being moved out, so
396 it will not automatically be freed.
398 While building productions we will need to add to an array which needs to
402 static void array_add(void *varray, int *cnt, void *new)
404 void ***array = varray;
407 current = ((*cnt-1) | (step-1))+1;
408 if (*cnt == current) {
411 *array = realloc(*array, current * sizeof(void*));
413 (*array)[*cnt] = new;
417 Collecting the code fragment simply involves reading tokens until we
418 hit the end token `}$`, and noting the character position of the start and
422 static struct text collect_code(struct token_state *state,
427 code.txt = start.txt.txt + start.txt.len;
429 t = token_next(state);
430 while (t.node == start.node &&
431 t.num != TK_close && t.num != TK_error &&
433 if (t.num == TK_close && t.node == start.node)
434 code.len = t.txt.txt - code.txt;
440 Now we have all the bits we need to parse a full production.
445 ###### grammar fields
446 struct production **productions;
447 int production_count;
449 ###### production fields
451 struct symbol **body;
456 int first_production;
459 static char *parse_production(struct grammar *g,
461 struct token_state *state)
463 /* Head has already been parsed. */
466 struct production p, *pp;
468 memset(&p, 0, sizeof(p));
470 tk = token_next(state);
471 while (tk.num == TK_ident || tk.num == TK_mark) {
472 struct symbol *bs = sym_find(g, tk.txt);
473 if (bs->type == Unknown)
475 if (bs->type == Virtual) {
476 err = "Virtual symbol not permitted in production";
479 if (bs->precedence) {
480 p.precedence = bs->precedence;
483 array_add(&p.body, &p.body_size, bs);
484 tk = token_next(state);
486 if (tk.num == TK_virtual) {
488 tk = token_next(state);
489 if (tk.num != TK_ident) {
490 err = "word required after $$";
493 vs = sym_find(g, tk.txt);
494 if (vs->type != Virtual) {
495 err = "symbol after $$ must be virtual";
498 p.precedence = vs->precedence;
500 tk = token_next(state);
502 if (tk.num == TK_open) {
503 p.code = collect_code(state, tk);
504 if (p.code.txt == NULL) {
505 err = "code fragment not closed properly";
508 tk = token_next(state);
510 if (tk.num != TK_newline && tk.num != TK_eof) {
511 err = "stray tokens at end of line";
514 pp = malloc(sizeof(*pp));
516 array_add(&g->productions, &g->production_count, pp);
519 while (tk.num != TK_newline && tk.num != TK_eof)
520 tk = token_next(state);
524 With the ability to parse production and dollar-lines, we have nearly all
525 that we need to parse a grammar from a `code_node`.
527 The head of the first production will effectively be the `start` symbol of
528 the grammar. However it won't _actually_ be so. Processing the grammar is
529 greatly simplified if the real start symbol only has a single production,
530 and expects `$eof` as the final terminal. So when we find the first
531 explicit production we insert an extra production as production zero which
534 ###### Example: production 0
537 where `START` is the first non-terminal given.
539 ###### create production zero
540 struct production *p = calloc(1,sizeof(*p));
541 struct text start = {"$start",6};
542 struct text eof = {"$eof",4};
543 struct text code = {"$0 = $<1;", 9};
544 p->head = sym_find(g, start);
545 p->head->type = Nonterminal;
546 p->head->struct_name = g->current_type;
547 p->head->isref = g->type_isref;
548 if (g->current_type.txt)
550 array_add(&p->body, &p->body_size, head);
551 array_add(&p->body, &p->body_size, sym_find(g, eof));
552 p->head->first_production = g->production_count;
553 array_add(&g->productions, &g->production_count, p);
555 Now we are ready to read in the grammar.
558 static struct grammar *grammar_read(struct code_node *code)
560 struct token_config conf = {
563 .words_marks = known,
564 .known_count = sizeof(known)/sizeof(known[0]),
566 .ignored = (1 << TK_line_comment)
567 | (1 << TK_block_comment)
570 | (1 << TK_multi_string)
575 struct token_state *state = token_open(code, &conf);
577 struct symbol *head = NULL;
581 g = calloc(1, sizeof(*g));
584 for (tk = token_next(state); tk.num != TK_eof;
585 tk = token_next(state)) {
586 if (tk.num == TK_newline)
588 if (tk.num == TK_ident) {
590 head = sym_find(g, tk.txt);
591 if (head->type == Nonterminal)
592 err = "This non-terminal has already be used.";
593 else if (head->type == Virtual)
594 err = "Virtual symbol not permitted in head of production";
596 head->type = Nonterminal;
597 head->struct_name = g->current_type;
598 head->isref = g->type_isref;
599 if (g->production_count == 0) {
600 ## create production zero
602 head->first_production = g->production_count;
603 tk = token_next(state);
604 if (tk.num == TK_mark &&
605 text_is(tk.txt, "->"))
606 err = parse_production(g, head, state);
608 err = "'->' missing in production";
610 } else if (tk.num == TK_mark
611 && text_is(tk.txt, "|")) {
612 // another production for same non-term
614 err = parse_production(g, head, state);
616 err = "First production must have a head";
617 } else if (tk.num == TK_mark
618 && text_is(tk.txt, "$")) {
619 err = dollar_line(state, g, 0);
620 } else if (tk.num == TK_mark
621 && text_is(tk.txt, "$*")) {
622 err = dollar_line(state, g, 1);
624 err = "Unrecognised token at start of line.";
632 fprintf(stderr, "Error at line %d: %s\n",
639 ## Analysing the grammar
641 The central task in analysing the grammar is to determine a set of
642 states to drive the parsing process. This will require finding
643 various sets of symbols and of "items". Some of these sets will need
644 to attach information to each element in the set, so they are more
647 Each "item" may have a 'look-ahead' or `LA` set associated with
648 it. Many of these will be the same as each other. To avoid memory
649 wastage, and to simplify some comparisons of sets, these sets will be
650 stored in a list of unique sets, each assigned a number.
652 Once we have the data structures in hand to manage these sets and
653 lists, we can start setting the 'nullable' flag, build the 'FIRST'
654 sets, and then create the item sets which define the various states.
658 Though we don't only store symbols in these sets, they are the main
659 things we store, so they are called symbol sets or "symsets".
661 A symset has a size, an array of shorts, and an optional array of data
662 values, which are also shorts. If the array of data is not present,
663 we store an impossible pointer, as `NULL` is used to indicate that no
664 memory has been allocated yet.
669 unsigned short *syms, *data;
671 #define NO_DATA ((unsigned short *)1)
672 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
673 const struct symset INIT_DATASET = { 0, NULL, NULL };
675 The arrays of shorts are allocated in blocks of 8 and are kept sorted
676 by using an insertion sort. We don't explicitly record the amount of
677 allocated space as it can be derived directly from the current `cnt` using
678 `((cnt - 1) | 7) + 1`.
681 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
684 int current = ((s->cnt-1) | 7) + 1;
685 if (current == s->cnt) {
687 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
688 if (s->data != NO_DATA)
689 s->data = realloc(s->data, sizeof(*s->data) * current);
692 while (i > 0 && s->syms[i-1] > key) {
693 s->syms[i] = s->syms[i-1];
694 if (s->data != NO_DATA)
695 s->data[i] = s->data[i-1];
699 if (s->data != NO_DATA)
704 Finding a symbol (or item) in a `symset` uses a simple binary search.
705 We return the index where the value was found (so data can be accessed),
706 or `-1` to indicate failure.
708 static int symset_find(struct symset *ss, unsigned short key)
715 while (lo + 1 < hi) {
716 int mid = (lo + hi) / 2;
717 if (ss->syms[mid] <= key)
722 if (ss->syms[lo] == key)
727 We will often want to form the union of two symsets. When we do, we
728 will often want to know if anything changed (as that might mean there
729 is more work to do). So `symset_union` reports whether anything was
730 added to the first set. We use a slow quadratic approach as these
731 sets don't really get very big. If profiling shows this to be a problem it
732 can be optimised later.
734 static int symset_union(struct symset *a, struct symset *b)
738 for (i = 0; i < b->cnt; i++)
739 if (symset_find(a, b->syms[i]) < 0) {
740 unsigned short data = 0;
741 if (b->data != NO_DATA)
743 symset_add(a, b->syms[i], data);
749 And of course we must be able to free a symset.
751 static void symset_free(struct symset ss)
754 if (ss.data != NO_DATA)
760 Some symsets are simply stored somewhere appropriate in the `grammar`
761 data structure, others need a bit of indirection. This applies
762 particularly to "LA" sets. These will be paired with "items" in an
763 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
764 stored in the `data` field, so we need a mapping from a small (short)
765 number to an LA `symset`.
767 As mentioned earlier this involves creating a list of unique symsets.
769 For now, we just use a linear list sorted by insertion. A skiplist
770 would be more efficient and may be added later.
777 struct setlist *next;
780 ###### grammar fields
781 struct setlist *sets;
786 static int ss_cmp(struct symset a, struct symset b)
789 int diff = a.cnt - b.cnt;
792 for (i = 0; i < a.cnt; i++) {
793 diff = (int)a.syms[i] - (int)b.syms[i];
800 static int save_set(struct grammar *g, struct symset ss)
802 struct setlist **sl = &g->sets;
806 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
813 s = malloc(sizeof(*s));
822 Finding a set by number is currently performed by a simple linear search.
823 If this turns out to hurt performance, we can store the sets in a dynamic
824 array like the productions.
826 static struct symset set_find(struct grammar *g, int num)
828 struct setlist *sl = g->sets;
829 while (sl && sl->num != num)
835 ### Setting `nullable`
837 We set `nullable` on the head symbol for any production for which all
838 the body symbols (if any) are nullable. As this is a recursive
839 definition, any change in the `nullable` setting means that we need to
840 re-evaluate where it needs to be set.
842 We simply loop around performing the same calculations until no more
849 static void set_nullable(struct grammar *g)
852 while (check_again) {
855 for (p = 0; p < g->production_count; p++) {
856 struct production *pr = g->productions[p];
859 if (pr->head->nullable)
861 for (s = 0; s < pr->body_size; s++)
862 if (! pr->body[s]->nullable)
864 if (s == pr->body_size) {
865 pr->head->nullable = 1;
872 ### Setting `can_eol` and `starts_line`
874 In order to be able to ignore newline tokens when not relevant, but
875 still include them in the parse when needed, we will need to know
876 which states can start a "line-like" section of code. We ignore
877 newlines when there is an indent since the most recent start of a
880 To know what is line-like, we first need to know which symbols can end
881 a line-like section, which is precisely those which can end with a
882 newline token. These symbols don't necessarily always end with a
883 newline, but they can. Hence they are not described as "lines" but
886 Clearly the `TK_newline` token can end with a newline. Any symbol
887 which is the head of a production that contains a line-ending symbol
888 followed only by nullable symbols is also a line-ending symbol. We
889 use a new field `can_eol` to record this attribute of symbols, and
890 compute it in a repetitive manner similar to `set_nullable`.
897 static void set_can_eol(struct grammar *g)
900 g->symtab[TK_newline]->can_eol = 1;
901 while (check_again) {
904 for (p = 0; p < g->production_count; p++) {
905 struct production *pr = g->productions[p];
908 if (pr->head->can_eol)
911 for (s = pr->body_size - 1; s >= 0; s--) {
912 if (pr->body[s]->can_eol) {
913 pr->head->can_eol = 1;
917 if (!pr->body[s]->nullable)
924 static void set_starts_line(struct grammar *g)
927 for (p = 0; p < g->production_count; p++) {
928 struct production *pr = g->productions[p];
931 for (s = 0; s < pr->body_size - 1; s++)
932 if (pr->body[s]->can_eol)
933 pr->body[s+1]->starts_line = 1;
937 ### Building the `first` sets
939 When calculating what can follow a particular non-terminal, we will need to
940 know what the "first" terminal in any subsequent non-terminal might be. So
941 we calculate the `first` set for every non-terminal and store them in an
942 array. We don't bother recording the "first" set for terminals as they are
945 As the "first" for one symbol might depend on the "first" of another,
946 we repeat the calculations until no changes happen, just like with
947 `set_nullable`. This makes use of the fact that `symset_union`
948 reports if any change happens.
950 The core of this which finds the "first" of part of a production body
951 will be reused for computing the "follow" sets, so we split it out
952 into a separate function.
954 ###### grammar fields
955 struct symset *first;
959 static int add_first(struct production *pr, int start,
960 struct symset *target, struct grammar *g,
965 for (s = start; s < pr->body_size; s++) {
966 struct symbol *bs = pr->body[s];
967 if (bs->type == Terminal) {
968 if (symset_find(target, bs->num) < 0) {
969 symset_add(target, bs->num, 0);
973 } else if (symset_union(target, &g->first[bs->num]))
979 *to_end = (s == pr->body_size);
983 static void build_first(struct grammar *g)
987 g->first = calloc(g->num_syms, sizeof(g->first[0]));
988 for (s = 0; s < g->num_syms; s++)
989 g->first[s] = INIT_SYMSET;
991 while (check_again) {
994 for (p = 0; p < g->production_count; p++) {
995 struct production *pr = g->productions[p];
996 struct symset *head = &g->first[pr->head->num];
998 if (add_first(pr, 0, head, g, NULL))
1004 ### Building the `follow` sets.
1006 There are two different situations when we will want to generate "follow"
1007 sets. If we are doing an SLR analysis, we want to generate a single
1008 "follow" set for each non-terminal in the grammar. That is what is
1009 happening here. If we are doing an LALR or LR analysis we will want
1010 to generate a separate "LA" set for each item. We do that later
1011 in state generation.
1013 There are two parts to generating a "follow" set. Firstly we look at
1014 every place that any non-terminal appears in the body of any
1015 production, and we find the set of possible "first" symbols after
1016 there. This is done using `add_first` above and only needs to be done
1017 once as the "first" sets are now stable and will not change.
1021 for (p = 0; p < g->production_count; p++) {
1022 struct production *pr = g->productions[p];
1025 for (b = 0; b < pr->body_size - 1; b++) {
1026 struct symbol *bs = pr->body[b];
1027 if (bs->type == Terminal)
1029 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1033 The second part is to add the "follow" set of the head of a production
1034 to the "follow" sets of the final symbol in the production, and any
1035 other symbol which is followed only by `nullable` symbols. As this
1036 depends on "follow" itself we need to repeatedly perform the process
1037 until no further changes happen.
1041 for (again = 0, p = 0;
1042 p < g->production_count;
1043 p < g->production_count-1
1044 ? p++ : again ? (again = 0, p = 0)
1046 struct production *pr = g->productions[p];
1049 for (b = pr->body_size - 1; b >= 0; b--) {
1050 struct symbol *bs = pr->body[b];
1051 if (bs->type == Terminal)
1053 if (symset_union(&g->follow[bs->num],
1054 &g->follow[pr->head->num]))
1061 We now just need to create and initialise the `follow` list to get a
1064 ###### grammar fields
1065 struct symset *follow;
1068 static void build_follow(struct grammar *g)
1071 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1072 for (s = 0; s < g->num_syms; s++)
1073 g->follow[s] = INIT_SYMSET;
1077 ### Building itemsets and states
1079 There are three different levels of detail that can be chosen for
1080 building the itemsets and states for the LR grammar. They are:
1082 1. LR(0) or SLR(1), where no look-ahead is considered.
1083 2. LALR(1) where we build look-ahead sets with each item and merge
1084 the LA sets when we find two paths to the same "kernel" set of items.
1085 3. LR(1) where different look-ahead for any item in the set means
1086 a different state must be created.
1088 ###### forward declarations
1089 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1091 We need to be able to look through existing states to see if a newly
1092 generated state already exists. For now we use a simple sorted linked
1095 An item is a pair of numbers: the production number and the position of
1096 "DOT", which is an index into the body of the production.
1097 As the numbers are not enormous we can combine them into a single "short"
1098 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1099 production number (so 4000 productions with maximum of 15 symbols in the
1102 Comparing the itemsets will be a little different to comparing symsets
1103 as we want to do the lookup after generating the "kernel" of an
1104 itemset, so we need to ignore the offset=zero items which are added during
1107 To facilitate this, we modify the "DOT" number so that "0" sorts to
1108 the end of the list in the symset, and then only compare items before
1112 static inline unsigned short item_num(int production, int index)
1114 return production | ((31-index) << 11);
1116 static inline int item_prod(unsigned short item)
1118 return item & 0x7ff;
1120 static inline int item_index(unsigned short item)
1122 return (31-(item >> 11)) & 0x1f;
1125 For LR(1) analysis we need to compare not just the itemset in a state
1126 but also the LA sets. As we assign each unique LA set a number, we
1127 can just compare the symset and the data values together.
1130 static int itemset_cmp(struct symset a, struct symset b,
1131 enum grammar_type type)
1137 i < a.cnt && i < b.cnt &&
1138 item_index(a.syms[i]) > 0 &&
1139 item_index(b.syms[i]) > 0;
1141 int diff = a.syms[i] - b.syms[i];
1145 diff = a.data[i] - b.data[i];
1150 if (i == a.cnt || item_index(a.syms[i]) == 0)
1154 if (i == b.cnt || item_index(b.syms[i]) == 0)
1160 if (type < LR1 || av == -1)
1163 a.data[i] - b.data[i];
1166 And now we can build the list of itemsets. The lookup routine returns
1167 both a success flag and a pointer to where in the list an insert
1168 should happen, so we don't need to search a second time.
1172 struct itemset *next;
1174 struct symset items;
1175 struct symset go_to;
1180 ###### grammar fields
1181 struct itemset *items;
1185 static int itemset_find(struct grammar *g, struct itemset ***where,
1186 struct symset kernel, enum grammar_type type)
1188 struct itemset **ip;
1190 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1191 struct itemset *i = *ip;
1193 diff = itemset_cmp(i->items, kernel, type);
1206 Adding an itemset may require merging the LA sets if LALR analysis is
1207 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1208 clear the `completed` flag so that the dependants of this itemset will be
1209 recalculated and their LA sets updated.
1211 `add_itemset` must consume the symsets it is passed, either by adding
1212 them to a data structure, or freeing them.
1214 static int add_itemset(struct grammar *g, struct symset ss,
1215 enum grammar_type type)
1217 struct itemset **where, *is;
1219 int found = itemset_find(g, &where, ss, type);
1221 is = calloc(1, sizeof(*is));
1222 is->state = g->states;
1226 is->go_to = INIT_DATASET;
1235 for (i = 0; i < ss.cnt; i++) {
1236 struct symset temp = INIT_SYMSET, s;
1237 if (ss.data[i] == is->items.data[i])
1239 s = set_find(g, is->items.data[i]);
1240 symset_union(&temp, &s);
1241 s = set_find(g, ss.data[i]);
1242 if (symset_union(&temp, &s)) {
1243 is->items.data[i] = save_set(g, temp);
1254 To build all the itemsets, we first insert the initial itemset made
1255 from production zero, complete each itemset, and then generate new
1256 itemsets from old until no new ones can be made.
1258 Completing an itemset means finding all the items where "DOT" is followed by
1259 a nonterminal and adding "DOT=0" items for every production from that
1260 non-terminal - providing each item hasn't already been added.
1262 If LA sets are needed, the LA set for each new item is found using
1263 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1264 be supplemented by the LA set for the item which produces the new item.
1266 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1267 is used in the next stage.
1268 If any of these symbols are flagged as starting a line, then this
1269 state must be a `starts_line` state so now is a good time to record that.
1271 NOTE: precedence handling should happen here - I haven't written this yet
1274 ###### complete itemset
1275 for (i = 0; i < is->items.cnt; i++) {
1276 int p = item_prod(is->items.syms[i]);
1277 int bs = item_index(is->items.syms[i]);
1278 struct production *pr = g->productions[p];
1281 struct symset LA = INIT_SYMSET;
1282 unsigned short sn = 0;
1284 if (bs == pr->body_size)
1287 if (symset_find(&done, s->num) < 0) {
1288 symset_add(&done, s->num, 0);
1290 is->starts_line = 1;
1292 if (s->type != Nonterminal)
1298 add_first(pr, bs+1, &LA, g, &to_end);
1300 struct symset ss = set_find(g, is->items.data[i]);
1301 symset_union(&LA, &ss);
1303 sn = save_set(g, LA);
1304 LA = set_find(g, sn);
1307 /* Add productions for this symbol */
1308 for (p2 = s->first_production;
1309 p2 < g->production_count &&
1310 g->productions[p2]->head == s;
1312 int itm = item_num(p2, 0);
1313 int pos = symset_find(&is->items, itm);
1315 symset_add(&is->items, itm, sn);
1316 /* Will have re-ordered, so start
1317 * from beginning again */
1319 } else if (type >= LALR) {
1320 struct symset ss = set_find(g, is->items.data[pos]);
1321 struct symset tmp = INIT_SYMSET;
1323 symset_union(&tmp, &ss);
1324 if (symset_union(&tmp, &LA)) {
1325 is->items.data[pos] = save_set(g, tmp);
1333 For each symbol we found (and placed in `done`) we collect all the items for
1334 which this symbol is next, and create a new itemset with "DOT" advanced over
1335 the symbol. This is then added to the collection of itemsets (or merged
1336 with a pre-existing itemset).
1338 ###### derive itemsets
1339 // Now we have a completed itemset, so we need to
1340 // compute all the 'next' itemsets and create them
1341 // if they don't exist.
1342 for (i = 0; i < done.cnt; i++) {
1344 unsigned short state;
1345 struct symbol *sym = g->symtab[done.syms[i]];
1346 struct symset newitemset = INIT_SYMSET;
1348 newitemset = INIT_DATASET;
1350 for (j = 0; j < is->items.cnt; j++) {
1351 int itm = is->items.syms[j];
1352 int p = item_prod(itm);
1353 int bp = item_index(itm);
1354 struct production *pr = g->productions[p];
1355 unsigned short la = 0;
1358 if (bp == pr->body_size)
1360 if (pr->body[bp] != sym)
1363 la = is->items.data[j];
1364 pos = symset_find(&newitemset, pr->head->num);
1366 symset_add(&newitemset, item_num(p, bp+1), la);
1367 else if (type >= LALR) {
1368 // Need to merge la set.
1369 int la2 = newitemset.data[pos];
1371 struct symset ss = set_find(g, la2);
1372 struct symset LA = INIT_SYMSET;
1373 symset_union(&LA, &ss);
1374 ss = set_find(g, la);
1375 if (symset_union(&LA, &ss))
1376 newitemset.data[pos] = save_set(g, LA);
1382 state = add_itemset(g, newitemset, type);
1383 if (symset_find(&is->go_to, done.syms[i]) < 0)
1384 symset_add(&is->go_to, done.syms[i], state);
1387 All that is left is to create the initial itemset from production zero, and
1388 with `TK_eof` as the LA set.
1391 static void build_itemsets(struct grammar *g, enum grammar_type type)
1393 struct symset first = INIT_SYMSET;
1396 unsigned short la = 0;
1398 // LA set just has eof
1399 struct symset eof = INIT_SYMSET;
1400 symset_add(&eof, TK_eof, 0);
1401 la = save_set(g, eof);
1402 first = INIT_DATASET;
1404 // production 0, offset 0 (with no data)
1405 symset_add(&first, item_num(0, 0), la);
1406 add_itemset(g, first, type);
1407 for (again = 0, is = g->items;
1409 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1411 struct symset done = INIT_SYMSET;
1422 ### Completing the analysis.
1424 The exact process of analysis depends on which level was requested. For
1425 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1426 `LR1` we need first, but not follow. For `SLR` we need both.
1428 We don't build the "action" tables that you might expect as the parser
1429 doesn't use them. They will be calculated to some extent if needed for
1432 Once we have built everything we allocate arrays for the two lists:
1433 symbols and itemsets. This allows more efficient access during reporting.
1434 The symbols are grouped as terminals and non-terminals and we record the
1435 changeover point in `first_nonterm`.
1437 ###### grammar fields
1438 struct symbol **symtab;
1439 struct itemset **statetab;
1442 ###### grammar_analyse
1444 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1448 int snum = TK_reserved;
1449 for (s = g->syms; s; s = s->next)
1450 if (s->num < 0 && s->type == Terminal) {
1454 g->first_nonterm = snum;
1455 for (s = g->syms; s; s = s->next)
1461 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1462 for (s = g->syms; s; s = s->next)
1463 g->symtab[s->num] = s;
1474 build_itemsets(g, type);
1476 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1477 for (is = g->items; is ; is = is->next)
1478 g->statetab[is->state] = is;
1481 ## Reporting on the Grammar
1483 The purpose of the report is to give the grammar developer insight into
1484 how the grammar parser will work. It is basically a structured dump of
1485 all the tables that have been generated, plus a description of any conflicts.
1487 ###### grammar_report
1488 static int grammar_report(struct grammar *g, enum grammar_type type)
1494 return report_conflicts(g, type);
1497 Firstly we have the complete list of symbols, together with the
1498 "FIRST" set if that was generated. We add a mark to each symbol to
1499 show if it can end in a newline (`>`), if it implies the start of a
1500 line (`<`), or if it is nullable (`.`).
1504 static void report_symbols(struct grammar *g)
1508 printf("SYMBOLS + FIRST:\n");
1510 printf("SYMBOLS:\n");
1512 for (n = 0; n < g->num_syms; n++) {
1513 struct symbol *s = g->symtab[n];
1517 printf(" %c%c%c%3d%c: ",
1518 s->nullable ? '.':' ',
1519 s->can_eol ? '>':' ',
1520 s->starts_line ? '<':' ',
1521 s->num, symtypes[s->type]);
1524 printf(" (%d%s)", s->precedence,
1525 assoc_names[s->assoc]);
1527 if (g->first && s->type == Nonterminal) {
1530 for (j = 0; j < g->first[n].cnt; j++) {
1533 prtxt(g->symtab[g->first[n].syms[j]]->name);
1540 Then we have the follow sets if they were computed.
1542 static void report_follow(struct grammar *g)
1545 printf("FOLLOW:\n");
1546 for (n = 0; n < g->num_syms; n++)
1547 if (g->follow[n].cnt) {
1551 prtxt(g->symtab[n]->name);
1552 for (j = 0; j < g->follow[n].cnt; j++) {
1555 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1561 And finally the item sets. These include the GO TO tables and, for
1562 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1563 it up a bit. First the items, with production number and associativity.
1565 static void report_item(struct grammar *g, int itm)
1567 int p = item_prod(itm);
1568 int dot = item_index(itm);
1569 struct production *pr = g->productions[p];
1573 prtxt(pr->head->name);
1575 for (i = 0; i < pr->body_size; i++) {
1576 printf(" %s", (dot == i ? ". ": ""));
1577 prtxt(pr->body[i]->name);
1579 if (dot == pr->body_size)
1583 printf(" (%d%s)", pr->precedence,
1584 assoc_names[pr->assoc]);
1588 The LA sets which are (possibly) reported with each item:
1590 static void report_la(struct grammar *g, int lanum)
1592 struct symset la = set_find(g, lanum);
1596 printf(" LOOK AHEAD(%d)", lanum);
1597 for (i = 0; i < la.cnt; i++) {
1600 prtxt(g->symtab[la.syms[i]]->name);
1605 Then the go to sets:
1608 static void report_goto(struct grammar *g, struct symset gt)
1613 for (i = 0; i < gt.cnt; i++) {
1615 prtxt(g->symtab[gt.syms[i]]->name);
1616 printf(" -> %d\n", gt.data[i]);
1620 Now we can report all the item sets complete with items, LA sets, and GO TO.
1622 static void report_itemsets(struct grammar *g)
1625 printf("ITEM SETS(%d)\n", g->states);
1626 for (s = 0; s < g->states; s++) {
1628 struct itemset *is = g->statetab[s];
1629 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1630 for (j = 0; j < is->items.cnt; j++) {
1631 report_item(g, is->items.syms[j]);
1632 if (is->items.data != NO_DATA)
1633 report_la(g, is->items.data[j]);
1635 report_goto(g, is->go_to);
1639 ### Reporting conflicts
1641 Conflict detection varies a lot among different analysis levels. However
1642 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1643 LR1 are also similar as they have FOLLOW or LA sets.
1647 ## conflict functions
1649 static int report_conflicts(struct grammar *g, enum grammar_type type)
1652 printf("Conflicts:\n");
1654 cnt = conflicts_lr0(g, type);
1656 cnt = conflicts_slr(g, type);
1658 printf(" - no conflicts\n");
1662 LR0 conflicts are any states which have both a reducible item and
1663 a shiftable item, or two reducible items.
1665 LR05 conflicts only occur if two possible reductions exist,
1666 as shifts always over-ride reductions.
1668 ###### conflict functions
1669 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1673 for (i = 0; i < g->states; i++) {
1674 struct itemset *is = g->statetab[i];
1675 int last_reduce = -1;
1676 int prev_reduce = -1;
1677 int last_shift = -1;
1681 for (j = 0; j < is->items.cnt; j++) {
1682 int itm = is->items.syms[j];
1683 int p = item_prod(itm);
1684 int bp = item_index(itm);
1685 struct production *pr = g->productions[p];
1687 if (bp == pr->body_size) {
1688 prev_reduce = last_reduce;
1692 if (pr->body[bp]->type == Terminal)
1695 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1696 printf(" State %d has both SHIFT and REDUCE:\n", i);
1697 report_item(g, is->items.syms[last_shift]);
1698 report_item(g, is->items.syms[last_reduce]);
1701 if (prev_reduce >= 0) {
1702 printf(" State %d has 2 (or more) reducible items\n", i);
1703 report_item(g, is->items.syms[prev_reduce]);
1704 report_item(g, is->items.syms[last_reduce]);
1711 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1712 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1713 in the source of the look ahead set.
1715 We build two datasets to reflect the "action" table: one which maps
1716 terminals to items where that terminal could be shifted and another
1717 which maps terminals to items that could be reduced when the terminal
1718 is in look-ahead. We report when we get conflicts between the two.
1720 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1725 for (i = 0; i < g->states; i++) {
1726 struct itemset *is = g->statetab[i];
1727 struct symset shifts = INIT_DATASET;
1728 struct symset reduce = INIT_DATASET;
1732 /* First collect the shifts */
1733 for (j = 0; j < is->items.cnt; j++) {
1734 unsigned short itm = is->items.syms[j];
1735 int p = item_prod(itm);
1736 int bp = item_index(itm);
1737 struct production *pr = g->productions[p];
1739 if (bp < pr->body_size &&
1740 pr->body[bp]->type == Terminal) {
1742 int sym = pr->body[bp]->num;
1743 if (symset_find(&shifts, sym) < 0)
1744 symset_add(&shifts, sym, itm);
1747 /* Now look for reduction and conflicts */
1748 for (j = 0; j < is->items.cnt; j++) {
1749 unsigned short itm = is->items.syms[j];
1750 int p = item_prod(itm);
1751 int bp = item_index(itm);
1752 struct production *pr = g->productions[p];
1754 if (bp < pr->body_size)
1759 la = g->follow[pr->head->num];
1761 la = set_find(g, is->items.data[j]);
1763 for (k = 0; k < la.cnt; k++) {
1764 int pos = symset_find(&shifts, la.syms[k]);
1766 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1767 prtxt(g->symtab[la.syms[k]]->name);
1769 report_item(g, shifts.data[pos]);
1770 report_item(g, itm);
1773 pos = symset_find(&reduce, la.syms[k]);
1775 symset_add(&reduce, la.syms[k], itm);
1778 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1779 prtxt(g->symtab[la.syms[k]]->name);
1781 report_item(g, itm);
1782 report_item(g, reduce.data[pos]);
1786 symset_free(shifts);
1787 symset_free(reduce);
1793 ## Generating the parser
1795 The exported part of the parser is the `parse_XX` function, where the name
1796 `XX` is based on the name of the parser files.
1798 This takes a `code_node`, a partially initialized `token_config`, and an
1799 optional `FILE` to send tracing to. The `token_config` gets the list of
1800 known words added and then is used with the `code_node` to initialize the
1803 `parse_XX` then calls the library function `parser_run` to actually complete
1804 the parse. This needs the `states` table and function to call the various
1805 pieces of code provided in the grammar file, so they are generated first.
1807 ###### parser_generate
1809 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1815 gen_reduce(f, g, file);
1818 fprintf(f, "#line 0 \"gen_parser\"\n");
1819 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1822 fprintf(f, "\tstruct token_state *tokens;\n");
1823 fprintf(f, "\tconfig->words_marks = known;\n");
1824 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1825 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1826 fprintf(f, "\ttokens = token_open(code, config);\n");
1827 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1828 fprintf(f, "\ttoken_close(tokens);\n");
1829 fprintf(f, "\treturn rv;\n");
1830 fprintf(f, "}\n\n");
1833 ### Known words table
1835 The known words table is simply an array of terminal symbols.
1836 The table of nonterminals used for tracing is a similar array.
1840 static void gen_known(FILE *f, struct grammar *g)
1843 fprintf(f, "#line 0 \"gen_known\"\n");
1844 fprintf(f, "static const char *known[] = {\n");
1845 for (i = TK_reserved;
1846 i < g->num_syms && g->symtab[i]->type == Terminal;
1848 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1849 g->symtab[i]->name.txt);
1850 fprintf(f, "};\n\n");
1853 static void gen_non_term(FILE *f, struct grammar *g)
1856 fprintf(f, "#line 0 \"gen_non_term\"\n");
1857 fprintf(f, "static const char *non_term[] = {\n");
1858 for (i = TK_reserved;
1861 if (g->symtab[i]->type == Nonterminal)
1862 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1863 g->symtab[i]->name.txt);
1864 fprintf(f, "};\n\n");
1867 ### States and the goto tables.
1869 For each state we record the goto table, the reducible production if
1870 there is one, or a symbol to shift for error recovery.
1871 Some of the details of the reducible production are stored in the
1872 `do_reduce` function to come later. Here we store the production number,
1873 the body size (useful for stack management) and the resulting symbol (useful
1874 for knowing how to free data later).
1876 The go to table is stored in a simple array of `sym` and corresponding
1879 ###### exported types
1887 const struct lookup * go_to;
1898 static void gen_goto(FILE *f, struct grammar *g)
1901 fprintf(f, "#line 0 \"gen_goto\"\n");
1902 for (i = 0; i < g->states; i++) {
1904 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1906 struct symset gt = g->statetab[i]->go_to;
1907 for (j = 0; j < gt.cnt; j++)
1908 fprintf(f, "\t{ %d, %d },\n",
1909 gt.syms[j], gt.data[j]);
1916 static void gen_states(FILE *f, struct grammar *g)
1919 fprintf(f, "#line 0 \"gen_states\"\n");
1920 fprintf(f, "static const struct state states[] = {\n");
1921 for (i = 0; i < g->states; i++) {
1922 struct itemset *is = g->statetab[i];
1923 int j, prod = -1, prod_len;
1925 int shift_len = 0, shift_remain = 0;
1926 for (j = 0; j < is->items.cnt; j++) {
1927 int itm = is->items.syms[j];
1928 int p = item_prod(itm);
1929 int bp = item_index(itm);
1930 struct production *pr = g->productions[p];
1932 if (bp < pr->body_size) {
1933 if (shift_sym < 0 ||
1934 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1935 shift_sym = pr->body[bp]->num;
1937 shift_remain = pr->body_size - bp;
1941 /* This is what we reduce */
1942 if (prod < 0 || prod_len < pr->body_size) {
1944 prod_len = pr->body_size;
1949 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1950 i, is->go_to.cnt, i, prod,
1951 g->productions[prod]->body_size,
1952 g->productions[prod]->head->num,
1955 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1956 i, is->go_to.cnt, i, shift_sym,
1959 fprintf(f, "};\n\n");
1962 ### The `do_reduce` function and the code
1964 When the parser engine decides to reduce a production, it calls `do_reduce`.
1965 This has two functions.
1967 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1968 production being reduced. Secondly it runs the code that was included with
1969 the production if any.
1971 This code needs to be able to store data somewhere. Rather than requiring
1972 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1973 `do_reduce` return the size to be saved.
1975 In order for the code to access "global" context, we pass in the
1976 "config" pointer that was passed to parser function. If the `struct
1977 token_config` is embedded in some larger structure, the reducing code
1978 can access the larger structure using pointer manipulation.
1980 The code fragment requires translation when written out. Any `$N` needs to
1981 be converted to a reference either to that buffer (if `$0`) or to the
1982 structure returned by a previous reduction. These pointers need to be cast
1983 to the appropriate type for each access. All this is handled in
1986 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
1987 This applies only to symbols with references (or pointers), not those with structures.
1988 The `<` implies that the reference is being moved out, so the object will not be
1989 automatically freed. This is equivalent to assigning `NULL` to the pointer.
1993 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1996 char *used = calloc(1, p->body_size);
1999 fprintf(f, "\t\t\t");
2000 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2014 if (*c < '0' || *c > '9') {
2021 while (c[1] >= '0' && c[1] <= '9') {
2023 n = n * 10 + *c - '0';
2026 fprintf(f, "(*(struct %.*s*%s)ret)",
2027 p->head->struct_name.len,
2028 p->head->struct_name.txt,
2029 p->head->isref ? "*":"");
2030 else if (n > p->body_size)
2031 fprintf(f, "$%d", n);
2032 else if (p->body[n-1]->type == Terminal)
2033 fprintf(f, "(*(struct token *)body[%d])",
2035 else if (p->body[n-1]->struct_name.txt == NULL)
2036 fprintf(f, "$%d", n);
2038 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2039 p->body[n-1]->struct_name.len,
2040 p->body[n-1]->struct_name.txt,
2041 p->body[n-1]->isref ? "*":"", n-1);
2046 for (i = 0; i < p->body_size; i++) {
2047 if (p->body[i]->struct_name.txt &&
2048 p->body[i]->isref &&
2050 // assume this has been copied out
2051 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2058 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2061 fprintf(f, "#line 0 \"gen_reduce\"\n");
2062 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2064 fprintf(f, "\tint ret_size = 0;\n");
2066 fprintf(f, "\tswitch(prod) {\n");
2067 for (i = 0; i < g->production_count; i++) {
2068 struct production *p = g->productions[i];
2069 fprintf(f, "\tcase %d:\n", i);
2074 if (p->head->struct_name.txt)
2075 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2076 p->head->struct_name.len,
2077 p->head->struct_name.txt,
2078 p->head->isref ? "*":"");
2080 fprintf(f, "\t\tbreak;\n");
2082 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2087 As each non-terminal can potentially cause a different type of data
2088 structure to be allocated and filled in, we need to be able to free it when
2091 It is particularly important to have fine control over freeing during error
2092 recovery where individual stack frames might need to be freed.
2094 For this, the grammar author is required to define a `free_XX` function for
2095 each structure that is used by a non-terminal. `do_free` will call whichever
2096 is appropriate given a symbol number, and will call `free` (as is
2097 appropriate for tokens) on any terminal symbol.
2101 static void gen_free(FILE *f, struct grammar *g)
2105 fprintf(f, "#line 0 \"gen_free\"\n");
2106 fprintf(f, "static void do_free(short sym, void *asn)\n");
2108 fprintf(f, "\tif (!asn) return;\n");
2109 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2110 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2111 fprintf(f, "\tswitch(sym) {\n");
2113 for (i = 0; i < g->num_syms; i++) {
2114 struct symbol *s = g->symtab[i];
2116 s->type != Nonterminal ||
2117 s->struct_name.txt == NULL)
2120 fprintf(f, "\tcase %d:\n", s->num);
2122 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2124 s->struct_name.txt);
2125 fprintf(f, "\t\tfree(asn);\n");
2127 fprintf(f, "\t\tfree_%.*s(asn);\n",
2129 s->struct_name.txt);
2130 fprintf(f, "\t\tbreak;\n");
2132 fprintf(f, "\t}\n}\n\n");
2135 ## The main routine.
2137 There are three key parts to the "main" function of parsergen: processing
2138 the arguments, loading the grammar file, and dealing with the grammar.
2140 ### Argument processing.
2142 Command line options allow the selection of analysis mode, name of output
2143 file, and whether or not a report should be generated. By default we create
2144 a report only if no code output was requested.
2146 The `parse_XX` function name uses the basename of the output file which
2147 should not have a suffix (`.c`). `.c` is added to the given name for the
2148 code, and `.h` is added for the header (if header text is specified in the
2155 static const struct option long_options[] = {
2156 { "LR0", 0, NULL, '0' },
2157 { "LR05", 0, NULL, '5' },
2158 { "SLR", 0, NULL, 'S' },
2159 { "LALR", 0, NULL, 'L' },
2160 { "LR1", 0, NULL, '1' },
2161 { "tag", 1, NULL, 't' },
2162 { "report", 0, NULL, 'R' },
2163 { "output", 1, NULL, 'o' },
2164 { NULL, 0, NULL, 0 }
2166 const char *options = "05SL1t:Ro:";
2168 ###### process arguments
2170 char *outfile = NULL;
2175 enum grammar_type type = LR05;
2176 while ((opt = getopt_long(argc, argv, options,
2177 long_options, NULL)) != -1) {
2192 outfile = optarg; break;
2194 tag = optarg; break;
2196 fprintf(stderr, "Usage: parsergen ...\n");
2201 infile = argv[optind++];
2203 fprintf(stderr, "No input file given\n");
2206 if (outfile && report == 1)
2209 if (name && strchr(name, '/'))
2210 name = strrchr(name, '/')+1;
2212 if (optind < argc) {
2213 fprintf(stderr, "Excess command line arguments\n");
2217 ### Loading the grammar file
2219 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2222 Once we have extracted the code (with `mdcode`) we expect to find three
2223 sections: header, code, and grammar. Anything else that is not
2224 excluded by the `--tag` option is an error.
2226 "header" and "code" are optional, though it is hard to build a working
2227 parser with neither. "grammar" must be provided.
2231 #include <sys/mman.h>
2236 static void pr_err(char *msg)
2239 fprintf(stderr, "%s\n", msg);
2243 struct section *table;
2247 fd = open(infile, O_RDONLY);
2249 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2250 infile, strerror(errno));
2253 len = lseek(fd, 0, 2);
2254 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2255 table = code_extract(file, file+len, pr_err);
2257 struct code_node *hdr = NULL;
2258 struct code_node *code = NULL;
2259 struct code_node *gram = NULL;
2260 for (s = table; s; s = s->next) {
2261 struct text sec = s->section;
2262 if (tag && !strip_tag(&sec, tag))
2264 if (text_is(sec, "header"))
2266 else if (text_is(sec, "code"))
2268 else if (text_is(sec, "grammar"))
2271 fprintf(stderr, "Unknown content section: %.*s\n",
2272 s->section.len, s->section.txt);
2277 ### Processing the input
2279 As we need to append an extension to a filename and then open it for
2280 writing, and we need to do this twice, it helps to have a separate function.
2284 static FILE *open_ext(char *base, char *ext)
2286 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2288 strcat(strcpy(fn, base), ext);
2294 If we can read the grammar, then we analyse and optionally report on it. If
2295 the report finds conflicts we will exit with an error status.
2297 ###### process input
2298 struct grammar *g = NULL;
2300 fprintf(stderr, "No grammar section provided\n");
2303 g = grammar_read(gram);
2305 fprintf(stderr, "Failure to parse grammar\n");
2310 grammar_analyse(g, type);
2312 if (grammar_report(g, type))
2316 If a headers section is defined, we write it out and include a declaration
2317 for the `parse_XX` function so it can be used from separate code.
2319 if (rv == 0 && hdr && outfile) {
2320 FILE *f = open_ext(outfile, ".h");
2322 code_node_print(f, hdr, infile);
2323 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2327 fprintf(stderr, "Cannot create %s.h\n",
2333 And if all goes well and an output file was provided, we create the `.c`
2334 file with the code section (if any) and the parser tables and function.
2336 if (rv == 0 && outfile) {
2337 FILE *f = open_ext(outfile, ".c");
2340 code_node_print(f, code, infile);
2341 gen_parser(f, g, infile, name);
2344 fprintf(stderr, "Cannot create %s.c\n",
2350 And that about wraps it up. We need to set the locale so that UTF-8 is
2351 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2353 ###### File: parsergen.mk
2354 parsergen : parsergen.o libscanner.o libmdcode.o
2355 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2362 int main(int argc, char *argv[])
2367 setlocale(LC_ALL,"");
2369 ## process arguments
2376 ## The SHIFT/REDUCE parser
2378 Having analysed the grammar and generated all the tables, we only need the
2379 shift/reduce engine to bring it all together.
2381 ### Goto table lookup
2383 The parser generator has nicely provided us with goto tables sorted by
2384 symbol number. We need a binary search function to find a symbol in the
2387 ###### parser functions
2389 static int search(const struct state *l, int sym)
2392 int hi = l->go_to_cnt;
2396 while (lo + 1 < hi) {
2397 int mid = (lo + hi) / 2;
2398 if (l->go_to[mid].sym <= sym)
2403 if (l->go_to[lo].sym == sym)
2404 return l->go_to[lo].state;
2409 ### The state stack.
2411 The core data structure for the parser is the stack. This tracks all the
2412 symbols that have been recognised or partially recognised.
2414 The stack usually won't grow very large - maybe a few tens of entries. So
2415 we dynamically resize the array as required but never bother to shrink it
2418 We keep the stack as two separate allocations. One, `asn_stack` stores the
2419 "abstract syntax nodes" which are created by each reduction. When we call
2420 `do_reduce` we need to pass an array of the `asn`s of the body of the
2421 production, and by keeping a separate `asn` stack, we can just pass a
2422 pointer into this stack.
2424 The other allocation stores all other stack fields of which there are six.
2425 The `state` is the most important one and guides the parsing process. The
2426 `sym` is nearly unnecessary. However when we want to free entries from the
2427 `asn_stack`, it helps to know what type they are so we can call the right
2428 freeing function. The symbol leads us to the right free function through
2431 The `indents` count and the `starts_indented` flag track the line
2432 indents in the symbol. These are used to allow indent information to
2433 guide parsing and error recovery.
2435 `since_newline` tracks how many stack frames since the last
2436 start-of-line (whether indented or not). So if `since_newline` is
2437 zero, then this symbol is at the start of a line.
2439 `newline_permitted` keeps track of whether newlines should be ignored
2440 or not, and `starts_line` records if this state started on a newline.
2442 The stack is most properly seen as alternating states and symbols -
2443 states, like the 'DOT' in items, are between symbols. Each frame in
2444 our stack holds a state and the symbol that was before it. The
2445 bottom of stack holds the start state, but no symbol, as nothing came
2446 before the beginning.
2448 ###### parser functions
2453 short newline_permitted;
2456 short starts_indented;
2458 short starts_newline;
2467 Two operations are needed on the stack - shift (which is like push) and pop.
2469 Shift applies not only to terminals but also to non-terminals. When we
2470 reduce a production we will pop off entries corresponding to the body
2471 symbols, then push on an item for the head of the production. This last is
2472 exactly the same process as shifting in a terminal so we use the same
2473 function for both. In both cases we provide a stack frame which
2474 contains the symbol to shift and related indent information.
2476 To simplify other code we arrange for `shift` to fail if there is no `goto`
2477 state for the symbol. This is useful in basic parsing due to our design
2478 that we shift when we can, and reduce when we cannot. So the `shift`
2479 function reports if it could.
2481 `shift` is also used to push state zero onto the stack, so if the
2482 stack is empty, it always chooses zero as the next state.
2484 So `shift` finds the next state. If that succeeds it extends the allocations
2485 if needed and pushes all the information onto the stacks.
2487 Newlines are permitted after a starts_line state until an internal
2488 indent. So we need to find the topmost state which `starts_line` and
2489 see if there are any indents other than immediately after it.
2493 - if state starts_line, then newlines_permitted.
2494 - if any non-initial indents, newlines not permitted
2496 ###### parser functions
2498 static int shift(struct parser *p, struct frame *next,
2500 const struct state states[])
2502 // Push an entry onto the stack
2503 int newstate = p->tos
2504 ? search(&states[p->stack[p->tos-1].state],
2509 if (p->tos >= p->stack_size) {
2510 p->stack_size += 10;
2511 p->stack = realloc(p->stack, p->stack_size
2512 * sizeof(p->stack[0]));
2513 p->asn_stack = realloc(p->asn_stack, p->stack_size
2514 * sizeof(p->asn_stack[0]));
2516 next->state = newstate;
2517 next->newline_permitted = 0;
2519 next->newline_permitted =
2520 (p->stack[p->tos-1].newline_permitted?:-1)+1;
2521 if (next->indents > next->starts_indented)
2522 next->newline_permitted = 0;
2523 if (next->indents && next->newline_permitted > 2)
2524 next->newline_permitted = 0;
2525 if (states[newstate].starts_line)
2526 next->newline_permitted = 1;
2527 p->stack[p->tos] = *next;
2528 p->asn_stack[p->tos] = asn;
2533 `pop` primarily moves the top of stack (`tos`) back down the required
2534 amount and frees any `asn` entries that need to be freed. It also
2535 collects a summary of the indents in the symbols that are being
2536 removed. It is called _after_ we reduce a production, just before we
2537 `shift` the nonterminal in.
2539 `pop` is only called if there are entries to remove, so `num` is never zero.
2541 ###### parser functions
2543 static void pop(struct parser *p, int num, struct frame *next,
2544 void(*do_free)(short sym, void *asn))
2548 next->starts_indented =
2549 p->stack[p->tos].starts_indented;
2550 next->starts_newline =
2551 p->stack[p->tos].starts_newline;
2553 for (i = 0; i < num; i++) {
2554 next->indents += p->stack[p->tos+i].indents;
2555 do_free(p->stack[p->tos+i].sym,
2556 p->asn_stack[p->tos+i]);
2560 ### Memory allocation
2562 The `scanner` returns tokens in a local variable - we want them in allocated
2563 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2564 a reduce is in a large buffer. Both of these require some allocation and
2565 copying, hence `memdup` and `tokcopy`.
2567 ###### parser includes
2570 ###### parser functions
2572 void *memdup(void *m, int len)
2578 memcpy(ret, m, len);
2582 static struct token *tok_copy(struct token tk)
2584 struct token *new = malloc(sizeof(*new));
2589 ### The heart of the parser.
2591 Now we have the parser. If we can shift, we do, though newlines and
2592 reducing indenting may block that. If not and we can reduce we do.
2593 If the production we reduced was production zero, then we have
2594 accepted the input and can finish.
2596 We return whatever `asn` was returned by reducing production zero.
2598 If we can neither shift nor reduce we have an error to handle. We pop
2599 single entries off the stack until we can shift the `TK_error` symbol, then
2600 drop input tokens until we find one we can shift into the new error state.
2602 When we find `TK_in` and `TK_out` tokens which report indents we need
2603 to handle them directly as the grammar cannot express what we want to
2606 `TK_in` tokens are easy: we simply update the `next` stack frame to
2607 record how many indents there are and that the next token started with
2610 `TK_out` tokens must either be counted off against any pending indent,
2611 or must force reductions until there is a pending indent which isn't
2612 at the start of a production.
2614 `TK_newline` tokens are ignored precisely if there has been an indent
2615 since the last state which could have been at the start of a line.
2617 ###### parser includes
/* parser_run: the main LR parse loop.  Shifts when possible, otherwise
 * reduces, and handles TK_in/TK_out/TK_newline indent tokens directly
 * since the grammar cannot express them.  Returns the asn produced by
 * reducing production zero, or enters error recovery via TK_error.
 * NOTE(review): many lines of this function are elided in this excerpt;
 * comments below are hedged accordingly. */
2620 void *parser_run(struct token_state *tokens,
2621 const struct state states[],
2622 int (*do_reduce)(int, void**, struct token_config*, void*),
2623 void (*do_free)(short, void*),
2624 FILE *trace, const char *non_term[],
2625 struct token_config *config)
2627 struct parser p = { 0 };
2628 struct frame next = { 0 };
2629 struct token *tk = NULL;
/* Prime the stack: the start state behaves as though at line start. */
2633 next.starts_newline = 1;
2634 shift(&p, &next, NULL, states);
2636 struct token *err_tk;
2637 struct frame *tos = &p.stack[p.tos-1];
2639 tk = tok_copy(token_next(tokens));
2641 parser_trace(trace, &p, &next, tk, states, non_term, config->known_count);
/* TK_in: record the indent on the pending frame; never shifted itself. */
2643 if (next.sym == TK_in) {
2644 next.starts_indented = 1;
2646 next.starts_newline = 1;
2649 parser_trace_action(trace, "Record");
/* TK_out: cancel against a pending indent, or force reductions until
 * an indent not at the start of a production can absorb it. */
2652 if (next.sym == TK_out) {
2653 if (tos->indents > tos->starts_indented ||
2654 (tos->indents == 1 &&
2655 states[tos->state].reduce_size != 1)) {
2657 if (tos->indents <= tos->starts_indented) {
2658 // no internal indent any more, reassess 'newline_permitted'
2659 if (states[tos->state].starts_line)
2660 tos->newline_permitted = 1;
/* NOTE(review): GNU "?:" extension — inherit permission depth from the
 * frame below, or disable (-1+1 == 0) when that frame has none. */
2662 tos->newline_permitted = (p.stack[p.tos-2].newline_permitted ?:-1)+1;
2666 parser_trace_action(trace, "Cancel");
2669 // fall through and force a REDUCE (as 'shift'
/* TK_newline is discarded unless the current state permits it. */
2672 if (next.sym == TK_newline) {
2673 if (! tos->newline_permitted) {
2676 parser_trace_action(trace, "Discard");
2680 if (shift(&p, &next, tk, states)) {
2681 next.starts_newline =
2682 tk->num == TK_newline;
2683 next.starts_indented = 0;
2686 parser_trace_action(trace, "Shift");
/* No shift possible: try to REDUCE by the state's designated production. */
2689 if (states[tos->state].reduce_prod >= 0) {
2692 const struct state *nextstate = &states[tos->state];
2693 int prod = nextstate->reduce_prod;
2694 int size = nextstate->reduce_size;
/* Reduction results are built in this static scratch buffer, then
 * copied out with memdup below.  NOTE(review): static buffer makes
 * this function non-reentrant — presumably acceptable here. */
2696 static char buf[16*1024];
2698 frame.sym = nextstate->reduce_sym;
2700 body = p.asn_stack + (p.tos - size);
2702 bufsize = do_reduce(prod, body, config, buf);
2705 pop(&p, size, &frame, do_free);
2707 frame.indents = next.indents;
2708 frame.starts_indented = frame.indents;
2709 frame.starts_newline = 0;
2711 next.starts_indented = 0;
2713 res = memdup(buf, bufsize);
2714 memset(buf, 0, bufsize);
2715 if (!shift(&p, &frame, res, states)) {
/* Only production zero (accept) may legitimately fail to shift. */
2716 if (prod != 0) abort();
2720 parser_trace_action(trace, "Reduce");
/* Neither shift nor reduce while a TK_out is pending: synthesise the
 * state's expected symbol to unwedge the indent mismatch. */
2723 if (tk->num == TK_out) {
2724 // Indent problem - synthesise tokens to get us
2726 struct frame frame = { 0 };
2727 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
2728 frame.sym = states[tos->state].shift_sym;
2729 shift(&p, &frame, tok_copy(*tk), states);
2730 // FIXME need to report this error somehow
2731 parser_trace_action(trace, "Synthesize");
2734 /* Error. We walk up the stack until we
2735 * find a state which will accept TK_error.
2736 * We then shift in TK_error and see what state
2737 * that takes us to.
2738 * Then we discard input tokens until
2739 * we find one that is acceptable.
2741 parser_trace_action(trace, "ERROR");
2743 err_tk = tok_copy(*tk);
2744 next.sym = TK_error;
2745 while (shift(&p, &next, err_tk, states) == 0
2747 // discard this state
2748 pop(&p, 1, &next, do_free);
2751 // no state accepted TK_error
/* Error state entered: drop input tokens until one is acceptable. */
2754 tos = &p.stack[p.tos-1];
2755 while (search(&states[tos->state], tk->num) < 0 &&
2756 tk->num != TK_eof) {
2758 tk = tok_copy(token_next(tokens));
2759 if (tk->num == TK_in)
2761 if (tk->num == TK_out) {
2762 if (next.indents == 0)
2767 if (p.tos == 0 && tk->num == TK_eof)
/* Final cleanup: free everything remaining on the stack. */
2772 pop(&p, p.tos, &next, do_free);
2778 ###### exported functions
2779 void *parser_run(struct token_state *tokens,
2780 const struct state states[],
2781 int (*do_reduce)(int, void**, struct token_config*, void*),
2782 void (*do_free)(short, void*),
2783 FILE *trace, const char *non_term[],
2784 struct token_config *config);
2788 Being able to visualize the parser in action can be invaluable when
2789 debugging the parser code, or trying to understand how the parse of a
2790 particular grammar progresses. The stack contains all the important
2791 state, so just printing out the stack every time around the parse loop
2792 can make it possible to see exactly what is happening.
2794 This doesn't explicitly show each SHIFT and REDUCE action. However they
2795 are easily deduced from the change between consecutive lines, and the
2796 details of each state can be found by cross referencing the states list
2797 in the stack with the "report" that parsergen can generate.
2799 For terminal symbols, we just dump the token. For non-terminals we
2800 print the name of the symbol. The look ahead token is reported at the
2801 end inside square brackets.
2803 ###### parser functions
/* Printable names for reserved token symbols, used by the tracer.
 * Designated initializers leave unnamed slots NULL, which callers
 * check before use. NOTE(review): further entries (e.g. TK_in/TK_out)
 * may exist in lines elided from this excerpt. */
2805 static char *reserved_words[] = {
2806 [TK_error] = "ERROR",
2809 [TK_newline] = "NEWLINE",
/* parser_trace_state: print one stack frame's state as "(Ns nK) ":
 * the state number, 's' if that state starts a line, and 'nK' with
 * the newline-permission depth when newlines are permitted. */
2812 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2814 fprintf(trace, "(%d", f->state);
2815 if (states[f->state].starts_line)
2816 fprintf(trace, "s");
2817 if (f->newline_permitted)
2818 fprintf(trace, "n%d", f->newline_permitted);
2819 fprintf(trace, ") ");
/* parser_trace: dump the whole parse stack once per loop iteration.
 * Terminals are shown as their token text (or reserved-word name),
 * non-terminals by symbol name; each frame's indent/newline flags and
 * state follow, and the look-ahead token appears at the end in
 * square brackets.  NOTE(review): some formatting lines are elided
 * from this excerpt. */
2822 void parser_trace(FILE *trace, struct parser *p, struct frame *n,
2823 struct token *tk, const struct state states[],
2824 const char *non_term[], int knowns)
2829 for (i = 0; i < p->tos; i++) {
2830 struct frame *f = &p->stack[i];
/* Reserved symbols print by name, known terminals by token text,
 * everything else by its non-terminal name. */
2833 if (sym < TK_reserved &&
2834 reserved_words[sym] != NULL)
2835 fputs(reserved_words[sym], trace);
2836 else if (sym < TK_reserved + knowns) {
2837 struct token *t = p->asn_stack[i];
2838 text_dump(trace, t->txt, 20);
2840 fputs(non_term[sym - TK_reserved - knowns],
/* ':' marks a frame that started indented, '.' one that did not. */
2843 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2845 if (f->starts_newline)
2849 parser_trace_state(trace, f, states);
/* Finally the look-ahead token, in square brackets. */
2851 fprintf(trace, "[");
2852 if (tk->num < TK_reserved &&
2853 reserved_words[tk->num] != NULL)
2854 fputs(reserved_words[tk->num], trace);
2856 text_dump(trace, tk->txt, 20);
2858 fprintf(trace, "%c%d", n->starts_indented?':':'.',
2860 if (n->starts_newline)
/* parser_trace_action: append the action taken (Shift/Reduce/...) to
 * the trace line started by parser_trace, terminating the line. */
2865 void parser_trace_action(FILE *trace, char *action)
2868 fprintf(trace, " - %s\n", action);
2873 The obvious example for a parser is a calculator.
2875 As `scanner` provides a number parsing function using `libgmp`, it is not
2876 much work to perform arbitrary rational number calculations.
2878 This calculator takes one expression, or an equality test, per line. The
2879 results are printed and if any equality test fails, the program exits with
2882 ###### File: parsergen.mk
2883 calc.c calc.h : parsergen parsergen.mdc
2884 ./parsergen --tag calc -o calc parsergen.mdc
2885 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2886 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2891 // what do we use for a demo-grammar? A calculator of course.
2903 #include <sys/mman.h>
2908 #include "scanner.h"
2914 static void free_number(struct number *n)
/* main: demo driver for the calc grammar — mmap the input file,
 * extract its literate-code sections, and run the generated parser
 * (tracing to stderr when an extra argument is given). */
2920 int main(int argc, char *argv[])
/* NOTE(review): open/lseek/mmap results are unchecked; a missing or
 * unreadable argv[1] will crash.  Acceptable for a demo, but worth
 * confirming this is intentional. */
2922 int fd = open(argv[1], O_RDONLY);
2923 int len = lseek(fd, 0, 2);
2924 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2925 struct section *s = code_extract(file, file+len, NULL);
2926 struct token_config config = {
2927 .ignored = (1 << TK_line_comment)
2928 | (1 << TK_block_comment)
2931 .number_chars = ".,_+-",
/* Trace parsing to stderr only when a second argument is supplied. */
2935 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2937 struct section *t = s->next;
2947 Session -> Session Line
2950 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2951 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2952 gmp_printf(" or as a decimal: %Fg\n", fl);
2956 | Expression = Expression NEWLINE ${
2957 if (mpq_equal($1.val, $3.val))
2958 gmp_printf("Both equal %Qd\n", $1.val);
2960 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2965 | NEWLINE ${ printf("Blank line\n"); }$
2966 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2969 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2970 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2971 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2973 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2974 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2975 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2977 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2978 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$