1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
9 There are several distinct sections.
11 1. `grammar_read` will read a grammar from a literate-code file and
12 store the productions, symbols, and code fragments.
13 2. `grammar_analyse` will take that grammar and build LR parsing
14 states and look-ahead tables.
15 3. `grammar_report` will present the details of the analysis
16 in a readable format and will report any conflicts.
17 4. `parser_generate` will write out C code files with various
18 tables and with the code fragments arranged in useful places.
19 5. `parser_run` is a library function intended to be linked together
20 with the generated parser tables to complete the implementation of
22 6. Finally `calc` is a test grammar for a simple calculator. The
23 `parsergen` program built from the C code in this file can extract
24 that grammar directly from this file and process it.
26 ###### File: parsergen.c
31 ## forward declarations
42 ###### File: libparser.c
49 ###### File: parsergen.mk
52 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
55 ## Reading the grammar
57 The grammar must be stored in a markdown literate code file as parsed by
58 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
59 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
60 present, will be literally copied into the generated `.c` and `.h`
61 files. The third is required and contains the grammar which is
62 tokenised with "[scanner][]". `reduce` can provide variable
63 declarations and common initialization code for all the reduction code
64 fragments in the grammar.
66 If the `--tag` option is given, then any top level header that doesn't
67 start with the tag is ignored, and the tag is stripped from the rest. So
68 `--tag Foo` means that the four sections expected are `Foo: header`,
69 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
70 to extract the sample calculator from this file.
73 [scanner]: scanner.html
79 ###### parser includes
83 The grammar contains production sets, precedence/associativity
84 declarations, general terminal declarations, and data type declarations.
85 These are all parsed with _ad hoc_ parsing as we don't have a parser
88 The precedence and associativity can be set for each production, and
89 can be inherited from symbols. The data type (either structure or a
90 reference to a structure) is potentially defined for each non-terminal
91 and describes what C structure is used to store information about each
95 enum assoc {Left, Right, Non};
96 char *assoc_names[] = {"Left","Right","Non"};
99 struct text struct_name;
102 unsigned short precedence;
106 unsigned short precedence;
114 The strings reported by `mdcode` and `scanner` are `struct text` which
115 have length rather than being nul terminated. To help with printing and
116 comparing we define `text_is` and `prtxt`, which should possibly go in
117 `mdcode`. `scanner` does provide `text_dump` which is useful for
118 strings which might contain control characters.
120 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
121 because that is what we need to detect tags.
124 static int text_is(struct text t, char *s)
126 return (strlen(s) == t.len &&
127 strncmp(s, t.txt, t.len) == 0);
129 static void prtxt(struct text t)
131 printf("%.*s", t.len, t.txt);
134 static int strip_tag(struct text *t, char *tag)
136 int skip = strlen(tag) + 1;
137 if (skip >= t->len ||
138 strncmp(t->txt, tag, skip-1) != 0 ||
139 t->txt[skip-1] != ':')
141 while (skip < t->len && t->txt[skip] == ' ')
150 Productions are comprised primarily of symbols - terminal and
151 non-terminal. We do not make a syntactic distinction between the two,
152 though non-terminals must be identifiers. Non-terminal symbols are
153 those which appear in the head of a production, terminal symbols are
154 those which don't. There are also "virtual" symbols used for precedence
155 marking discussed later, and sometimes we won't know what type a symbol
158 To help with code safety it is possible to declare the terminal symbols.
159 If this is done, then any symbol used in a production that does not
160 appear in the head of any production and is not declared as a terminal
161 is treated as an error.
163 ###### forward declarations
164 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
165 char *symtypes = "UVTN";
168 ###### grammar fields
169 int terminals_declared;
171 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
172 table of known symbols and the resulting parser will report them as
173 `TK_reserved + N`. A small set of identifiers are reserved for the
174 different token types that `scanner` can report, and an even smaller set
175 are reserved for a special token that the parser can generate (`EOL`) as
176 will be described later. This latter set cannot use predefined numbers,
177 so they are marked as `isspecial` for now and will get assigned a number
178 with the non-terminals later.
182 static struct { int num; char *name; } reserved_words[] = {
183 { TK_error, "ERROR" },
184 { TK_number, "NUMBER" },
185 { TK_ident, "IDENTIFIER" },
187 { TK_string, "STRING" },
188 { TK_multi_string, "MULTI_STRING" },
191 { TK_newline, "NEWLINE" },
198 unsigned int isspecial:1;
200 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
201 recognised. The former is automatically expected at the end of the text
202 being parsed. The latter are always ignored by the parser.
204 All of these symbols are stored in a simple symbol table. We use the
205 `struct text` from `mdcode` to store the name and link them together
206 into a sorted list using an insertion sort.
208 We don't have separate `find` and `insert` functions as any symbol we
209 find needs to be remembered. We simply expect `find` to always return a
210 symbol, but its type might be `Unknown`.
219 ###### grammar fields
224 static struct symbol *sym_find(struct grammar *g, struct text s)
226 struct symbol **l = &g->syms;
231 (cmp = text_cmp((*l)->name, s)) < 0)
235 n = calloc(1, sizeof(*n));
244 static void symbols_init(struct grammar *g)
246 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
248 for (i = 0; i < entries; i++) {
251 t.txt = reserved_words[i].name;
252 t.len = strlen(t.txt);
255 s->num = reserved_words[i].num;
260 ### Data types and precedence.
262 Data type specification, precedence specification, and declaration of
263 terminals are all introduced by a dollar sign at the start of the line.
264 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
265 precedence, if it is `TERM` then the line declares terminals without
266 precedence, otherwise it specifies a data type.
268 The data type name is simply stored and applied to the head of all
269 subsequent productions. It must be the name of a structure optionally
270 preceded by an asterisk which means a reference or "pointer". So
271 `$expression` maps to `struct expression` and `$*statement` maps to
272 `struct statement *`.
274 Any productions given before the first data type declaration will have
275 no data type associated with them and can carry no information. In
276 order to allow other non-terminals to have no type, the data type
277 `$void` can be given. This does *not* mean that `struct void` will be
278 used, but rather that no type will be associated with subsequent
281 The precedence line must contain a list of symbols, either terminal
282 symbols or virtual symbols. It can only contain symbols that have not
283 been seen yet, so precedence declaration must precede the productions
286 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
287 carry precedence information but are not included in the grammar. A
288 production can declare that it inherits the precedence of a given
291 This use for `$$` precludes it from being used as a symbol in the
292 described language. Two other symbols: `${` and `}$` are also
295 Each new precedence line introduces a new precedence level and
296 declares how it associates. This level is stored in each symbol
297 listed and may be inherited by any production which uses the symbol. A
298 production inherits from the last symbol which has a precedence.
300 The symbols on the first precedence line have the lowest precedence.
301 Subsequent lines introduce symbols with higher precedence and so bind
304 Note that a declaration line (or "dollar line") can start with either of
305 two different marks: "$" or "$*". The `dollar_line()` function defined
306 here is told which was found in the `isref` argument.
308 ###### grammar fields
309 struct text current_type;
314 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
315 static const char *known[] = { "$$", "${", "}$" };
318 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
320 struct token t = token_next(ts);
326 if (t.num != TK_ident) {
327 err = "type or assoc expected after '$'";
330 if (text_is(t.txt, "LEFT"))
332 else if (text_is(t.txt, "RIGHT"))
334 else if (text_is(t.txt, "NON"))
336 else if (text_is(t.txt, "TERM")) {
338 g->terminals_declared = 1;
340 g->current_type = t.txt;
341 g->type_isref = isref;
342 if (text_is(t.txt, "void"))
343 g->current_type.txt = NULL;
345 if (t.num != TK_newline) {
346 err = "Extra tokens after type name";
353 err = "$* cannot be followed by a precedence";
357 // This is a precedence or TERM line, need some symbols.
361 while (t.num != TK_newline) {
362 enum symtype type = Terminal;
364 if (t.num == TK_virtual) {
367 if (t.num != TK_ident) {
368 err = "$$ must be followed by a word";
372 err = "Virtual symbols not permitted on $TERM line";
375 } else if (t.num != TK_ident &&
377 err = "Illegal token in precedence line";
380 s = sym_find(g, t.txt);
381 if (s->type != Unknown) {
382 err = "Symbols in precedence/TERM line must not already be known.";
387 s->precedence = g->prec_levels;
394 err = "No symbols given on precedence/TERM line";
398 while (t.num != TK_newline && t.num != TK_eof)
405 A production either starts with an identifier which is the head
406 non-terminal, or a vertical bar (`|`) in which case this production
407 uses the same head as the previous one. The identifier must be
408 followed by a `->` mark. All productions for a given non-terminal must
409 be grouped together with the `nonterminal ->` given only once.
411 After this a (possibly empty) sequence of identifiers and marks form
412 the body of the production. A virtual symbol may be given after the
413 body by preceding it with `$$`. If a virtual symbol is given, the
414 precedence of the production is that for the virtual symbol. If none
415 is given, the precedence is inherited from the last symbol in the
416 production which has a precedence specified.
418 After the optional precedence may come the `${` mark. This indicates
419 the start of a code fragment. If present, this must be on the same
420 line as the start of the production.
422 All of the text from the `${` through to the matching `}$` is
423 collected and forms the code-fragment for the production. It must all
424 be in one `code_node` of the literate code. The `}$` must be
425 at the end of a line.
427 Text in the code fragment will undergo substitutions where `$N` or
428 `$<N`, for some numeric `N` (or non-numeric indicator as described
429 later), will be replaced with a variable holding the parse information
430 for the particular symbol in the production. `$0` is the head of the
431 production, `$1` is the first symbol of the body, etc. The type of `$N`
432 for a terminal symbol is `struct token`. For a non-terminal, it is
433 whatever has been declared for that symbol. The `<` may be included and
434 means that the value (usually a reference) is being moved out, so it
435 will not automatically be freed. The effect of using '<' is that the
436 variable is cleared to all-zeros.
438 While building productions we will need to add to an array which needs to
442 static void array_add(void *varray, int *cnt, void *new)
444 void ***array = varray;
447 current = ((*cnt-1) | (step-1))+1;
448 if (*cnt == current) {
451 *array = realloc(*array, current * sizeof(void*));
453 (*array)[*cnt] = new;
457 Collecting the code fragment simply involves reading tokens until we
458 hit the end token `}$`, and noting the character position of the start and
462 static struct text collect_code(struct token_state *state,
467 code.txt = start.txt.txt + start.txt.len;
469 t = token_next(state);
470 while (t.node == start.node &&
471 t.num != TK_close && t.num != TK_error &&
473 if (t.num == TK_close && t.node == start.node)
474 code.len = t.txt.txt - code.txt;
480 Now we have all the bits we need to parse a full production.
485 ###### grammar fields
486 struct production **productions;
487 int production_count;
489 ###### production fields
491 struct symbol **body;
497 int first_production;
500 static char *parse_production(struct grammar *g,
502 struct token_state *state)
504 /* Head has already been parsed. */
507 struct production p, *pp;
509 memset(&p, 0, sizeof(p));
511 tk = token_next(state);
512 while (tk.num == TK_ident || tk.num == TK_mark) {
513 struct symbol *bs = sym_find(g, tk.txt);
514 if (bs->type == Unknown) {
515 if (!g->terminals_declared)
518 if (bs->type == Virtual) {
519 err = "Virtual symbol not permitted in production";
522 if (bs->precedence) {
523 p.precedence = bs->precedence;
526 array_add(&p.body, &p.body_size, bs);
527 tk = token_next(state);
529 if (tk.num == TK_virtual) {
531 tk = token_next(state);
532 if (tk.num != TK_ident) {
533 err = "word required after $$";
536 vs = sym_find(g, tk.txt);
537 if (vs->precedence == 0) {
538 err = "symbol after $$ must have precedence";
541 p.precedence = vs->precedence;
544 tk = token_next(state);
546 if (tk.num == TK_open) {
547 p.code_line = tk.line;
548 p.code = collect_code(state, tk);
549 if (p.code.txt == NULL) {
550 err = "code fragment not closed properly";
553 tk = token_next(state);
556 if (tk.num != TK_newline && tk.num != TK_eof) {
557 err = "stray tokens at end of line";
560 pp = malloc(sizeof(*pp));
562 array_add(&g->productions, &g->production_count, pp);
565 while (tk.num != TK_newline && tk.num != TK_eof)
566 tk = token_next(state);
571 With the ability to parse productions and dollar-lines, we have nearly all
572 that we need to parse a grammar from a `code_node`.
574 The head of the first production will effectively be the `start` symbol
575 of the grammar. However it won't _actually_ be so. Processing the
576 grammar is greatly simplified if the real start symbol only has a single
577 production, and expects `$eof` as the final terminal. So when we find
578 the first explicit production we insert an extra implicit production as
579 production zero which looks like
581 ###### Example: production 0
584 where `START` is the first non-terminal given.
586 ###### create production zero
587 struct production *p = calloc(1,sizeof(*p));
588 struct text start = {"$start",6};
589 struct text eof = {"$eof",4};
590 struct text code = {"$0 = $<1;", 9};
591 p->head = sym_find(g, start);
592 p->head->type = Nonterminal;
593 p->head->struct_name = g->current_type;
594 p->head->isref = g->type_isref;
595 if (g->current_type.txt)
597 array_add(&p->body, &p->body_size, head);
598 array_add(&p->body, &p->body_size, sym_find(g, eof));
599 p->head->first_production = g->production_count;
600 array_add(&g->productions, &g->production_count, p);
602 Now we are ready to read in the grammar. We ignore comments
603 and strings so that the marks which introduce them can be
604 used as terminals in the grammar. We don't ignore numbers
605 even though we don't allow them as that causes the scanner
606 to produce errors that the parser is better positioned to handle.
607 We also ignore indentation, but we expect and use newlines.
609 To allow comments in the grammar, we explicitly check for "//" as the
610 first token on the line and those lines are skipped. "//" can still be
611 used as a terminal anywhere that a terminal is expected.
614 static struct grammar *grammar_read(struct code_node *code)
616 struct token_config conf = {
619 .words_marks = known,
620 .known_count = sizeof(known)/sizeof(known[0]),
622 .ignored = (1 << TK_line_comment)
623 | (1 << TK_block_comment)
626 | (1 << TK_multi_string)
631 struct token_state *state = token_open(code, &conf);
633 struct symbol *head = NULL;
637 g = calloc(1, sizeof(*g));
640 for (tk = token_next(state); tk.num != TK_eof;
641 tk = token_next(state)) {
642 if (tk.num == TK_newline)
644 if (tk.num == TK_ident) {
646 head = sym_find(g, tk.txt);
647 if (head->type == Nonterminal)
648 err = "This non-terminal has already be used.";
649 else if (head->type == Virtual)
650 err = "Virtual symbol not permitted in head of production";
652 head->type = Nonterminal;
653 head->struct_name = g->current_type;
654 head->isref = g->type_isref;
655 if (g->production_count == 0) {
656 ## create production zero
658 head->first_production = g->production_count;
659 tk = token_next(state);
660 if (tk.num == TK_mark &&
661 text_is(tk.txt, "->"))
662 err = parse_production(g, head, state);
664 err = "'->' missing in production";
666 } else if (tk.num == TK_mark
667 && text_is(tk.txt, "|")) {
668 // another production for same non-term
670 err = parse_production(g, head, state);
672 err = "First production must have a head";
673 } else if (tk.num == TK_mark
674 && text_is(tk.txt, "$")) {
675 err = dollar_line(state, g, 0);
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "$*")) {
678 err = dollar_line(state, g, 1);
679 } else if (tk.num == TK_mark
680 && text_is(tk.txt, "//")) {
681 while (tk.num != TK_newline &&
683 tk = token_next(state);
685 err = "Unrecognised token at start of line.";
691 if (g->terminals_declared) {
694 for (s = g->syms; s; s = s->next) {
695 if (s->type != Unknown)
698 fprintf(stderr, "Token %.*s not declared\n",
699 s->name.len, s->name.txt);
702 free(g); // FIXME free content
708 fprintf(stderr, "Error at line %d: %s\n",
711 free(g); // FIXME free content
715 ## Analysing the grammar
717 The central task in analysing the grammar is to determine a set of
718 states to drive the parsing process. This will require finding various
719 sets of symbols and of "items". Some of these sets will need to attach
720 information to each element in the set, so they are more like maps.
722 Each "item" may have a 'look-ahead' or `LA` set associated with it.
723 Many of these will be the same as each other. To avoid memory wastage,
724 and to simplify some comparisons of sets, these sets will be stored in a
725 list of unique sets, each assigned a number.
727 Once we have the data structures in hand to manage these sets and lists,
728 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
729 then create the item sets which define the various states.
733 Though we don't only store symbols in these sets, they are the main
734 things we store, so they are called symbol sets or "symsets".
736 A symset has a size, an array of shorts, and an optional array of data
737 values, which are also shorts. If the array of data is not present, we
738 store an impossible pointer, as `NULL` is used to indicate that no
739 memory has been allocated yet.
744 unsigned short *syms, *data;
746 #define NO_DATA ((unsigned short *)1)
747 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
748 const struct symset INIT_DATASET = { 0, NULL, NULL };
750 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
751 using an insertion sort. We don't explicitly record the amount of
752 allocated space as it can be derived directly from the current `cnt`
753 using `((cnt - 1) | 7) + 1`.
756 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
759 int current = ((s->cnt-1) | 7) + 1;
760 if (current == s->cnt) {
762 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
763 if (s->data != NO_DATA)
764 s->data = realloc(s->data, sizeof(*s->data) * current);
767 while (i > 0 && s->syms[i-1] > key) {
768 s->syms[i] = s->syms[i-1];
769 if (s->data != NO_DATA)
770 s->data[i] = s->data[i-1];
774 if (s->data != NO_DATA)
779 Finding a symbol (or item) in a `symset` uses a simple binary search.
780 We return the index where the value was found (so data can be accessed),
781 or `-1` to indicate failure.
783 static int symset_find(struct symset *ss, unsigned short key)
790 while (lo + 1 < hi) {
791 int mid = (lo + hi) / 2;
792 if (ss->syms[mid] <= key)
797 if (ss->syms[lo] == key)
802 We will often want to form the union of two symsets. When we do, we
803 will often want to know if anything changed (as that might mean there is
804 more work to do). So `symset_union` reports whether anything was added
805 to the first set. We use a slow quadratic approach as these sets are
806 rarely large. If profiling shows this to be a problem it can be
809 static int symset_union(struct symset *a, struct symset *b)
813 for (i = 0; i < b->cnt; i++)
814 if (symset_find(a, b->syms[i]) < 0) {
815 unsigned short data = 0;
816 if (b->data != NO_DATA)
818 symset_add(a, b->syms[i], data);
824 And of course we must be able to free a symset.
826 static void symset_free(struct symset ss)
829 if (ss.data != NO_DATA)
835 Some symsets are simply stored somewhere appropriate in the `grammar`
836 data structure, others need a bit of indirection. This applies
837 particularly to "LA" sets. These will be paired with "items" in an
838 "itemset". As itemsets will be stored in a symset, the "LA" set needs
839 to be stored in the `data` field, so we need a mapping from a small
840 (short) number to an LA `symset`.
842 As mentioned earlier this involves creating a list of unique symsets.
844 For now, we just use a linear list sorted by insertion. A skiplist
845 would be more efficient and may be added later.
852 struct setlist *next;
855 ###### grammar fields
856 struct setlist *sets;
861 static int ss_cmp(struct symset a, struct symset b)
864 int diff = a.cnt - b.cnt;
867 for (i = 0; i < a.cnt; i++) {
868 diff = (int)a.syms[i] - (int)b.syms[i];
875 static int save_set(struct grammar *g, struct symset ss)
877 struct setlist **sl = &g->sets;
881 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
888 s = malloc(sizeof(*s));
897 Finding a set by number is currently performed by a simple linear
898 search. If this turns out to hurt performance, we can store the sets in
899 a dynamic array like the productions.
901 static struct symset set_find(struct grammar *g, int num)
903 struct setlist *sl = g->sets;
904 while (sl && sl->num != num)
909 ### Setting `nullable`
911 We set `nullable` on the head symbol for any production for which all
912 the body symbols (if any) are nullable. As this is a recursive
913 definition, any change in the `nullable` setting means that we need to
914 re-evaluate where it needs to be set.
916 We simply loop around performing the same calculations until no more
923 static void set_nullable(struct grammar *g)
926 while (check_again) {
929 for (p = 0; p < g->production_count; p++) {
930 struct production *pr = g->productions[p];
933 if (pr->head->nullable)
935 for (s = 0; s < pr->body_size; s++)
936 if (! pr->body[s]->nullable)
938 if (s == pr->body_size) {
939 pr->head->nullable = 1;
946 ### Building the `first` sets
948 When calculating what can follow a particular non-terminal, we will need
949 to know what the "first" terminal in any subsequent non-terminal might
950 be. So we calculate the `first` set for every non-terminal and store
951 them in an array. We don't bother recording the "first" set for
952 terminals as they are trivial.
954 As the "first" for one symbol might depend on the "first" of another, we
955 repeat the calculations until no changes happen, just like with
956 `set_nullable`. This makes use of the fact that `symset_union` reports
957 if any change happens.
959 The core of this, which finds the "first" of part of a production body,
960 will be reused for computing the "follow" sets, so we split that out
961 into a separate function, `add_first`, which also reports if it got all
962 the way to the end of the production without finding a non-nullable
965 ###### grammar fields
966 struct symset *first;
970 static int add_first(struct production *pr, int start,
971 struct symset *target, struct grammar *g,
976 for (s = start; s < pr->body_size; s++) {
977 struct symbol *bs = pr->body[s];
978 if (bs->type == Terminal) {
979 if (symset_find(target, bs->num) < 0) {
980 symset_add(target, bs->num, 0);
984 } else if (symset_union(target, &g->first[bs->num]))
990 *to_end = (s == pr->body_size);
994 static void build_first(struct grammar *g)
998 g->first = calloc(g->num_syms, sizeof(g->first[0]));
999 for (s = 0; s < g->num_syms; s++)
1000 g->first[s] = INIT_SYMSET;
1002 while (check_again) {
1005 for (p = 0; p < g->production_count; p++) {
1006 struct production *pr = g->productions[p];
1007 struct symset *head = &g->first[pr->head->num];
1009 if (add_first(pr, 0, head, g, NULL))
1015 ### Building the `follow` sets.
1017 There are two different situations when we will want to generate
1018 "follow" sets. If we are doing an SLR analysis, we want to generate a
1019 single "follow" set for each non-terminal in the grammar. That is what
1020 is happening here. If we are doing an LALR or LR analysis we will want
1021 to generate a separate "LA" set for each item. We do that later in
1024 There are two parts to generating a "follow" set. Firstly we look at
1025 every place that any non-terminal appears in the body of any production,
1026 and we find the set of possible "first" symbols after there. This is
1027 done using `add_first` above and only needs to be done once as the
1028 "first" sets are now stable and will not change.
1030 ###### grammar fields
1031 struct symset *follow;
1035 for (p = 0; p < g->production_count; p++) {
1036 struct production *pr = g->productions[p];
1039 for (b = 0; b < pr->body_size - 1; b++) {
1040 struct symbol *bs = pr->body[b];
1041 if (bs->type == Terminal)
1043 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1047 The second part is to add the "follow" set of the head of a production
1048 to the "follow" sets of the final symbol in the production, and any
1049 other symbol which is followed only by `nullable` symbols. As this
1050 depends on "follow" itself we need to repeatedly perform the process
1051 until no further changes happen.
1053 Rather than 2 nested loops, one that repeats the whole process until
1054 there is no change, and one that iterates over the set of productions, we
1055 combine these two functions into a single loop.
1059 for (check_again = 0, p = 0;
1060 p < g->production_count;
1061 p < g->production_count-1
1062 ? p++ : check_again ? (check_again = 0, p = 0)
1064 struct production *pr = g->productions[p];
1067 for (b = pr->body_size - 1; b >= 0; b--) {
1068 struct symbol *bs = pr->body[b];
1069 if (bs->type == Terminal)
1071 if (symset_union(&g->follow[bs->num],
1072 &g->follow[pr->head->num]))
1079 We now just need to create and initialise the `follow` list to get a
1083 static void build_follow(struct grammar *g)
1085 int s, check_again, p;
1086 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1087 for (s = 0; s < g->num_syms; s++)
1088 g->follow[s] = INIT_SYMSET;
1092 ### Building itemsets and states
1094 There are three different levels of detail that can be chosen for
1095 building the itemsets and states for the LR grammar. They are:
1097 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1098 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1100 2. LALR(1) where we build look-ahead sets with each item and merge
1101 the LA sets when we find two paths to the same "kernel" set of items.
1102 3. LR(1) where different look-ahead for any item in the set means
1103 a different item set must be created.
1105 ###### forward declarations
1106 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1108 We need to be able to look through existing states to see if a newly
1109 generated state already exists. For now we use a simple sorted linked
1112 An item is a pair of numbers: the production number and the position of
1113 "DOT", which is an index into the body of the production. As the
1114 numbers are not enormous we can combine them into a single "short" and
1115 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1116 production number (so 2000 productions with maximum of 31 symbols in the
1119 Comparing the itemsets will be a little different to comparing symsets
1120 as we want to do the lookup after generating the "kernel" of an
1121 itemset, so we need to ignore the offset=zero items which are added during
1124 To facilitate this, we modify the "DOT" number so that "0" sorts to
1125 the end of the list in the symset, and then only compare items before
1129 static inline unsigned short item_num(int production, int dot)
1131 return production | ((31-dot) << 11);
1133 static inline int item_prod(unsigned short item)
1135 return item & 0x7ff;
1137 static inline int item_dot(unsigned short item)
1139 return (31-(item >> 11)) & 0x1f;
1142 For LR(1) analysis we need to compare not just the itemset in a state
1143 but also the LA sets. As we assign each unique LA set a number, we
1144 can just compare the symset and the data values together.
1147 static int itemset_cmp(struct symset a, struct symset b,
1148 enum grammar_type type)
1154 i < a.cnt && i < b.cnt &&
1155 item_dot(a.syms[i]) > 0 &&
1156 item_dot(b.syms[i]) > 0;
1158 int diff = a.syms[i] - b.syms[i];
1162 diff = a.data[i] - b.data[i];
1167 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1171 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1177 if (type < LR1 || av == -1)
1180 a.data[i] - b.data[i];
1183 It will be helpful to know if an itemset has been "completed" or not,
1184 particularly for LALR where itemsets get merged, at which point they
1185 need to be considered for completion again. So a `completed` flag is
1188 And now we can build the list of itemsets. The lookup routine returns
1189 both a success flag and a pointer to where in the list an insert should
1190 happen, so we don't need to search a second time.
1194 struct itemset *next;
1196 struct symset items;
1197 struct symset go_to;
1199 unsigned short precedence;
1203 ###### grammar fields
1204 struct itemset *items;
1208 static int itemset_find(struct grammar *g, struct itemset ***where,
1209 struct symset kernel, enum grammar_type type)
1211 struct itemset **ip;
1213 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1214 struct itemset *i = *ip;
1216 diff = itemset_cmp(i->items, kernel, type);
1229 Adding an itemset may require merging the LA sets if LALR analysis is
1230 happening. If any new LA set adds any symbols that weren't in the old
1231 LA set, we clear the `completed` flag so that the dependants of this
1232 itemset will be recalculated and their LA sets updated.
1234 `add_itemset` must consume the symsets it is passed, either by adding
1235 them to a data structure, or freeing them.
1237 static int add_itemset(struct grammar *g, struct symset ss,
1238 enum assoc assoc, unsigned short precedence,
1239 enum grammar_type type)
1241 struct itemset **where, *is;
1243 int found = itemset_find(g, &where, ss, type);
1245 is = calloc(1, sizeof(*is));
1246 is->state = g->states;
1250 is->precedence = precedence;
1252 is->go_to = INIT_DATASET;
1261 for (i = 0; i < ss.cnt; i++) {
1262 struct symset temp = INIT_SYMSET, s;
1263 if (ss.data[i] == is->items.data[i])
1265 s = set_find(g, is->items.data[i]);
1266 symset_union(&temp, &s);
1267 s = set_find(g, ss.data[i]);
1268 if (symset_union(&temp, &s)) {
1269 is->items.data[i] = save_set(g, temp);
1280 To build all the itemsets, we first insert the initial itemset made from
1281 production zero, complete each itemset, and then generate new itemsets
1282 from old until no new ones can be made.
1284 Completing an itemset means finding all the items where "DOT" is
1285 followed by a nonterminal and adding "DOT=0" items for every production
1286 from that non-terminal - providing each item hasn't already been added.
1287 When we add such an item it might get inserted before the current
1288 one, so to make sure we process it we reset the loop counter to the
1291 If LA sets are needed, the LA set for each new item is found using
1292 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1293 may be supplemented by the LA set for the item which produced the new
1296 We also collect a set of all symbols which follow "DOT" (in `done`) as
1297 this is used in the next stage.
1299 When itemsets are created we assign a precedence to the itemset from the
1300 complete item, if there is one. We ignore the possibility of there
1301 being two and don't (currently) handle precedence in such grammars.
1302 When completing a grammar we ignore any item where DOT is followed by a
1303 terminal with a precedence lower than that for the itemset. Unless the
1304 terminal has right associativity, we also ignore items where the
1305 terminal has the same precedence. The result is that unwanted items are
1306 still in the itemset, but the terminal doesn't get into the go to set,
1307 so the item is ineffective.
1309 ###### complete itemset
1310 for (i = 0; i < is->items.cnt; i++) {
1311 int p = item_prod(is->items.syms[i]);
1312 int bs = item_dot(is->items.syms[i]);
1313 struct production *pr = g->productions[p];
1316 struct symset LA = INIT_SYMSET;
1317 unsigned short sn = 0;
1319 if (bs == pr->body_size)
1322 if (s->precedence && is->precedence &&
1323 is->precedence > s->precedence)
1324 /* This terminal has a low precedence and
1325 * shouldn't be shifted
1328 if (s->precedence && is->precedence &&
1329 is->precedence == s->precedence && s->assoc != Right)
1330 /* This terminal has a matching precedence and is
1331 * not Right-associative, so we mustn't shift it.
1334 if (symset_find(&done, s->num) < 0)
1335 symset_add(&done, s->num, 0);
1337 if (s->type != Nonterminal)
1343 add_first(pr, bs+1, &LA, g, &to_end);
1345 struct symset ss = set_find(g, is->items.data[i]);
1346 symset_union(&LA, &ss);
1348 sn = save_set(g, LA);
1349 LA = set_find(g, sn);
1352 /* Add productions for this symbol */
1353 for (p2 = s->first_production;
1354 p2 < g->production_count &&
1355 g->productions[p2]->head == s;
1357 int itm = item_num(p2, 0);
1358 int pos = symset_find(&is->items, itm);
1360 symset_add(&is->items, itm, sn);
1361 /* Will have re-ordered, so start
1362 * from beginning again */
1364 } else if (type >= LALR) {
1365 struct symset ss = set_find(g, is->items.data[pos]);
1366 struct symset tmp = INIT_SYMSET;
1367 struct symset *la = &LA;
1369 symset_union(&tmp, &ss);
1370 if (symset_union(&tmp, la)) {
1371 is->items.data[pos] = save_set(g, tmp);
1379 For each symbol we found (and placed in `done`) we collect all the items
1380 for which this symbol is next, and create a new itemset with "DOT"
1381 advanced over the symbol. This is then added to the collection of
1382 itemsets (or merged with a pre-existing itemset).
1384 ###### derive itemsets
1385 // Now we have a completed itemset, so we need to
1386 // compute all the 'next' itemsets and create them
1387 // if they don't exist.
1388 for (i = 0; i < done.cnt; i++) {
1390 unsigned short state;
1391 struct symbol *sym = g->symtab[done.syms[i]];
1392 enum assoc assoc = Non;
1393 unsigned short precedence = 0;
1394 struct symset newitemset = INIT_SYMSET;
1396 newitemset = INIT_DATASET;
1398 for (j = 0; j < is->items.cnt; j++) {
1399 int itm = is->items.syms[j];
1400 int p = item_prod(itm);
1401 int bp = item_dot(itm);
1402 struct production *pr = g->productions[p];
1403 unsigned short la = 0;
1406 if (bp == pr->body_size)
1408 if (pr->body[bp] != sym)
1413 la = is->items.data[j];
1414 if (bp == pr->body_size &&
1415 pr->precedence > 0 &&
1416 pr->precedence > precedence) {
1417 // new itemset is reducible and has a precedence.
1418 precedence = pr->precedence;
1421 pos = symset_find(&newitemset, item_num(p, bp));
1424 symset_add(&newitemset, item_num(p, bp), la);
1425 else if (type >= LALR) {
1426 // Need to merge la set.
1427 int la2 = newitemset.data[pos];
1429 struct symset ss = set_find(g, la2);
1430 struct symset LA = INIT_SYMSET;
1431 symset_union(&LA, &ss);
1432 ss = set_find(g, la);
1433 if (symset_union(&LA, &ss))
1434 newitemset.data[pos] = save_set(g, LA);
1440 state = add_itemset(g, newitemset, assoc, precedence, type);
1441 if (symset_find(&is->go_to, done.syms[i]) < 0)
1442 symset_add(&is->go_to, done.syms[i], state);
1445 All that is left is to create the initial itemset from production zero,
1446 with `TK_eof` as the LA set.
1449 static void build_itemsets(struct grammar *g, enum grammar_type type)
1451 struct symset first = INIT_SYMSET;
1454 unsigned short la = 0;
1456 // LA set just has eof
1457 struct symset eof = INIT_SYMSET;
1458 symset_add(&eof, TK_eof, 0);
1459 la = save_set(g, eof);
1460 first = INIT_DATASET;
1462 // production 0, offset 0 (with no data)
1463 symset_add(&first, item_num(0, 0), la);
1464 add_itemset(g, first, Non, 0, type);
1465 for (check_again = 0, is = g->items;
1467 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1469 struct symset done = INIT_SYMSET;
1480 ### Completing the analysis.
1482 The exact process of analysis depends on which level was requested. For
1483 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1484 `LR1` we need first, but not follow. For `SLR` we need both.
1486 We don't build the "action" tables that you might expect as the parser
1487 doesn't use them. They will be calculated to some extent if needed for
1490 Once we have built everything we allocate arrays for the two lists:
1491 symbols and itemsets. This allows more efficient access during
1492 reporting. The symbols are grouped as terminals, then non-terminals,
1493 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1494 Special terminals -- meaning just EOL -- are included with the
1495 non-terminals so that they are not expected by the scanner.
1497 ###### grammar fields
1498 struct symbol **symtab;
1499 struct itemset **statetab;
1502 ###### grammar_analyse
1504 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1508 int snum = TK_reserved;
1509 for (s = g->syms; s; s = s->next)
1510 if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1514 g->first_nonterm = snum;
1515 for (s = g->syms; s; s = s->next)
1516 if (s->num < 0 && s->type != Virtual) {
1520 for (s = g->syms; s; s = s->next)
1526 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1527 for (s = g->syms; s; s = s->next)
1528 g->symtab[s->num] = s;
1537 build_itemsets(g, type);
1539 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1540 for (is = g->items; is ; is = is->next)
1541 g->statetab[is->state] = is;
1544 ## Reporting on the Grammar
1546 The purpose of the report is to give the grammar developer insight into
1547 how the grammar parser will work. It is basically a structured dump of
1548 all the tables that have been generated, plus a description of any conflicts.
1550 ###### grammar_report
1551 static int grammar_report(struct grammar *g, enum grammar_type type)
1557 return report_conflicts(g, type);
1560 Firstly we have the complete list of symbols, together with the
1561 "FIRST" set if that was generated. We add a mark to each symbol to
1562 show if it is nullable (`.`).
1566 static void report_symbols(struct grammar *g)
1570 printf("SYMBOLS + FIRST:\n");
1572 printf("SYMBOLS:\n");
1574 for (n = 0; n < g->num_syms; n++) {
1575 struct symbol *s = g->symtab[n];
1579 printf(" %c%3d%c: ",
1580 s->nullable ? '.':' ',
1581 s->num, symtypes[s->type]);
1584 printf(" (%d%s)", s->precedence,
1585 assoc_names[s->assoc]);
1587 if (g->first && s->type == Nonterminal) {
1590 for (j = 0; j < g->first[n].cnt; j++) {
1593 prtxt(g->symtab[g->first[n].syms[j]]->name);
1600 Then we have the follow sets if they were computed.
1602 static void report_follow(struct grammar *g)
1605 printf("FOLLOW:\n");
1606 for (n = 0; n < g->num_syms; n++)
1607 if (g->follow[n].cnt) {
1611 prtxt(g->symtab[n]->name);
1612 for (j = 0; j < g->follow[n].cnt; j++) {
1615 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1621 And finally the item sets. These include the GO TO tables and, for
1622 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1623 it up a bit. First the items, with production number and associativity.
1625 static void report_item(struct grammar *g, int itm)
1627 int p = item_prod(itm);
1628 int dot = item_dot(itm);
1629 struct production *pr = g->productions[p];
1633 prtxt(pr->head->name);
1635 for (i = 0; i < pr->body_size; i++) {
1636 printf(" %s", (dot == i ? ". ": ""));
1637 prtxt(pr->body[i]->name);
1639 if (dot == pr->body_size)
1642 if (pr->precedence && dot == pr->body_size)
1643 printf(" (%d%s)", pr->precedence,
1644 assoc_names[pr->assoc]);
1645 if (dot < pr->body_size &&
1646 pr->body[dot]->precedence) {
1647 struct symbol *s = pr->body[dot];
1648 printf(" [%d%s]", s->precedence,
1649 assoc_names[s->assoc]);
1654 The LA sets which are (possibly) reported with each item:
1656 static void report_la(struct grammar *g, int lanum)
1658 struct symset la = set_find(g, lanum);
1662 printf(" LOOK AHEAD(%d)", lanum);
1663 for (i = 0; i < la.cnt; i++) {
1666 prtxt(g->symtab[la.syms[i]]->name);
1671 Then the go to sets:
1673 static void report_goto(struct grammar *g, struct symset gt)
1678 for (i = 0; i < gt.cnt; i++) {
1680 prtxt(g->symtab[gt.syms[i]]->name);
1681 printf(" -> %d\n", gt.data[i]);
1685 Now we can report all the item sets complete with items, LA sets, and GO TO.
1687 static void report_itemsets(struct grammar *g)
1690 printf("ITEM SETS(%d)\n", g->states);
1691 for (s = 0; s < g->states; s++) {
1693 struct itemset *is = g->statetab[s];
1694 printf(" Itemset %d:", s);
1696 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1698 for (j = 0; j < is->items.cnt; j++) {
1699 report_item(g, is->items.syms[j]);
1700 if (is->items.data != NO_DATA)
1701 report_la(g, is->items.data[j]);
1703 report_goto(g, is->go_to);
1707 ### Reporting conflicts
1709 Conflict detection varies a lot among different analysis levels. However
1710 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1711 LR1 are also similar as they have FOLLOW or LA sets.
1715 ## conflict functions
1717 static int report_conflicts(struct grammar *g, enum grammar_type type)
1720 printf("Conflicts:\n");
1722 cnt = conflicts_lr0(g, type);
1724 cnt = conflicts_slr(g, type);
1726 printf(" - no conflicts\n");
1730 LR0 conflicts occur in any state which has both a reducible item and
1731 a shiftable item, or two reducible items.
1733 LR05 conflicts only occur if two possible reductions exist,
1734 as shifts always over-ride reductions.
1736 ###### conflict functions
1737 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1741 for (i = 0; i < g->states; i++) {
1742 struct itemset *is = g->statetab[i];
1743 int last_reduce = -1;
1744 int prev_reduce = -1;
1745 int last_shift = -1;
1749 for (j = 0; j < is->items.cnt; j++) {
1750 int itm = is->items.syms[j];
1751 int p = item_prod(itm);
1752 int bp = item_dot(itm);
1753 struct production *pr = g->productions[p];
1755 if (bp == pr->body_size) {
1756 prev_reduce = last_reduce;
1760 if (pr->body[bp]->type == Terminal)
1763 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1764 printf(" State %d has both SHIFT and REDUCE:\n", i);
1765 report_item(g, is->items.syms[last_shift]);
1766 report_item(g, is->items.syms[last_reduce]);
1769 if (prev_reduce >= 0) {
1770 printf(" State %d has 2 (or more) reducible items\n", i);
1771 report_item(g, is->items.syms[prev_reduce]);
1772 report_item(g, is->items.syms[last_reduce]);
1779 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1780 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1781 in the source of the look ahead set.
1783 We build two datasets to reflect the "action" table: one which maps
1784 terminals to items where that terminal could be shifted and another
1785 which maps terminals to items that could be reduced when the terminal
1786 is in look-ahead. We report when we get conflicts between the two.
1788 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1793 for (i = 0; i < g->states; i++) {
1794 struct itemset *is = g->statetab[i];
1795 struct symset shifts = INIT_DATASET;
1796 struct symset reduce = INIT_DATASET;
1800 /* First collect the shifts */
1801 for (j = 0; j < is->items.cnt; j++) {
1802 unsigned short itm = is->items.syms[j];
1803 int p = item_prod(itm);
1804 int bp = item_dot(itm);
1805 struct production *pr = g->productions[p];
1808 if (bp >= pr->body_size ||
1809 pr->body[bp]->type != Terminal)
1814 if (s->precedence && is->precedence)
1815 /* Precedence resolves this, so no conflict */
1818 if (symset_find(&shifts, s->num) < 0)
1819 symset_add(&shifts, s->num, itm);
1821 /* Now look for reductions and conflicts */
1822 for (j = 0; j < is->items.cnt; j++) {
1823 unsigned short itm = is->items.syms[j];
1824 int p = item_prod(itm);
1825 int bp = item_dot(itm);
1826 struct production *pr = g->productions[p];
1828 if (bp < pr->body_size)
1833 la = g->follow[pr->head->num];
1835 la = set_find(g, is->items.data[j]);
1837 for (k = 0; k < la.cnt; k++) {
1838 int pos = symset_find(&shifts, la.syms[k]);
1840 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1842 prtxt(g->symtab[la.syms[k]]->name);
1844 report_item(g, shifts.data[pos]);
1845 report_item(g, itm);
1847 pos = symset_find(&reduce, la.syms[k]);
1849 symset_add(&reduce, la.syms[k], itm);
1852 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1853 prtxt(g->symtab[la.syms[k]]->name);
1855 report_item(g, itm);
1856 report_item(g, reduce.data[pos]);
1860 symset_free(shifts);
1861 symset_free(reduce);
1867 ## Generating the parser
1869 The exported part of the parser is the `parse_XX` function, where the name
1870 `XX` is based on the name of the parser files.
1872 This takes a `code_node`, a partially initialized `token_config`, and an
1873 optional `FILE` to send tracing to. The `token_config` gets the list of
1874 known words added and then is used with the `code_node` to initialize the
1877 `parse_XX` then calls the library function `parser_run` to actually complete
1878 the parse. This needs the `states` table and functions to call the various
1879 pieces of code provided in the grammar file, so they are generated first.
1881 ###### parser_generate
1883 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1884 struct code_node *pre_reduce)
1890 gen_reduce(f, g, file, pre_reduce);
1893 fprintf(f, "#line 0 \"gen_parser\"\n");
1894 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1897 fprintf(f, "\tstruct token_state *tokens;\n");
1898 fprintf(f, "\tconfig->words_marks = known;\n");
1899 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1900 fprintf(f, "\ttokens = token_open(code, config);\n");
1901 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1902 fprintf(f, "\ttoken_close(tokens);\n");
1903 fprintf(f, "\treturn rv;\n");
1904 fprintf(f, "}\n\n");
1907 ### Known words table
1909 The known words table is simply an array of terminal symbols.
1910 The table of nonterminals used for tracing is a similar array.
1914 static void gen_known(FILE *f, struct grammar *g)
1917 fprintf(f, "#line 0 \"gen_known\"\n");
1918 fprintf(f, "static const char *known[] = {\n");
1919 for (i = TK_reserved;
1920 i < g->num_syms && g->symtab[i]->type == Terminal;
1922 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1923 g->symtab[i]->name.txt);
1924 fprintf(f, "};\n\n");
1927 static void gen_non_term(FILE *f, struct grammar *g)
1930 fprintf(f, "#line 0 \"gen_non_term\"\n");
1931 fprintf(f, "static const char *non_term[] = {\n");
1932 for (i = TK_reserved;
1935 if (g->symtab[i]->type == Nonterminal ||
1936 g->symtab[i]->isspecial)
1937 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1938 g->symtab[i]->name.txt);
1939 fprintf(f, "};\n\n");
1942 ### States and the goto tables.
1944 For each state we record the goto table and details of the reducible
1945 production if there is one.
1946 Some of the details of the reducible production are stored in the
1947 `do_reduce` function to come later. Here we store the production
1948 number, the body size (useful for stack management), and the resulting
1949 symbol (useful for knowing how to free data later).
1951 The go to table is stored in a simple array of `sym` and corresponding
1954 ###### exported types
1962 const struct lookup * go_to;
1971 static void gen_goto(FILE *f, struct grammar *g)
1974 fprintf(f, "#line 0 \"gen_goto\"\n");
1975 for (i = 0; i < g->states; i++) {
1976 struct symset gt = g->statetab[i]->go_to;
1981 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1983 for (j = 0; j < gt.cnt; j++)
1984 fprintf(f, "\t{ %d, %d },\n",
1985 gt.syms[j], gt.data[j]);
1990 static void gen_states(FILE *f, struct grammar *g)
1993 fprintf(f, "#line 0 \"gen_states\"\n");
1994 fprintf(f, "static const struct state states[] = {\n");
1995 for (i = 0; i < g->states; i++) {
1996 struct itemset *is = g->statetab[i];
1997 int j, prod = -1, prod_len;
1999 for (j = 0; j < is->items.cnt; j++) {
2000 int itm = is->items.syms[j];
2001 int p = item_prod(itm);
2002 int bp = item_dot(itm);
2003 struct production *pr = g->productions[p];
2005 if (bp < pr->body_size)
2007 /* This is what we reduce - choose longest */
2008 if (prod < 0 || prod_len < pr->body_size) {
2010 prod_len = pr->body_size;
2014 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2015 i, is->go_to.cnt, i);
2017 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2019 struct production *pr = g->productions[prod];
2020 struct symbol *hd = pr->head;
2021 fprintf(f, "%d, %d, %d, ",
2022 prod, pr->body_size, pr->head->num);
2023 if (hd->struct_name.txt == NULL)
2024 fprintf(f, "0 },\n");
2026 fprintf(f, "sizeof(struct %.*s%s) },\n",
2027 hd->struct_name.len,
2028 hd->struct_name.txt,
2029 hd->isref ? "*" : "");
2031 fprintf(f, "-1, -1, -1, -1 },\n");
2033 fprintf(f, "};\n\n");
2036 ### The `do_reduce` function and the code
2038 When the parser engine decides to reduce a production, it calls
2039 `do_reduce` which runs the code that was included with the production,
2042 This code needs to be able to store data somewhere. Rather than
2043 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2044 buffer and have `do_reduce` return the size to be saved.
2046 In order for the code to access "global" context, we pass in the
2047 "config" pointer that was passed to the parser function. If the `struct
2048 token_config` is embedded in some larger structure, the reducing code
2049 can access the larger structure using pointer manipulation. Performing
2050 that pointer manipulation, and any other common code or variables for
2051 all reduce actions, can be provided in a code node called "reduce" which
2052 is passed around in `pre_reduce` which you might have already noticed.
2054 The code fragments require translation when written out. Any `$N` needs
2055 to be converted to a reference either to that buffer (if `$0`) or to the
2056 structure returned by a previous reduction. These pointers need to be
2057 cast to the appropriate type for each access. All this is handled in
2060 `gen_code` also allows symbol references to contain a '`<`' as in
2061 '`$<2`'. This is particularly useful for references (or pointers), but
2062 can be used with structures too. The `<` implies that the value is
2063 being moved out, so the object will not be automatically freed. It is
2064 equivalent to assigning `NULL` to the pointer or filling a structure
2067 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2068 and possibly a number. A number by itself (other than zero) selects a
2069 symbol from the body of the production. A sequence of letters selects
2070 the shortest symbol in the body which contains those letters in the
2071 given order. If a number follows the letters, then a later occurrence
2072 of that symbol is chosen. So "`$AB2`" will refer to the structure
2073 attached to the second occurrence of the shortest symbol which contains
2074 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2075 the number given is too large, then the symbol reference is not
2076 transformed, and will cause an error when the code is compiled.
2080 static int textchr(struct text t, char c, int s)
2083 for (i = s; i < t.len; i++)
2089 static int subseq_match(char *seq, int slen, struct text name)
2093 st = textchr(name, *seq, st);
2103 static int choose_sym(char **namep, int len, struct production *p)
2105 char *name = *namep;
2114 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2120 while (len > 0 && (c >= '0' && c <= '9')) {
2123 n = n * 10 + (c - '0');
2127 if (name == *namep || n > p->body_size)
2133 for (i = 0; i < p->body_size; i++) {
2134 if (!subseq_match(nam, namlen, p->body[i]->name))
2136 if (slen == 0 || p->body[i]->name.len < slen) {
2138 slen = p->body[i]->name.len;
2140 if (s >= 0 && p->body[i] != p->body[s] &&
2141 p->body[i]->name.len == p->body[s]->name.len)
2142 /* not unique, so s cannot be used */
2149 for (i = 0; i < p->body_size; i++)
2150 if (p->body[i] == p->body[s]) {
2155 if (n > 0 || i == p->body_size)
2161 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2164 char *used = calloc(1, p->body_size);
2167 fprintf(f, "\t\t\t");
2168 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2182 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2191 fprintf(f, "(*(struct %.*s*%s)ret)",
2192 p->head->struct_name.len,
2193 p->head->struct_name.txt,
2194 p->head->isref ? "*":"");
2195 else if (p->body[n-1]->type == Terminal)
2196 fprintf(f, "(*(struct token *)body[%d])",
2198 else if (p->body[n-1]->struct_name.txt == NULL)
2199 fprintf(f, "$%d", n);
2201 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2202 p->body[n-1]->struct_name.len,
2203 p->body[n-1]->struct_name.txt,
2204 p->body[n-1]->isref ? "*":"", n-1);
2210 for (i = 0; i < p->body_size; i++) {
2211 if (p->body[i]->struct_name.txt &&
2213 // assume this has been copied out
2214 if (p->body[i]->isref)
2215 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2217 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2225 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2226 struct code_node *pre_reduce)
2229 fprintf(f, "#line 1 \"gen_reduce\"\n");
2230 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2232 fprintf(f, "\tint ret_size = 0;\n");
2234 code_node_print(f, pre_reduce, file);
2236 fprintf(f, "#line 4 \"gen_reduce\"\n");
2237 fprintf(f, "\tswitch(prod) {\n");
2238 for (i = 0; i < g->production_count; i++) {
2239 struct production *p = g->productions[i];
2240 fprintf(f, "\tcase %d:\n", i);
2243 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2247 if (p->head->struct_name.txt)
2248 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2249 p->head->struct_name.len,
2250 p->head->struct_name.txt,
2251 p->head->isref ? "*":"");
2253 fprintf(f, "\t\tbreak;\n");
2255 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2260 As each non-terminal can potentially cause a different type of data
2261 structure to be allocated and filled in, we need to be able to free it when
2264 It is particularly important to have fine control over freeing during error
2265 recovery where individual stack frames might need to be freed.
2267 For this, the grammar author is required to define a `free_XX` function for
2268 each structure that is used by a non-terminal. `do_free` will call whichever
2269 is appropriate given a symbol number, and will call `free` (as is
2270 appropriate for tokens) on any terminal symbol.
2274 static void gen_free(FILE *f, struct grammar *g)
2278 fprintf(f, "#line 0 \"gen_free\"\n");
2279 fprintf(f, "static void do_free(short sym, void *asn)\n");
2281 fprintf(f, "\tif (!asn) return;\n");
2282 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2283 /* Need to handle special terminals too */
2284 for (i = 0; i < g->num_syms; i++) {
2285 struct symbol *s = g->symtab[i];
2286 if (i >= g->first_nonterm && s->type == Terminal &&
2288 fprintf(f, " || sym == %d", s->num);
2290 fprintf(f, ") {\n");
2291 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2292 fprintf(f, "\tswitch(sym) {\n");
2294 for (i = 0; i < g->num_syms; i++) {
2295 struct symbol *s = g->symtab[i];
2297 s->type != Nonterminal ||
2298 s->struct_name.txt == NULL)
2301 fprintf(f, "\tcase %d:\n", s->num);
2303 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2305 s->struct_name.txt);
2306 fprintf(f, "\t\tfree(asn);\n");
2308 fprintf(f, "\t\tfree_%.*s(asn);\n",
2310 s->struct_name.txt);
2311 fprintf(f, "\t\tbreak;\n");
2313 fprintf(f, "\t}\n}\n\n");
2316 ## The main routine.
2318 There are three key parts to the "main" function of parsergen: processing
2319 the arguments, loading the grammar file, and dealing with the grammar.
2321 ### Argument processing.
2323 Command line options allow the selection of analysis mode, name of output
2324 file, and whether or not a report should be generated. By default we create
2325 a report only if no code output was requested.
2327 The `parse_XX` function name uses the basename of the output file which
2328 should not have a suffix (`.c`). `.c` is added to the given name for the
2329 code, and `.h` is added for the header (if header text is specified in the
2336 static const struct option long_options[] = {
2337 { "LR0", 0, NULL, '0' },
2338 { "LR05", 0, NULL, '5' },
2339 { "SLR", 0, NULL, 'S' },
2340 { "LALR", 0, NULL, 'L' },
2341 { "LR1", 0, NULL, '1' },
2342 { "tag", 1, NULL, 't' },
2343 { "report", 0, NULL, 'R' },
2344 { "output", 1, NULL, 'o' },
2345 { NULL, 0, NULL, 0 }
2347 const char *options = "05SL1t:Ro:";
2349 ###### process arguments
2351 char *outfile = NULL;
2356 enum grammar_type type = LR05;
2357 while ((opt = getopt_long(argc, argv, options,
2358 long_options, NULL)) != -1) {
2373 outfile = optarg; break;
2375 tag = optarg; break;
2377 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2382 infile = argv[optind++];
2384 fprintf(stderr, "No input file given\n");
2387 if (outfile && report == 1)
2390 if (name && strchr(name, '/'))
2391 name = strrchr(name, '/')+1;
2393 if (optind < argc) {
2394 fprintf(stderr, "Excess command line arguments\n");
2398 ### Loading the grammar file
2400 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2403 Once we have extracted the code (with `mdcode`) we expect to find three
2404 or four sections: header, code, grammar, reduce.
2405 Anything else that is not excluded by the `--tag` option is an error.
2407 "header", "code", and "reduce" are optional, though it is hard to build
2408 a working parser without the first two. "grammar" must be provided.
2412 #include <sys/mman.h>
2417 static void pr_err(char *msg)
2420 fprintf(stderr, "%s\n", msg);
2424 struct section *table;
2428 fd = open(infile, O_RDONLY);
2430 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2431 infile, strerror(errno));
2434 len = lseek(fd, 0, 2);
2435 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2436 table = code_extract(file, file+len, pr_err);
2438 struct code_node *hdr = NULL;
2439 struct code_node *code = NULL;
2440 struct code_node *gram = NULL;
2441 struct code_node *pre_reduce = NULL;
2442 for (s = table; s; s = s->next) {
2443 struct text sec = s->section;
2444 if (tag && !strip_tag(&sec, tag))
2446 if (text_is(sec, "header"))
2448 else if (text_is(sec, "code"))
2450 else if (text_is(sec, "grammar"))
2452 else if (text_is(sec, "reduce"))
2453 pre_reduce = s->code;
2455 fprintf(stderr, "Unknown content section: %.*s\n",
2456 s->section.len, s->section.txt);
2461 ### Processing the input
2463 As we need to append an extension to a filename and then open it for
2464 writing, and we need to do this twice, it helps to have a separate function.
2468 static FILE *open_ext(char *base, char *ext)
2470 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2472 strcat(strcpy(fn, base), ext);
2478 If we can read the grammar, then we analyse and optionally report on it. If
2479 the report finds conflicts we will exit with an error status.
2481 ###### process input
2482 struct grammar *g = NULL;
2484 fprintf(stderr, "No grammar section provided\n");
2487 g = grammar_read(gram);
2489 fprintf(stderr, "Failure to parse grammar\n");
2494 grammar_analyse(g, type);
2496 if (grammar_report(g, type))
2500 If a "header" section is defined, we write it out and include a declaration
2501 for the `parse_XX` function so it can be used from separate code.
2503 if (rv == 0 && hdr && outfile) {
2504 FILE *f = open_ext(outfile, ".h");
2506 code_node_print(f, hdr, infile);
2507 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2511 fprintf(stderr, "Cannot create %s.h\n",
2517 And if all goes well and an output file was provided, we create the `.c`
2518 file with the code section (if any) and the parser tables and function.
2520 if (rv == 0 && outfile) {
2521 FILE *f = open_ext(outfile, ".c");
2524 code_node_print(f, code, infile);
2525 gen_parser(f, g, infile, name, pre_reduce);
2528 fprintf(stderr, "Cannot create %s.c\n",
2534 And that about wraps it up. We need to set the locale so that UTF-8 is
2535 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2537 ###### File: parsergen.mk
2538 parsergen : parsergen.o libscanner.o libmdcode.o
2539 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2546 int main(int argc, char *argv[])
2551 setlocale(LC_ALL,"");
2553 ## process arguments
2560 ## The SHIFT/REDUCE parser
2562 Having analysed the grammar and generated all the tables, we only need
2563 the shift/reduce engine to bring it all together.
2565 ### Goto table lookup
2567 The parser generator has nicely provided us with goto tables sorted by
2568 symbol number. We need a binary search function to find a symbol in the
2571 ###### parser functions
2573 static int search(const struct state *l, int sym)
2576 int hi = l->go_to_cnt;
2580 while (lo + 1 < hi) {
2581 int mid = (lo + hi) / 2;
2582 if (l->go_to[mid].sym <= sym)
2587 if (l->go_to[lo].sym == sym)
2588 return l->go_to[lo].state;
2593 ### Memory allocation
2595 The `scanner` returns tokens in a local variable - we want them in allocated
2596 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2597 make an allocated copy as required.
2599 ###### parser includes
2602 ###### parser functions
2604 static struct token *tok_copy(struct token tk)
2606 struct token *new = malloc(sizeof(*new));
2611 ### The state stack.
2613 The core data structure for the parser is the stack. This tracks all
2614 the symbols that have been recognised or partially recognised.
2616 The stack usually won't grow very large - maybe a few tens of entries.
2617 So we dynamically grow an array as required but never bother to
2618 shrink it down again.
2620 We keep the stack as two separate allocations. One, `asn_stack` stores
2621 the "abstract syntax nodes" which are created by each reduction. When
2622 we call `do_reduce` we need to pass an array of the `asn`s of the body
2623 of the production, and by keeping a separate `asn` stack, we can just
2624 pass a pointer into this stack.
2626 The other allocation stores all other stack fields of which there are
2627 several. The `state` is the most important one and guides the parsing
2628 process. The `sym` is nearly unnecessary as it is implicit in the
2629 state. However when we want to free entries from the `asn_stack`, it
2630 helps to know what type they are so we can call the right freeing
2631 function. The symbol leads us to the right free function through
2634 The stack is most properly seen as alternating states and symbols -
2635 states, like the 'DOT' in items, are between symbols. Each frame in
2636 our stack holds a state and the symbol that was before it. The
2637 bottom of stack holds the start state but no symbol, as nothing came
2638 before the beginning. As we need to store some value, `TK_eof` is used
2639 to mark the beginning of the file as well as the end.
2641 ###### parser functions
2655 Two operations are needed on the stack - shift (which is like push) and pop.
2657 Shift applies not only to terminals but also to non-terminals. When we
2658 reduce a production we will pop off frames corresponding to the body
2659 symbols, then push on a frame for the head of the production. This last
2660 is exactly the same process as shifting in a terminal so we use the same
2661 function for both. In both cases we provide the symbol. The state is
2662 deduced from the current top-of-stack state and the new symbol.
2664 To simplify other code we arrange for `shift` to fail if there is no `goto`
2665 state for the symbol. This is useful in basic parsing due to our design
2666 that we shift when we can, and reduce when we cannot. So the `shift`
2667 function reports if it could.
2669 `shift` is also used to push state zero onto the stack, so if the
2670 stack is empty, it always chooses zero as the next state.
2672 So `shift` finds the next state. If that succeeds it extends the
2673 allocations if needed and pushes all the information onto the stacks.
2675 ###### parser functions
2677 static int shift(struct parser *p,
2678 short sym, void *asn,
2679 const struct state states[])
2681 struct frame next = {0};
2682 int newstate = p->tos
2683 ? search(&states[p->stack[p->tos-1].state],
2688 if (p->tos >= p->stack_size) {
2689 p->stack_size += 10;
2690 p->stack = realloc(p->stack, p->stack_size
2691 * sizeof(p->stack[0]));
2692 p->asn_stack = realloc(p->asn_stack, p->stack_size
2693 * sizeof(p->asn_stack[0]));
2696 next.state = newstate;
2698 p->stack[p->tos] = next;
2699 p->asn_stack[p->tos] = asn;
2704 `pop` primarily moves the top of stack (`tos`) back down the required
2705 amount and frees any `asn` entries that need to be freed. It is called
2706 _after_ we reduce a production, just before we `shift` the nonterminal
2709 ###### parser functions
2711 static void pop(struct parser *p, int num,
2712 void(*do_free)(short sym, void *asn))
2717 for (i = 0; i < num; i++)
2718 do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2721 ### The heart of the parser.
2723 Now we have the parser. For each token we might shift it, trigger a
2724 reduction, or start error handling. 2D tokens (IN, OUT, EOL) also need
2727 We return whatever `asn` was returned by reducing production zero.
2729 When we find `TK_in` and `TK_out` tokens which report indents we need
2730 to handle them directly as the grammar cannot express what we want to
2733 `TK_in` tokens are easy: we simply update the indent count in the top stack frame to
2734 record how many indents there are following the previous token.
2736 `TK_out` tokens must be canceled against an indent count
2737 within the stack. If we can reduce some symbols that are all since
2738 the most recent indent, then we do that first. If the minimum prefix
2739 of the current state then extends back before the most recent indent,
2740 that indent can be cancelled. If the minimum prefix is shorter, then
2741 the indent had ended prematurely and we must start error handling, which
2742 is still a work-in-progress.
2744 `TK_newline` tokens are ignored unless the top stack frame records
2745 that they are permitted. In that case they will not be considered for
2746 shifting if it is possible to reduce some symbols that are all since
2747 the most recent start of line. This is how a newline forcibly
2748 terminates any line-like structure - we try to reduce down to at most
2749 one symbol for each line where newlines are allowed.
2750 A consequence of this is that a rule like
2752 ###### Example: newlines - broken
2756 IfStatement -> Newlines if ....
2758 cannot work, as the NEWLINE will never be shifted as the empty string
2759 will be reduced first. Optional sets of newlines need to be included
2760 in the thing that precedes:
2762 ###### Example: newlines - works
2766 IfStatement -> If ....
2768 Here the NEWLINE will be shifted because nothing can be reduced until
2771 We have already discussed the bulk of the handling of a "reduce" action,
2772 with the `pop()` and `shift()` functions doing much of the work. There
2773 is a little more complexity needed to manage storage for the asn (Abstract
2774 Syntax Node), and also a test of whether the reduction is permitted.
2776 When we try to shift the result of reducing production-zero, it will
2777 fail because there is no next state. In this case the asn will not have
2778 been stored on the stack, so it gets stored in the `ret` variable, and we
2779 report that the input has been accepted.
2787 if (states[tos->state].reduce_prod >= 0) {
2790 const struct state *nextstate = &states[tos->state];
2791 int prod = nextstate->reduce_prod;
2792 int size = nextstate->reduce_size;
2793 int res_size = nextstate->result_size;
2795 body = p.asn_stack + (p.tos - size);
2796 res = res_size ? calloc(1, res_size) : NULL;
2797 res_size = do_reduce(prod, body, config, res);
2798 if (res_size != nextstate->result_size)
2800 pop(&p, size, do_free);
2801 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2804 parser_trace_action(trace, "Accept");
2806 parser_trace_action(trace, "Reduce");
2810 If we can neither shift nor reduce we have an error to handle. There
2811 are two possible responses to an error: we can pop single frames off the
2812 stack until we find a frame that can shift `TK_error`, or we can discard
2813 the current look-ahead symbol.
2815 When we first see an error we do the first of these and set a flag to
2816 record that we are processing an error. If the normal shift/reduce
2817 tests fail to find that anything can be done from that state, we start
2818 dropping tokens until either we manage to shift one, or reach end-of-file.
2831 parser_trace_action(trace, "DISCARD");
2832 if (tk->num == TK_eof)
2837 struct token *err_tk;
2839 parser_trace_action(trace, "ERROR");
2841 err_tk = tok_copy(*tk);
2842 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2843 // discard this state
2844 pop(&p, 1, do_free);
2847 // no state accepted TK_error
2853 ###### parser includes
2858 void *parser_run(struct token_state *tokens,
2859 const struct state states[],
2860 int (*do_reduce)(int, void**, struct token_config*, void*),
2861 void (*do_free)(short, void*),
2862 FILE *trace, const char *non_term[],
2863 struct token_config *config)
2865 struct parser p = { 0 };
2866 struct token *tk = NULL;
2870 shift(&p, TK_eof, NULL, states);
2871 while (!accepted && p.tos > 0) {
2872 struct frame *tos = &p.stack[p.tos-1];
2874 tk = tok_copy(token_next(tokens));
2875 parser_trace(trace, &p,
2876 tk, states, non_term, config->known_count);
2878 if (tk->num == TK_in) {
2881 parser_trace_action(trace, "Record");
2884 if (tk->num == TK_out) {
2890 parser_trace_action(trace, "Cancel");
2893 // fall through to error handling as both SHIFT and REDUCE
2896 if (tk->num == TK_newline) {
2900 parser_trace_action(trace, "Discard");
2904 if (shift(&p, tk->num, tk, states)) {
2906 parser_trace_action(trace, "Shift");
2915 pop(&p, p.tos, do_free);
2921 ###### exported functions
2922 void *parser_run(struct token_state *tokens,
2923 const struct state states[],
2924 int (*do_reduce)(int, void**, struct token_config*, void*),
2925 void (*do_free)(short, void*),
2926 FILE *trace, const char *non_term[],
2927 struct token_config *config);
2931 Being able to visualize the parser in action can be invaluable when
2932 debugging the parser code, or trying to understand how the parse of a
2933 particular grammar progresses. The stack contains all the important
2934 state, so just printing out the stack every time around the parse loop
2935 can make it possible to see exactly what is happening.
2937 This doesn't explicitly show each SHIFT and REDUCE action. However they
2938 are easily deduced from the change between consecutive lines, and the
2939 details of each state can be found by cross referencing the states list
2940 in the stack with the "report" that parsergen can generate.
2942 For terminal symbols, we just dump the token. For non-terminals we
2943 print the name of the symbol. The look ahead token is reported at the
2944 end inside square brackets.
2946 ###### parser functions
2948 static char *reserved_words[] = {
2949 [TK_error] = "ERROR",
2952 [TK_newline] = "NEWLINE",
2956 void parser_trace(FILE *trace, struct parser *p,
2957 struct token *tk, const struct state states[],
2958 const char *non_term[], int knowns)
2963 for (i = 0; i < p->tos; i++) {
2964 struct frame *f = &p->stack[i];
2967 if (sym < TK_reserved &&
2968 reserved_words[sym] != NULL)
2969 fputs(reserved_words[sym], trace);
2970 else if (sym < TK_reserved + knowns) {
2971 struct token *t = p->asn_stack[i];
2972 text_dump(trace, t->txt, 20);
2974 fputs(non_term[sym - TK_reserved - knowns],
2978 fprintf(trace, "(%d) ", f->state);
2980 fprintf(trace, "[");
2981 if (tk->num < TK_reserved &&
2982 reserved_words[tk->num] != NULL)
2983 fputs(reserved_words[tk->num], trace);
2985 text_dump(trace, tk->txt, 20);
2986 fprintf(trace, ":%d:%d]", tk->line, tk->col);
2989 void parser_trace_action(FILE *trace, char *action)
2992 fprintf(trace, " - %s\n", action);
2997 The obvious example for a parser is a calculator.
2999 As `scanner` provides a number parsing function using `libgmp`, it is not much
3000 work to perform arbitrary rational number calculations.
3002 This calculator takes one expression, or an equality test, per line.
3003 The results are printed and if any equality test fails, the program
3004 exits with an error.
3006 ###### File: parsergen.mk
3007 calc.c calc.h : parsergen parsergen.mdc
3008 ./parsergen --tag calc -o calc parsergen.mdc
3009 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3010 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3012 ./calc parsergen.mdc
3017 #include "parse_number.h"
3018 // what do we use for a demo-grammar? A calculator of course.
3030 #include <sys/mman.h>
3036 #include "scanner.h"
3041 static void free_number(struct number *n)
3047 static int text_is(struct text t, char *s)
3049 return (strlen(s) == t.len &&
3050 strncmp(s, t.txt, t.len) == 0);
3053 int main(int argc, char *argv[])
3055 int fd = open(argv[1], O_RDONLY);
3056 int len = lseek(fd, 0, 2);
3057 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3058 struct section *table = code_extract(file, file+len, NULL);
3060 struct token_config config = {
3061 .ignored = (1 << TK_line_comment)
3064 .number_chars = ".,_+-",
3068 for (s = table; s; s = s->next)
3069 if (text_is(s->section, "example: input"))
3070 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3072 struct section *t = table->next;
3073 code_free(table->code);
3085 Session -> Session Line
3088 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3089 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3090 gmp_printf(" or as a decimal: %Fg\n", fl);
3094 | Expression = Expression NEWLINE ${
3095 if (mpq_equal($1.val, $3.val))
3096 gmp_printf("Both equal %Qd\n", $1.val);
3098 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3103 | NEWLINE ${ printf("Blank line\n"); }$
3104 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3107 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3108 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3109 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3110 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3111 | Expression // Expression ${ {
3114 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3115 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3116 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3117 mpz_tdiv_q(z0, z1, z2);
3118 mpq_set_z($0.val, z0);
3119 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3121 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3122 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3127 3.1415926535 - 355/113
3129 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3131 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1