1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
8 Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
9 be used to describe the grammar and the parser can selectively ignore
10 these where they aren't relevant.
12 There are several distinct sections.
14 1. `grammar_read` will read a grammar from a literate-code file and
15 store the productions, symbols, and code fragments.
16 2. `grammar_analyse` will take that grammar and build LR parsing
17 states and look-ahead tables.
18 3. `grammar_report` will present the details of the analysis
19 in a readable format and will report any conflicts.
20 4. `parser_generate` will write out C code files with various
21 tables and with the code fragments arranged in useful places.
22 5. `parser_run` is a library function intended to be linked together
23 with the generated parser tables to complete the implementation of
25 6. Finally `calc` is a test grammar for a simple calculator. The
26 `parsergen` program built from the C code in this file can extract
27 that grammar directly from this file and process it.
29 ###### File: parsergen.c
34 ## forward declarations
45 ###### File: libparser.c
52 ###### File: parsergen.mk
55 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
58 ## Reading the grammar
60 The grammar must be stored in a markdown literate code file as parsed by
61 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
62 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
63 present, will be literally copied into the generated `.c` and `.h`
64 files. The third is required and contains the grammar which is
65 tokenised with "[scanner][]". `reduce` can provide variable
66 declarations and common initialization code for all the reduction code
67 fragments in the grammar.
69 If the `--tag` option is given, then any top level header that doesn't
70 start with the tag is ignored, and the tag is stripped from the rest. So
71 `--tag Foo` means that the four sections expected are `Foo: header`,
72 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
73 to extract the sample calculator from this file.
76 [scanner]: scanner.html
82 ###### parser includes
86 The grammar contains production sets, precedence/associativity
87 declarations, general terminal declarations, and data type declarations.
88 These are all parsed with _ad hoc_ parsing as we don't have a parser
91 The precedence and associativity can be set for each production, and
92 can be inherited from symbols. The data type (either structure or a
93 reference to a structure) is potentially defined for each non-terminal
94 and describes what C structure is used to store information about each
98 enum assoc {Left, Right, Non};
99 char *assoc_names[] = {"Left","Right","Non"};
102 struct text struct_name;
105 unsigned short precedence;
109 unsigned short precedence;
117 The strings reported by `mdcode` and `scanner` are `struct text` which
118 have length rather than being nul terminated. To help with printing and
119 comparing we define `text_is` and `prtxt`, which should possibly go in
120 `mdcode`. `scanner` does provide `text_dump` which is useful for
121 strings which might contain control characters.
123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
124 because that is what we need to detect tags.
127 static int text_is(struct text t, char *s)
129 return (strlen(s) == t.len &&
130 strncmp(s, t.txt, t.len) == 0);
132 static void prtxt(struct text t)
134 printf("%.*s", t.len, t.txt);
137 static int strip_tag(struct text *t, char *tag)
139 int skip = strlen(tag) + 1;
140 if (skip >= t->len ||
141 strncmp(t->txt, tag, skip-1) != 0 ||
142 t->txt[skip-1] != ':')
144 while (skip < t->len && t->txt[skip] == ' ')
153 Productions are comprised primarily of symbols - terminal and
154 non-terminal. We do not make a syntactic distinction between the two,
155 though non-terminals must be identifiers while terminals can also be
156 marks. Non-terminal symbols are those which appear in the head of a
157 production, terminal symbols are those which don't. There are also
158 "virtual" symbols used for precedence marking discussed later, and
159 sometimes we won't know what type a symbol is yet.
161 To help with code safety it is possible to declare the terminal symbols.
162 If this is done, then any symbol used in a production that does not
163 appear in the head of any production and is not declared as a terminal
164 is treated as an error.
166 ###### forward declarations
167 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
168 char *symtypes = "UVTN";
171 ###### grammar fields
172 int terminals_declared;
174 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
175 table of known symbols and the resulting parser will report them as
176 `TK_reserved + N`. A small set of identifiers are reserved for the
177 different token types that `scanner` can report, and an even smaller set
178 are reserved for a special token that the parser can generate (`EOL`) as
179 will be described later. This latter set cannot use predefined numbers,
180 so they are marked as `isspecial` for now and will get assigned a number
181 with the non-terminals later.
185 static struct { int num; char *name; } reserved_words[] = {
186 { TK_error, "ERROR" },
187 { TK_number, "NUMBER" },
188 { TK_ident, "IDENTIFIER" },
190 { TK_string, "STRING" },
191 { TK_multi_string, "MULTI_STRING" },
194 { TK_newline, "NEWLINE" },
201 unsigned int isspecial:1;
203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
204 recognised. The former is automatically expected at the end of the text
205 being parsed. The latter are always ignored by the parser.
207 All of these symbols are stored in a simple symbol table. We use the
208 `struct text` from `mdcode` to store the name and link them together
209 into a sorted list using an insertion sort.
211 We don't have separate `find` and `insert` functions as any symbol we
212 find needs to be remembered. We simply expect `find` to always return a
213 symbol, but its type might be `Unknown`.
222 ###### grammar fields
227 static struct symbol *sym_find(struct grammar *g, struct text s)
229 struct symbol **l = &g->syms;
234 (cmp = text_cmp((*l)->name, s)) < 0)
238 n = calloc(1, sizeof(*n));
247 static void symbols_init(struct grammar *g)
249 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
251 for (i = 0; i < entries; i++) {
254 t.txt = reserved_words[i].name;
255 t.len = strlen(t.txt);
258 s->num = reserved_words[i].num;
263 ### Data types and precedence.
265 Data type specification, precedence specification, and declaration of
266 terminals are all introduced by a dollar sign at the start of the line.
267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
268 precedence, if it is `TERM` then the line declares terminals without
269 precedence, otherwise it specifies a data type.
271 The data type name is simply stored and applied to the head of all
272 subsequent productions. It must be the name of a structure optionally
273 preceded by an asterisk which means a reference or "pointer". So
274 `$expression` maps to `struct expression` and `$*statement` maps to
275 `struct statement *`.
277 Any productions given before the first data type declaration will have
278 no data type associated with them and can carry no information. In
279 order to allow other non-terminals to have no type, the data type
280 `$void` can be given. This does *not* mean that `struct void` will be
281 used, but rather that no type will be associated with subsequent
284 The precedence line must contain a list of symbols, either terminal
285 symbols or virtual symbols. It can only contain symbols that have not
286 been seen yet, so precedence declaration must precede the productions
289 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
290 carry precedence information but are not included in the grammar. A
291 production can declare that it inherits the precedence of a given
294 This use of `$$` precludes it from being used as a symbol in the
295 described language. Two other symbols: `${` and `}$` are also
298 Each new precedence line introduces a new precedence level and
299 declares how it associates. This level is stored in each symbol
300 listed and may be inherited by any production which uses the symbol. A
301 production inherits from the last symbol which has a precedence.
303 The symbols on the first precedence line have the lowest precedence.
304 Subsequent lines introduce symbols with higher precedence and so bind
307 Note that a declaration line (or "dollar line") can start with either of
308 two different marks: "$" or "$*". The `dollar_line()` function defined
309 here is told which was found in the `isref` argument.
311 ###### grammar fields
312 struct text current_type;
317 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
318 static const char *known[] = { "$$", "${", "}$" };
321 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
323 struct token t = token_next(ts);
329 if (t.num != TK_ident) {
330 err = "type or assoc expected after '$'";
333 if (text_is(t.txt, "LEFT"))
335 else if (text_is(t.txt, "RIGHT"))
337 else if (text_is(t.txt, "NON"))
339 else if (text_is(t.txt, "TERM")) {
341 g->terminals_declared = 1;
343 g->current_type = t.txt;
344 g->type_isref = isref;
345 if (text_is(t.txt, "void"))
346 g->current_type.txt = NULL;
348 if (t.num != TK_newline) {
349 err = "Extra tokens after type name";
356 err = "$* cannot be followed by a precedence";
360 // This is a precedence or TERM line, need some symbols.
364 while (t.num != TK_newline) {
365 enum symtype type = Terminal;
367 if (t.num == TK_virtual) {
370 if (t.num != TK_ident) {
371 err = "$$ must be followed by a word";
375 err = "Virtual symbols not permitted on $TERM line";
378 } else if (t.num != TK_ident &&
380 err = "Illegal token in precedence line";
383 s = sym_find(g, t.txt);
384 if (s->type != Unknown) {
385 err = "Symbols in precedence/TERM line must not already be known.";
390 s->precedence = g->prec_levels;
397 err = "No symbols given on precedence/TERM line";
402 while (t.num != TK_newline && t.num != TK_eof)
409 A production either starts with an identifier which is the head
410 non-terminal, or a vertical bar (`|`) in which case this production
411 uses the same head as the previous one. The identifier must be
412 followed by a `->` mark. All productions for a given non-terminal must
413 be grouped together with the `nonterminal ->` given only once.
415 After this a (possibly empty) sequence of identifiers and marks form
416 the body of the production. A virtual symbol may be given after the
417 body by preceding it with `$$`. If a virtual symbol is given, the
418 precedence of the production is that for the virtual symbol. If none
419 is given, the precedence is inherited from the last symbol in the
420 production which has a precedence specified.
422 After the optional precedence may come the `${` mark. This indicates
423 the start of a code fragment. If present, this must be on the same
424 line as the start of the production.
426 All of the text from the `${` through to the matching `}$` is
427 collected and forms the code-fragment for the production. It must all
428 be in one `code_node` of the literate code. The `}$` must be
429 at the end of a line.
431 Text in the code fragment will undergo substitutions where `$N` or
432 `$<N`, for some numeric `N` (or non-numeric indicator as described
433 later), will be replaced with a variable holding the parse information
434 for the particular symbol in the production. `$0` is the head of the
435 production, `$1` is the first symbol of the body, etc. The type of `$N`
436 for a terminal symbol is `struct token`. For a non-terminal, it is
437 whatever has been declared for that symbol. The `<` may be included and
438 means that the value (usually a reference) is being moved out, so it
439 will not automatically be freed. The effect of using '<' is that the
440 variable is cleared to all-zeros.
442 While building productions we will need to add to an array which needs to
446 static void array_add(void *varray, int *cnt, void *new)
448 void ***array = varray;
451 current = ((*cnt-1) | (step-1))+1;
452 if (*cnt == current) {
455 *array = realloc(*array, current * sizeof(void*));
457 (*array)[*cnt] = new;
461 Collecting the code fragment simply involves reading tokens until we
462 hit the end token `}$`, and noting the character position of the start and
466 static struct text collect_code(struct token_state *state,
471 code.txt = start.txt.txt + start.txt.len;
473 t = token_next(state);
474 while (t.node == start.node &&
475 t.num != TK_close && t.num != TK_error &&
477 if (t.num == TK_close && t.node == start.node)
478 code.len = t.txt.txt - code.txt;
484 Now we have all the bits we need to parse a full production.
489 ###### grammar fields
490 struct production **productions;
491 int production_count;
493 ###### production fields
495 struct symbol **body;
501 int first_production;
504 static char *parse_production(struct grammar *g,
506 struct token_state *state)
508 /* Head has already been parsed. */
511 struct production p, *pp;
513 memset(&p, 0, sizeof(p));
515 tk = token_next(state);
516 while (tk.num == TK_ident || tk.num == TK_mark) {
517 struct symbol *bs = sym_find(g, tk.txt);
518 if (bs->type == Unknown) {
519 if (!g->terminals_declared)
522 if (bs->type == Virtual) {
523 err = "Virtual symbol not permitted in production";
526 if (bs->precedence) {
527 p.precedence = bs->precedence;
530 array_add(&p.body, &p.body_size, bs);
531 tk = token_next(state);
533 if (tk.num == TK_virtual) {
535 tk = token_next(state);
536 if (tk.num != TK_ident) {
537 err = "word required after $$";
540 vs = sym_find(g, tk.txt);
541 if (vs->precedence == 0) {
542 err = "symbol after $$ must have precedence";
545 p.precedence = vs->precedence;
548 tk = token_next(state);
550 if (tk.num == TK_open) {
551 p.code_line = tk.line;
552 p.code = collect_code(state, tk);
553 if (p.code.txt == NULL) {
554 err = "code fragment not closed properly";
557 tk = token_next(state);
560 if (tk.num != TK_newline && tk.num != TK_eof) {
561 err = "stray tokens at end of line";
564 pp = malloc(sizeof(*pp));
566 array_add(&g->productions, &g->production_count, pp);
569 while (tk.num != TK_newline && tk.num != TK_eof)
570 tk = token_next(state);
575 With the ability to parse productions and dollar-lines, we have nearly all
576 that we need to parse a grammar from a `code_node`.
578 The head of the first production will effectively be the `start` symbol
579 of the grammar. However it won't _actually_ be so. Processing the
580 grammar is greatly simplified if the real start symbol only has a single
581 production, and expects `$eof` as the final terminal. So when we find
582 the first explicit production we insert an extra implicit production as
583 production zero which looks like
585 ###### Example: production 0
588 where `START` is the first non-terminal given.
590 ###### create production zero
591 struct production *p = calloc(1,sizeof(*p));
592 struct text start = {"$start",6};
593 struct text eof = {"$eof",4};
594 struct text code = {"$0 = $<1;", 9};
595 p->head = sym_find(g, start);
596 p->head->type = Nonterminal;
597 p->head->struct_name = g->current_type;
598 p->head->isref = g->type_isref;
599 if (g->current_type.txt)
601 array_add(&p->body, &p->body_size, head);
602 array_add(&p->body, &p->body_size, sym_find(g, eof));
603 p->head->first_production = g->production_count;
604 array_add(&g->productions, &g->production_count, p);
606 Now we are ready to read in the grammar. We ignore comments
607 and strings so that the marks which introduce them can be
608 used as terminals in the grammar. We don't ignore numbers
609 even though we don't allow them as that causes the scanner
610 to produce errors that the parser is better positioned to handle.
611 We also ignore indentation, but we expect and use newlines.
613 To allow comments in the grammar, we explicitly check for "//" as the
614 first token on the line and those lines are skipped. "//" can still be
615 used as a terminal anywhere that a terminal is expected.
618 static struct grammar *grammar_read(struct code_node *code)
620 struct token_config conf = {
623 .words_marks = known,
624 .known_count = sizeof(known)/sizeof(known[0]),
626 .ignored = (1 << TK_line_comment)
627 | (1 << TK_block_comment)
630 | (1 << TK_multi_string)
635 struct token_state *state = token_open(code, &conf);
637 struct symbol *head = NULL;
641 g = calloc(1, sizeof(*g));
644 for (tk = token_next(state); tk.num != TK_eof;
645 tk = token_next(state)) {
646 if (tk.num == TK_newline)
648 if (tk.num == TK_ident) {
650 head = sym_find(g, tk.txt);
651 if (head->type == Nonterminal)
652 err = "This non-terminal has already be used.";
653 else if (head->type == Virtual)
654 err = "Virtual symbol not permitted in head of production";
656 head->type = Nonterminal;
657 head->struct_name = g->current_type;
658 head->isref = g->type_isref;
659 if (g->production_count == 0) {
660 ## create production zero
662 head->first_production = g->production_count;
663 tk = token_next(state);
664 if (tk.num == TK_mark &&
665 text_is(tk.txt, "->"))
666 err = parse_production(g, head, state);
668 err = "'->' missing in production";
670 } else if (tk.num == TK_mark
671 && text_is(tk.txt, "|")) {
672 // another production for same non-term
674 err = parse_production(g, head, state);
676 err = "First production must have a head";
677 } else if (tk.num == TK_mark
678 && text_is(tk.txt, "$")) {
679 err = dollar_line(state, g, 0);
680 } else if (tk.num == TK_mark
681 && text_is(tk.txt, "$*")) {
682 err = dollar_line(state, g, 1);
683 } else if (tk.num == TK_mark
684 && text_is(tk.txt, "//")) {
685 while (tk.num != TK_newline &&
687 tk = token_next(state);
689 err = "Unrecognised token at start of line.";
695 if (g->terminals_declared) {
698 for (s = g->syms; s; s = s->next) {
699 if (s->type != Unknown)
702 fprintf(stderr, "Token %.*s not declared\n",
703 s->name.len, s->name.txt);
706 free(g); // FIXME free content
712 fprintf(stderr, "Error at line %d: %s\n",
715 free(g); // FIXME free content
719 ## Analysing the grammar
721 The central task in analysing the grammar is to determine a set of
722 states to drive the parsing process. This will require finding various
723 sets of symbols and of "items". Some of these sets will need to attach
724 information to each element in the set, so they are more like maps.
726 Each "item" may have a 'look-ahead' or `LA` set associated with it.
727 Many of these will be the same as each other. To avoid memory wastage,
728 and to simplify some comparisons of sets, these sets will be stored in a
729 list of unique sets, each assigned a number.
731 Once we have the data structures in hand to manage these sets and lists,
732 we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
733 sets, and then create the item sets which define the various states.
737 Though we don't only store symbols in these sets, they are the main
738 things we store, so they are called symbol sets or "symsets".
740 A symset has a size, an array of shorts, and an optional array of data
741 values, which are also shorts. If the array of data is not present, we
742 store an impossible pointer, as `NULL` is used to indicate that no
743 memory has been allocated yet.
748 unsigned short *syms, *data;
750 #define NO_DATA ((unsigned short *)1)
751 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
752 const struct symset INIT_DATASET = { 0, NULL, NULL };
754 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
755 using an insertion sort. We don't explicitly record the amount of
756 allocated space as it can be derived directly from the current `cnt`
757 using `((cnt - 1) | 7) + 1`.
760 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
763 int current = ((s->cnt-1) | 7) + 1;
764 if (current == s->cnt) {
766 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
767 if (s->data != NO_DATA)
768 s->data = realloc(s->data, sizeof(*s->data) * current);
771 while (i > 0 && s->syms[i-1] > key) {
772 s->syms[i] = s->syms[i-1];
773 if (s->data != NO_DATA)
774 s->data[i] = s->data[i-1];
778 if (s->data != NO_DATA)
783 Finding a symbol (or item) in a `symset` uses a simple binary search.
784 We return the index where the value was found (so data can be accessed),
785 or `-1` to indicate failure.
787 static int symset_find(struct symset *ss, unsigned short key)
794 while (lo + 1 < hi) {
795 int mid = (lo + hi) / 2;
796 if (ss->syms[mid] <= key)
801 if (ss->syms[lo] == key)
806 We will often want to form the union of two symsets. When we do, we
807 will often want to know if anything changed (as that might mean there is
808 more work to do). So `symset_union` reports whether anything was added
809 to the first set. We use a slow quadratic approach as these sets are
810 rarely large. If profiling shows this to be a problem it can be
813 static int symset_union(struct symset *a, struct symset *b)
817 for (i = 0; i < b->cnt; i++)
818 if (symset_find(a, b->syms[i]) < 0) {
819 unsigned short data = 0;
820 if (b->data != NO_DATA)
822 symset_add(a, b->syms[i], data);
828 And of course we must be able to free a symset.
830 static void symset_free(struct symset ss)
833 if (ss.data != NO_DATA)
839 Some symsets are simply stored somewhere appropriate in the `grammar`
840 data structure, others need a bit of indirection. This applies
841 particularly to "LA" sets. These will be paired with "items" in an
842 "itemset". As itemsets will be stored in a symset, the "LA" set needs
843 to be stored in the `data` field, so we need a mapping from a small
844 (short) number to an LA `symset`.
846 As mentioned earlier this involves creating a list of unique symsets.
848 For now, we just use a linear list sorted by insertion. A skiplist
849 would be more efficient and may be added later.
856 struct setlist *next;
859 ###### grammar fields
860 struct setlist *sets;
865 static int ss_cmp(struct symset a, struct symset b)
868 int diff = a.cnt - b.cnt;
871 for (i = 0; i < a.cnt; i++) {
872 diff = (int)a.syms[i] - (int)b.syms[i];
879 static int save_set(struct grammar *g, struct symset ss)
881 struct setlist **sl = &g->sets;
885 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
892 s = malloc(sizeof(*s));
901 Finding a set by number is currently performed by a simple linear
902 search. If this turns out to hurt performance, we can store the sets in
903 a dynamic array like the productions.
905 static struct symset set_find(struct grammar *g, int num)
907 struct setlist *sl = g->sets;
908 while (sl && sl->num != num)
913 ### Setting `nullable`
915 We set `nullable` on the head symbol for any production for which all
916 the body symbols (if any) are nullable. As this is a recursive
917 definition, any change in the `nullable` setting means that we need to
918 re-evaluate where it needs to be set.
920 We simply loop around performing the same calculations until no more
927 static void set_nullable(struct grammar *g)
930 while (check_again) {
933 for (p = 0; p < g->production_count; p++) {
934 struct production *pr = g->productions[p];
937 if (pr->head->nullable)
939 for (s = 0; s < pr->body_size; s++)
940 if (! pr->body[s]->nullable)
942 if (s == pr->body_size) {
943 pr->head->nullable = 1;
950 ### Building the `first` sets
952 When calculating what can follow a particular non-terminal, we will need
953 to know what the "first" terminal in any subsequent non-terminal might
954 be. So we calculate the `first` set for every non-terminal and store
955 them in an array. We don't bother recording the "first" set for
956 terminals as they are trivial.
958 As the "first" for one symbol might depend on the "first" of another, we
959 repeat the calculations until no changes happen, just like with
960 `set_nullable`. This makes use of the fact that `symset_union` reports
961 if any change happens.
963 The core of this, which finds the "first" of part of a production body,
964 will be reused for computing the "follow" sets, so we split that out
965 into a separate function, `add_first`, which also reports if it got all
966 the way to the end of the production without finding a non-nullable
969 ###### grammar fields
970 struct symset *first;
974 static int add_first(struct production *pr, int start,
975 struct symset *target, struct grammar *g,
980 for (s = start; s < pr->body_size; s++) {
981 struct symbol *bs = pr->body[s];
982 if (bs->type == Terminal) {
983 if (symset_find(target, bs->num) < 0) {
984 symset_add(target, bs->num, 0);
988 } else if (symset_union(target, &g->first[bs->num]))
994 *to_end = (s == pr->body_size);
998 static void build_first(struct grammar *g)
1000 int check_again = 1;
1002 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1003 for (s = 0; s < g->num_syms; s++)
1004 g->first[s] = INIT_SYMSET;
1006 while (check_again) {
1009 for (p = 0; p < g->production_count; p++) {
1010 struct production *pr = g->productions[p];
1011 struct symset *head = &g->first[pr->head->num];
1013 if (add_first(pr, 0, head, g, NULL))
1019 ### Building the `follow` sets.
1021 There are two different situations when we will want to generate
1022 "follow" sets. If we are doing an SLR analysis, we want to generate a
1023 single "follow" set for each non-terminal in the grammar. That is what
1024 is happening here. If we are doing an LALR or LR analysis we will want
1025 to generate a separate "LA" set for each item. We do that later in
1028 There are two parts to generating a "follow" set. Firstly we look at
1029 every place that any non-terminal appears in the body of any production,
1030 and we find the set of possible "first" symbols after there. This is
1031 done using `add_first` above and only needs to be done once as the
1032 "first" sets are now stable and will not change.
1034 ###### grammar fields
1035 struct symset *follow;
1039 for (p = 0; p < g->production_count; p++) {
1040 struct production *pr = g->productions[p];
1043 for (b = 0; b < pr->body_size - 1; b++) {
1044 struct symbol *bs = pr->body[b];
1045 if (bs->type == Terminal)
1047 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1051 The second part is to add the "follow" set of the head of a production
1052 to the "follow" sets of the final symbol in the production, and any
1053 other symbol which is followed only by `nullable` symbols. As this
1054 depends on "follow" itself we need to repeatedly perform the process
1055 until no further changes happen.
1057 Rather than 2 nested loops, one that repeats the whole process until
1058 there is no change, and one that iterates over the set of productions, we
1059 combine these two functions into a single loop.
1063 for (check_again = 0, p = 0;
1064 p < g->production_count;
1065 p < g->production_count-1
1066 ? p++ : check_again ? (check_again = 0, p = 0)
1068 struct production *pr = g->productions[p];
1071 for (b = pr->body_size - 1; b >= 0; b--) {
1072 struct symbol *bs = pr->body[b];
1073 if (bs->type == Terminal)
1075 if (symset_union(&g->follow[bs->num],
1076 &g->follow[pr->head->num]))
1083 We now just need to create and initialise the `follow` list to get a
1087 static void build_follow(struct grammar *g)
1089 int s, check_again, p;
1090 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1091 for (s = 0; s < g->num_syms; s++)
1092 g->follow[s] = INIT_SYMSET;
1096 ### Building itemsets and states
1098 There are three different levels of detail that can be chosen for
1099 building the itemsets and states for the LR grammar. They are:
1101 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1102 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1104 2. LALR(1) where we build look-ahead sets with each item and merge
1105 the LA sets when we find two paths to the same "kernel" set of items.
1106 3. LR(1) where different look-ahead for any item in the set means
1107 a different item set must be created.
1109 ###### forward declarations
1110 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1112 We need to be able to look through existing states to see if a newly
1113 generated state already exists. For now we use a simple sorted linked
1116 An item is a pair of numbers: the production number and the position of
1117 "DOT", which is an index into the body of the production. As the
1118 numbers are not enormous we can combine them into a single "short" and
1119 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1120 production number (so 2000 productions with maximum of 31 symbols in the
1123 Comparing the itemsets will be a little different to comparing symsets
1124 as we want to do the lookup after generating the "kernel" of an
1125 itemset, so we need to ignore the offset=zero items which are added during
1128 To facilitate this, we modify the "DOT" number so that "0" sorts to
1129 the end of the list in the symset, and then only compare items before
1133 static inline unsigned short item_num(int production, int dot)
1135 return production | ((31-dot) << 11);
1137 static inline int item_prod(unsigned short item)
1139 return item & 0x7ff;
1141 static inline int item_dot(unsigned short item)
1143 return (31-(item >> 11)) & 0x1f;
1146 For LR(1) analysis we need to compare not just the itemset in a state
1147 but also the LA sets. As we assign each unique LA set a number, we
1148 can just compare the symset and the data values together.
1151 static int itemset_cmp(struct symset a, struct symset b,
1152 enum grammar_type type)
1158 i < a.cnt && i < b.cnt &&
1159 item_dot(a.syms[i]) > 0 &&
1160 item_dot(b.syms[i]) > 0;
1162 int diff = a.syms[i] - b.syms[i];
1166 diff = a.data[i] - b.data[i];
1171 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1175 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1181 if (type < LR1 || av == -1)
1184 a.data[i] - b.data[i];
1187 It will be helpful to know if an itemset has been "completed" or not,
1188 particularly for LALR where itemsets get merged, at which point they
1189 need to be considered for completion again. So a `completed` flag is
1192 And now we can build the list of itemsets. The lookup routine returns
1193 both a success flag and a pointer to where in the list an insert should
1194 happen, so we don't need to search a second time.
1198 struct itemset *next;
1200 struct symset items;
1201 struct symset go_to;
1203 unsigned short precedence;
1207 ###### grammar fields
1208 struct itemset *items;
1212 static int itemset_find(struct grammar *g, struct itemset ***where,
1213 struct symset kernel, enum grammar_type type)
1215 struct itemset **ip;
1217 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1218 struct itemset *i = *ip;
1220 diff = itemset_cmp(i->items, kernel, type);
1233 Adding an itemset may require merging the LA sets if LALR analysis is
1234 happening. If any new LA set adds any symbols that weren't in the old
1235 LA set, we clear the `completed` flag so that the dependants of this
1236 itemset will be recalculated and their LA sets updated.
1238 `add_itemset` must consume the symsets it is passed, either by adding
1239 them to a data structure, or freeing them.
1241 static int add_itemset(struct grammar *g, struct symset ss,
1242 enum assoc assoc, unsigned short precedence,
1243 enum grammar_type type)
1245 struct itemset **where, *is;
1247 int found = itemset_find(g, &where, ss, type);
1249 is = calloc(1, sizeof(*is));
1250 is->state = g->states;
1254 is->precedence = precedence;
1256 is->go_to = INIT_DATASET;
1265 for (i = 0; i < ss.cnt; i++) {
1266 struct symset temp = INIT_SYMSET, s;
1267 if (ss.data[i] == is->items.data[i])
1269 s = set_find(g, is->items.data[i]);
1270 symset_union(&temp, &s);
1271 s = set_find(g, ss.data[i]);
1272 if (symset_union(&temp, &s)) {
1273 is->items.data[i] = save_set(g, temp);
1284 To build all the itemsets, we first insert the initial itemset made from
1285 production zero, complete each itemset, and then generate new itemsets
1286 from old until no new ones can be made.
1288 Completing an itemset means finding all the items where "DOT" is
1289 followed by a nonterminal and adding "DOT=0" items for every production
1290 from that non-terminal - providing each item hasn't already been added.
1291 When we add such an item it might get inserted before the current
1292 one, so to make sure we process it we reset the loop counter to the
1295 If LA sets are needed, the LA set for each new item is found using
1296 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1297 may be supplemented by the LA set for the item which produced the new
1300 We also collect a set of all symbols which follow "DOT" (in `done`) as
1301 this is used in the next stage.
1303 When itemsets are created we assign a precedence to the itemset from the
1304 complete item, if there is one. We ignore the possibility of there
1305 being two and don't (currently) handle precedence in such grammars.
1306 When completing a grammar we ignore any item where DOT is followed by a
1307 terminal with a precedence lower than that for the itemset. Unless the
1308 terminal has right associativity, we also ignore items where the
1309 terminal has the same precedence. The result is that unwanted items are
1310 still in the itemset, but the terminal doesn't get into the go to set,
1311 so the item is ineffective.
1313 ###### complete itemset
1314 for (i = 0; i < is->items.cnt; i++) {
1315 int p = item_prod(is->items.syms[i]);
1316 int bs = item_dot(is->items.syms[i]);
1317 struct production *pr = g->productions[p];
1320 struct symset LA = INIT_SYMSET;
1321 unsigned short sn = 0;
1323 if (bs == pr->body_size)
1326 if (s->precedence && is->precedence &&
1327 is->precedence > s->precedence)
1328 /* This terminal has a low precedence and
1329 * shouldn't be shifted
1332 if (s->precedence && is->precedence &&
1333 is->precedence == s->precedence && s->assoc != Right)
1334 /* This terminal has a matching precedence and is
1335 * not Right-associative, so we mustn't shift it.
1338 if (symset_find(&done, s->num) < 0)
1339 symset_add(&done, s->num, 0);
1341 if (s->type != Nonterminal)
1347 add_first(pr, bs+1, &LA, g, &to_end);
1349 struct symset ss = set_find(g, is->items.data[i]);
1350 symset_union(&LA, &ss);
1352 sn = save_set(g, LA);
1353 LA = set_find(g, sn);
1356 /* Add productions for this symbol */
1357 for (p2 = s->first_production;
1358 p2 < g->production_count &&
1359 g->productions[p2]->head == s;
1361 int itm = item_num(p2, 0);
1362 int pos = symset_find(&is->items, itm);
1364 symset_add(&is->items, itm, sn);
1365 /* Will have re-ordered, so start
1366 * from beginning again */
1368 } else if (type >= LALR) {
1369 struct symset ss = set_find(g, is->items.data[pos]);
1370 struct symset tmp = INIT_SYMSET;
1371 struct symset *la = &LA;
1373 symset_union(&tmp, &ss);
1374 if (symset_union(&tmp, la)) {
1375 is->items.data[pos] = save_set(g, tmp);
1383 For each symbol we found (and placed in `done`) we collect all the items
1384 for which this symbol is next, and create a new itemset with "DOT"
1385 advanced over the symbol. This is then added to the collection of
1386 itemsets (or merged with a pre-existing itemset).
1388 ###### derive itemsets
1389 // Now we have a completed itemset, so we need to
1390 // compute all the 'next' itemsets and create them
1391 // if they don't exist.
1392 for (i = 0; i < done.cnt; i++) {
1394 unsigned short state;
1395 struct symbol *sym = g->symtab[done.syms[i]];
1396 enum assoc assoc = Non;
1397 unsigned short precedence = 0;
1398 struct symset newitemset = INIT_SYMSET;
1400 newitemset = INIT_DATASET;
1402 for (j = 0; j < is->items.cnt; j++) {
1403 int itm = is->items.syms[j];
1404 int p = item_prod(itm);
1405 int bp = item_dot(itm);
1406 struct production *pr = g->productions[p];
1407 unsigned short la = 0;
1410 if (bp == pr->body_size)
1412 if (pr->body[bp] != sym)
1417 la = is->items.data[j];
1418 if (bp == pr->body_size &&
1419 pr->precedence > 0 &&
1420 pr->precedence > precedence) {
1421 // new itemset is reducible and has a precedence.
1422 precedence = pr->precedence;
1425 pos = symset_find(&newitemset, item_num(p, bp));
1428 symset_add(&newitemset, item_num(p, bp), la);
1429 else if (type >= LALR) {
1430 // Need to merge la set.
1431 int la2 = newitemset.data[pos];
1433 struct symset ss = set_find(g, la2);
1434 struct symset LA = INIT_SYMSET;
1435 symset_union(&LA, &ss);
1436 ss = set_find(g, la);
1437 if (symset_union(&LA, &ss))
1438 newitemset.data[pos] = save_set(g, LA);
1444 state = add_itemset(g, newitemset, assoc, precedence, type);
1445 if (symset_find(&is->go_to, done.syms[i]) < 0)
1446 symset_add(&is->go_to, done.syms[i], state);
1449 All that is left is to create the initial itemset from production zero, and
1450 with `TK_eof` as the LA set.
1453 static void build_itemsets(struct grammar *g, enum grammar_type type)
1455 struct symset first = INIT_SYMSET;
1458 unsigned short la = 0;
1460 // LA set just has eof
1461 struct symset eof = INIT_SYMSET;
1462 symset_add(&eof, TK_eof, 0);
1463 la = save_set(g, eof);
1464 first = INIT_DATASET;
1466 // production 0, offset 0 (with no data)
1467 symset_add(&first, item_num(0, 0), la);
1468 add_itemset(g, first, Non, 0, type);
1469 for (check_again = 0, is = g->items;
1471 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1473 struct symset done = INIT_SYMSET;
1484 ### Completing the analysis.
1486 The exact process of analysis depends on which level was requested. For
1487 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1488 `LR1` we need first, but not follow. For `SLR` we need both.
1490 We don't build the "action" tables that you might expect as the parser
1491 doesn't use them. They will be calculated to some extent if needed for
1494 Once we have built everything we allocate arrays for the two lists:
1495 symbols and itemsets. This allows more efficient access during
1496 reporting. The symbols are grouped as terminals, then non-terminals,
1497 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1498 Special terminals -- meaning just EOL -- are included with the
1499 non-terminals so that they are not expected by the scanner.
1501 ###### grammar fields
1502 struct symbol **symtab;
1503 struct itemset **statetab;
1506 ###### grammar_analyse
1508 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1512 int snum = TK_reserved;
1513 for (s = g->syms; s; s = s->next)
1514 if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1518 g->first_nonterm = snum;
1519 for (s = g->syms; s; s = s->next)
1520 if (s->num < 0 && s->type != Virtual) {
1524 for (s = g->syms; s; s = s->next)
1530 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1531 for (s = g->syms; s; s = s->next)
1532 g->symtab[s->num] = s;
1541 build_itemsets(g, type);
1543 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1544 for (is = g->items; is ; is = is->next)
1545 g->statetab[is->state] = is;
1548 ## Reporting on the Grammar
1550 The purpose of the report is to give the grammar developer insight into
1551 how the grammar parser will work. It is basically a structured dump of
1552 all the tables that have been generated, plus a description of any conflicts.
1554 ###### grammar_report
1555 static int grammar_report(struct grammar *g, enum grammar_type type)
1561 return report_conflicts(g, type);
1564 Firstly we have the complete list of symbols, together with the
1565 "FIRST" set if that was generated. We add a mark to each symbol to
1566 show if it is nullable (`.`).
1570 static void report_symbols(struct grammar *g)
1574 printf("SYMBOLS + FIRST:\n");
1576 printf("SYMBOLS:\n");
1578 for (n = 0; n < g->num_syms; n++) {
1579 struct symbol *s = g->symtab[n];
1583 printf(" %c%3d%c: ",
1584 s->nullable ? '.':' ',
1585 s->num, symtypes[s->type]);
1588 printf(" (%d%s)", s->precedence,
1589 assoc_names[s->assoc]);
1591 if (g->first && s->type == Nonterminal) {
1594 for (j = 0; j < g->first[n].cnt; j++) {
1597 prtxt(g->symtab[g->first[n].syms[j]]->name);
1604 Then we have the follow sets if they were computed.
1606 static void report_follow(struct grammar *g)
1609 printf("FOLLOW:\n");
1610 for (n = 0; n < g->num_syms; n++)
1611 if (g->follow[n].cnt) {
1615 prtxt(g->symtab[n]->name);
1616 for (j = 0; j < g->follow[n].cnt; j++) {
1619 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1625 And finally the item sets. These include the GO TO tables and, for
1626 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1627 it up a bit. First the items, with production number and associativity.
1629 static void report_item(struct grammar *g, int itm)
1631 int p = item_prod(itm);
1632 int dot = item_dot(itm);
1633 struct production *pr = g->productions[p];
1637 prtxt(pr->head->name);
1639 for (i = 0; i < pr->body_size; i++) {
1640 printf(" %s", (dot == i ? ". ": ""));
1641 prtxt(pr->body[i]->name);
1643 if (dot == pr->body_size)
1646 if (pr->precedence && dot == pr->body_size)
1647 printf(" (%d%s)", pr->precedence,
1648 assoc_names[pr->assoc]);
1649 if (dot < pr->body_size &&
1650 pr->body[dot]->precedence) {
1651 struct symbol *s = pr->body[dot];
1652 printf(" [%d%s]", s->precedence,
1653 assoc_names[s->assoc]);
1658 The LA sets which are (possibly) reported with each item:
1660 static void report_la(struct grammar *g, int lanum)
1662 struct symset la = set_find(g, lanum);
1666 printf(" LOOK AHEAD(%d)", lanum);
1667 for (i = 0; i < la.cnt; i++) {
1670 prtxt(g->symtab[la.syms[i]]->name);
1675 Then the go to sets:
1677 static void report_goto(struct grammar *g, struct symset gt)
1682 for (i = 0; i < gt.cnt; i++) {
1684 prtxt(g->symtab[gt.syms[i]]->name);
1685 printf(" -> %d\n", gt.data[i]);
1689 Now we can report all the item sets complete with items, LA sets, and GO TO.
1691 static void report_itemsets(struct grammar *g)
1694 printf("ITEM SETS(%d)\n", g->states);
1695 for (s = 0; s < g->states; s++) {
1697 struct itemset *is = g->statetab[s];
1698 printf(" Itemset %d:", s);
1700 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1702 for (j = 0; j < is->items.cnt; j++) {
1703 report_item(g, is->items.syms[j]);
1704 if (is->items.data != NO_DATA)
1705 report_la(g, is->items.data[j]);
1707 report_goto(g, is->go_to);
1711 ### Reporting conflicts
1713 Conflict detection varies a lot among different analysis levels. However
1714 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1715 LR1 are also similar as they have FOLLOW or LA sets.
1719 ## conflict functions
1721 static int report_conflicts(struct grammar *g, enum grammar_type type)
1724 printf("Conflicts:\n");
1726 cnt = conflicts_lr0(g, type);
1728 cnt = conflicts_slr(g, type);
1730 printf(" - no conflicts\n");
1734 LR0 conflicts are any state which has both a reducible item and
1735 a shiftable item, or two reducible items.
1737 LR05 conflicts only occur if two possible reductions exist,
1738 as shifts always over-ride reductions.
1740 ###### conflict functions
1741 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1745 for (i = 0; i < g->states; i++) {
1746 struct itemset *is = g->statetab[i];
1747 int last_reduce = -1;
1748 int prev_reduce = -1;
1749 int last_shift = -1;
1753 for (j = 0; j < is->items.cnt; j++) {
1754 int itm = is->items.syms[j];
1755 int p = item_prod(itm);
1756 int bp = item_dot(itm);
1757 struct production *pr = g->productions[p];
1759 if (bp == pr->body_size) {
1760 prev_reduce = last_reduce;
1764 if (pr->body[bp]->type == Terminal)
1767 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1768 printf(" State %d has both SHIFT and REDUCE:\n", i);
1769 report_item(g, is->items.syms[last_shift]);
1770 report_item(g, is->items.syms[last_reduce]);
1773 if (prev_reduce >= 0) {
1774 printf(" State %d has 2 (or more) reducible items\n", i);
1775 report_item(g, is->items.syms[prev_reduce]);
1776 report_item(g, is->items.syms[last_reduce]);
1783 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1784 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1785 in the source of the look ahead set.
1787 We build two datasets to reflect the "action" table: one which maps
1788 terminals to items where that terminal could be shifted and another
1789 which maps terminals to items that could be reduced when the terminal
1790 is in look-ahead. We report when we get conflicts between the two.
1792 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1797 for (i = 0; i < g->states; i++) {
1798 struct itemset *is = g->statetab[i];
1799 struct symset shifts = INIT_DATASET;
1800 struct symset reduce = INIT_DATASET;
1804 /* First collect the shifts */
1805 for (j = 0; j < is->items.cnt; j++) {
1806 unsigned short itm = is->items.syms[j];
1807 int p = item_prod(itm);
1808 int bp = item_dot(itm);
1809 struct production *pr = g->productions[p];
1812 if (bp >= pr->body_size ||
1813 pr->body[bp]->type != Terminal)
1818 if (s->precedence && is->precedence)
1819 /* Precedence resolves this, so no conflict */
1822 if (symset_find(&shifts, s->num) < 0)
1823 symset_add(&shifts, s->num, itm);
1825 /* Now look for reductions and conflicts */
1826 for (j = 0; j < is->items.cnt; j++) {
1827 unsigned short itm = is->items.syms[j];
1828 int p = item_prod(itm);
1829 int bp = item_dot(itm);
1830 struct production *pr = g->productions[p];
1832 if (bp < pr->body_size)
1837 la = g->follow[pr->head->num];
1839 la = set_find(g, is->items.data[j]);
1841 for (k = 0; k < la.cnt; k++) {
1842 int pos = symset_find(&shifts, la.syms[k]);
1844 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1846 prtxt(g->symtab[la.syms[k]]->name);
1848 report_item(g, shifts.data[pos]);
1849 report_item(g, itm);
1851 pos = symset_find(&reduce, la.syms[k]);
1853 symset_add(&reduce, la.syms[k], itm);
1856 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1857 prtxt(g->symtab[la.syms[k]]->name);
1859 report_item(g, itm);
1860 report_item(g, reduce.data[pos]);
1864 symset_free(shifts);
1865 symset_free(reduce);
1871 ## Generating the parser
1873 The exported part of the parser is the `parse_XX` function, where the name
1874 `XX` is based on the name of the parser files.
1876 This takes a `code_node`, a partially initialized `token_config`, and an
1877 optional `FILE` to send tracing to. The `token_config` gets the list of
1878 known words added and then is used with the `code_node` to initialize the
1881 `parse_XX` then calls the library function `parser_run` to actually complete
1882 the parse. This needs the `states` table and functions to call the various
1883 pieces of code provided in the grammar file, so they are generated first.
1885 ###### parser_generate
1887 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1888 struct code_node *pre_reduce)
1894 gen_reduce(f, g, file, pre_reduce);
1897 fprintf(f, "#line 0 \"gen_parser\"\n");
1898 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1901 fprintf(f, "\tstruct token_state *tokens;\n");
1902 fprintf(f, "\tconfig->words_marks = known;\n");
1903 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1904 fprintf(f, "\ttokens = token_open(code, config);\n");
1905 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1906 fprintf(f, "\ttoken_close(tokens);\n");
1907 fprintf(f, "\treturn rv;\n");
1908 fprintf(f, "}\n\n");
1911 ### Known words table
1913 The known words table is simply an array of terminal symbols.
1914 The table of nonterminals used for tracing is a similar array.
1918 static void gen_known(FILE *f, struct grammar *g)
1921 fprintf(f, "#line 0 \"gen_known\"\n");
1922 fprintf(f, "static const char *known[] = {\n");
1923 for (i = TK_reserved;
1924 i < g->num_syms && g->symtab[i]->type == Terminal;
1926 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1927 g->symtab[i]->name.txt);
1928 fprintf(f, "};\n\n");
1931 static void gen_non_term(FILE *f, struct grammar *g)
1934 fprintf(f, "#line 0 \"gen_non_term\"\n");
1935 fprintf(f, "static const char *non_term[] = {\n");
1936 for (i = TK_reserved;
1939 if (g->symtab[i]->type == Nonterminal ||
1940 g->symtab[i]->isspecial)
1941 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1942 g->symtab[i]->name.txt);
1943 fprintf(f, "};\n\n");
1946 ### States and the goto tables.
1948 For each state we record the goto table and details of the reducible
1949 production if there is one.
1950 Some of the details of the reducible production are stored in the
1951 `do_reduce` function to come later. Here we store the production
1952 number, the body size (useful for stack management), and the resulting
1953 symbol (useful for knowing how to free data later).
1955 The go to table is stored in a simple array of `sym` and corresponding
1958 ###### exported types
1966 const struct lookup * go_to;
1975 static void gen_goto(FILE *f, struct grammar *g)
1978 fprintf(f, "#line 0 \"gen_goto\"\n");
1979 for (i = 0; i < g->states; i++) {
1980 struct symset gt = g->statetab[i]->go_to;
1985 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1987 for (j = 0; j < gt.cnt; j++)
1988 fprintf(f, "\t{ %d, %d },\n",
1989 gt.syms[j], gt.data[j]);
1994 static void gen_states(FILE *f, struct grammar *g)
1997 fprintf(f, "#line 0 \"gen_states\"\n");
1998 fprintf(f, "static const struct state states[] = {\n");
1999 for (i = 0; i < g->states; i++) {
2000 struct itemset *is = g->statetab[i];
2001 int j, prod = -1, prod_len;
2003 for (j = 0; j < is->items.cnt; j++) {
2004 int itm = is->items.syms[j];
2005 int p = item_prod(itm);
2006 int bp = item_dot(itm);
2007 struct production *pr = g->productions[p];
2009 if (bp < pr->body_size)
2011 /* This is what we reduce - choose longest */
2012 if (prod < 0 || prod_len < pr->body_size) {
2014 prod_len = pr->body_size;
2018 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2019 i, is->go_to.cnt, i);
2021 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2023 struct production *pr = g->productions[prod];
2024 struct symbol *hd = pr->head;
2025 fprintf(f, "%d, %d, %d, ",
2026 prod, pr->body_size, pr->head->num);
2027 if (hd->struct_name.txt == NULL)
2028 fprintf(f, "0 },\n");
2030 fprintf(f, "sizeof(struct %.*s%s) },\n",
2031 hd->struct_name.len,
2032 hd->struct_name.txt,
2033 hd->isref ? "*" : "");
2035 fprintf(f, "-1, -1, -1, -1 },\n");
2037 fprintf(f, "};\n\n");
2040 ### The `do_reduce` function and the code
2042 When the parser engine decides to reduce a production, it calls
2043 `do_reduce` which runs the code that was included with the production,
2046 This code needs to be able to store data somewhere so we record the size
2047 of the data expected with each state so it can be allocated before
2048 `do_reduce` is called.
2050 In order for the code to access "global" context, we pass in the
2051 "config" pointer that was passed to the parser function. If the `struct
2052 token_config` is embedded in some larger structure, the reducing code
2053 can access the larger structure using pointer manipulation. Performing
2054 that pointer manipulation, and any other common code or variables for
2055 all reduce actions, can be provided in a code node called "reduce" which
2056 is passed around in `pre_reduce` which you might have already noticed.
2058 The code fragments require translation when written out. Any `$N` needs
2059 to be converted to a reference either to that buffer (if `$0`) or to the
2060 structure returned by a previous reduction. These pointers need to be
2061 cast to the appropriate type for each access. All this is handled in
2064 `gen_code` also allows symbol references to contain a '`<`' as in
2065 '`$<2`'. This is particularly useful for references (or pointers), but
2066 can be used with structures too. The `<` implies that the value is
2067 being moved out, so the object will not be automatically freed. It is
2068 equivalent to assigning `NULL` to the pointer or filling a structure
2071 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2072 and possibly a number. A number by itself (other than zero) selects a
2073 symbol from the body of the production. A sequence of letters selects
2074 the shortest symbol in the body which contains those letters in the
2075 given order. If a number follows the letters, then a later occurrence
2076 of that symbol is chosen. So "`$AB2`" will refer to the structure
2077 attached to the second occurrence of the shortest symbol which contains
2078 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2079 the number given is too large, then the symbol reference is not
2080 transformed, and will cause an error when the code is compiled.
2084 static int textchr(struct text t, char c, int s)
2087 for (i = s; i < t.len; i++)
2093 static int subseq_match(char *seq, int slen, struct text name)
2097 st = textchr(name, *seq, st);
2107 static int choose_sym(char **namep, int len, struct production *p)
2109 char *name = *namep;
2118 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2124 while (len > 0 && (c >= '0' && c <= '9')) {
2127 n = n * 10 + (c - '0');
2131 if (name == *namep || n > p->body_size)
2137 for (i = 0; i < p->body_size; i++) {
2138 if (!subseq_match(nam, namlen, p->body[i]->name))
2140 if (slen == 0 || p->body[i]->name.len < slen) {
2142 slen = p->body[i]->name.len;
2144 if (s >= 0 && p->body[i] != p->body[s] &&
2145 p->body[i]->name.len == p->body[s]->name.len)
2146 /* not unique, so s cannot be used */
2153 for (i = 0; i < p->body_size; i++)
2154 if (p->body[i] == p->body[s]) {
2159 if (n > 0 || i == p->body_size)
2165 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2168 char *used = calloc(1, p->body_size);
2171 fprintf(f, "\t\t\t");
2172 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2186 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2195 fprintf(f, "(*(struct %.*s*%s)ret)",
2196 p->head->struct_name.len,
2197 p->head->struct_name.txt,
2198 p->head->isref ? "*":"");
2199 else if (p->body[n-1]->type == Terminal)
2200 fprintf(f, "(*(struct token *)body[%d])",
2202 else if (p->body[n-1]->struct_name.txt == NULL)
2203 fprintf(f, "$%d", n);
2205 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2206 p->body[n-1]->struct_name.len,
2207 p->body[n-1]->struct_name.txt,
2208 p->body[n-1]->isref ? "*":"", n-1);
2214 for (i = 0; i < p->body_size; i++) {
2215 if (p->body[i]->struct_name.txt &&
2217 // assume this has been copied out
2218 if (p->body[i]->isref)
2219 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2221 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2229 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2230 struct code_node *pre_reduce)
2233 fprintf(f, "#line 1 \"gen_reduce\"\n");
2234 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2236 fprintf(f, "\tint ret_size = 0;\n");
2238 code_node_print(f, pre_reduce, file);
2240 fprintf(f, "#line 4 \"gen_reduce\"\n");
2241 fprintf(f, "\tswitch(prod) {\n");
2242 for (i = 0; i < g->production_count; i++) {
2243 struct production *p = g->productions[i];
2244 fprintf(f, "\tcase %d:\n", i);
2247 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2251 if (p->head->struct_name.txt)
2252 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2253 p->head->struct_name.len,
2254 p->head->struct_name.txt,
2255 p->head->isref ? "*":"");
2257 fprintf(f, "\t\tbreak;\n");
2259 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2264 As each non-terminal can potentially cause a different type of data
2265 structure to be allocated and filled in, we need to be able to free it when
2268 It is particularly important to have fine control over freeing during error
2269 recovery where individual stack frames might need to be freed.
2271 For this, the grammar author is required to define a `free_XX` function for
2272 each structure that is used by a non-terminal. `do_free` will call whichever
2273 is appropriate given a symbol number, and will call `free` (as is
2274 appropriate for tokens) on any terminal symbol.
2278 static void gen_free(FILE *f, struct grammar *g)
2282 fprintf(f, "#line 0 \"gen_free\"\n");
2283 fprintf(f, "static void do_free(short sym, void *asn)\n");
2285 fprintf(f, "\tif (!asn) return;\n");
2286 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2287 /* Need to handle special terminals too */
2288 for (i = 0; i < g->num_syms; i++) {
2289 struct symbol *s = g->symtab[i];
2290 if (i >= g->first_nonterm && s->type == Terminal &&
2292 fprintf(f, " || sym == %d", s->num);
2294 fprintf(f, ") {\n");
2295 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2296 fprintf(f, "\tswitch(sym) {\n");
2298 for (i = 0; i < g->num_syms; i++) {
2299 struct symbol *s = g->symtab[i];
2301 s->type != Nonterminal ||
2302 s->struct_name.txt == NULL)
2305 fprintf(f, "\tcase %d:\n", s->num);
2307 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2309 s->struct_name.txt);
2310 fprintf(f, "\t\tfree(asn);\n");
2312 fprintf(f, "\t\tfree_%.*s(asn);\n",
2314 s->struct_name.txt);
2315 fprintf(f, "\t\tbreak;\n");
2317 fprintf(f, "\t}\n}\n\n");
2320 ## The main routine.
2322 There are three key parts to the "main" function of parsergen: processing
2323 the arguments, loading the grammar file, and dealing with the grammar.
2325 ### Argument processing.
2327 Command line options allow the selection of analysis mode, name of output
2328 file, and whether or not a report should be generated. By default we create
2329 a report only if no code output was requested.
2331 The `parse_XX` function name uses the basename of the output file which
2332 should not have a suffix (`.c`). `.c` is added to the given name for the
2333 code, and `.h` is added for the header (if header text is specified in the
2340 static const struct option long_options[] = {
2341 { "LR0", 0, NULL, '0' },
2342 { "LR05", 0, NULL, '5' },
2343 { "SLR", 0, NULL, 'S' },
2344 { "LALR", 0, NULL, 'L' },
2345 { "LR1", 0, NULL, '1' },
2346 { "tag", 1, NULL, 't' },
2347 { "report", 0, NULL, 'R' },
2348 { "output", 1, NULL, 'o' },
2349 { NULL, 0, NULL, 0 }
2351 const char *options = "05SL1t:Ro:";
2353 ###### process arguments
2355 char *outfile = NULL;
2360 enum grammar_type type = LR05;
2361 while ((opt = getopt_long(argc, argv, options,
2362 long_options, NULL)) != -1) {
2377 outfile = optarg; break;
2379 tag = optarg; break;
2381 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2386 infile = argv[optind++];
2388 fprintf(stderr, "No input file given\n");
2391 if (outfile && report == 1)
2394 if (name && strchr(name, '/'))
2395 name = strrchr(name, '/')+1;
2397 if (optind < argc) {
2398 fprintf(stderr, "Excess command line arguments\n");
2402 ### Loading the grammar file
2404 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2407 Once we have extracted the code (with `mdcode`) we expect to find three
2408 or four sections: header, code, grammar, reduce.
2409 Anything else that is not excluded by the `--tag` option is an error.
2411 "header", "code", and "reduce" are optional, though it is hard to build
2412 a working parser without the first two. "grammar" must be provided.
2416 #include <sys/mman.h>
2421 static void pr_err(char *msg)
2424 fprintf(stderr, "%s\n", msg);
2428 struct section *table;
2432 fd = open(infile, O_RDONLY);
2434 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2435 infile, strerror(errno));
2438 len = lseek(fd, 0, 2);
2439 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2440 table = code_extract(file, file+len, pr_err);
2442 struct code_node *hdr = NULL;
2443 struct code_node *code = NULL;
2444 struct code_node *gram = NULL;
2445 struct code_node *pre_reduce = NULL;
2446 for (s = table; s; s = s->next) {
2447 struct text sec = s->section;
2448 if (tag && !strip_tag(&sec, tag))
2450 if (text_is(sec, "header"))
2452 else if (text_is(sec, "code"))
2454 else if (text_is(sec, "grammar"))
2456 else if (text_is(sec, "reduce"))
2457 pre_reduce = s->code;
2459 fprintf(stderr, "Unknown content section: %.*s\n",
2460 s->section.len, s->section.txt);
2465 ### Processing the input
2467 As we need to append an extension to a filename and then open it for
2468 writing, and we need to do this twice, it helps to have a separate function.
2472 static FILE *open_ext(char *base, char *ext)
2474 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2476 strcat(strcpy(fn, base), ext);
2482 If we can read the grammar, then we analyse and optionally report on it. If
2483 the report finds conflicts we will exit with an error status.
2485 ###### process input
2486 struct grammar *g = NULL;
2488 fprintf(stderr, "No grammar section provided\n");
2491 g = grammar_read(gram);
2493 fprintf(stderr, "Failure to parse grammar\n");
2498 grammar_analyse(g, type);
2500 if (grammar_report(g, type))
2504 If a "header" section is defined, we write it out and include a declaration
2505 for the `parse_XX` function so it can be used from separate code.
2507 if (rv == 0 && hdr && outfile) {
2508 FILE *f = open_ext(outfile, ".h");
2510 code_node_print(f, hdr, infile);
2511 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2515 fprintf(stderr, "Cannot create %s.h\n",
2521 And if all goes well and an output file was provided, we create the `.c`
2522 file with the code section (if any) and the parser tables and function.
2524 if (rv == 0 && outfile) {
2525 FILE *f = open_ext(outfile, ".c");
2528 code_node_print(f, code, infile);
2529 gen_parser(f, g, infile, name, pre_reduce);
2532 fprintf(stderr, "Cannot create %s.c\n",
2538 And that about wraps it up. We need to set the locale so that UTF-8 is
2539 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2541 ###### File: parsergen.mk
2542 parsergen : parsergen.o libscanner.o libmdcode.o
2543 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2550 int main(int argc, char *argv[])
2555 setlocale(LC_ALL,"");
2557 ## process arguments
2564 ## The SHIFT/REDUCE parser
2566 Having analysed the grammar and generated all the tables, we only need
2567 the shift/reduce engine to bring it all together.
2569 ### Goto table lookup
2571 The parser generator has nicely provided us with goto tables sorted by
2572 symbol number. We need a binary search function to find a symbol in the
2575 ###### parser functions
2577 static int search(const struct state *l, int sym)
2580 int hi = l->go_to_cnt;
2584 while (lo + 1 < hi) {
2585 int mid = (lo + hi) / 2;
2586 if (l->go_to[mid].sym <= sym)
2591 if (l->go_to[lo].sym == sym)
2592 return l->go_to[lo].state;
2597 ### Memory allocation
2599 The `scanner` returns tokens in a local variable - we want them in allocated
2600 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2601 make an allocated copy as required.
2603 ###### parser includes
2606 ###### parser functions
2608 static struct token *tok_copy(struct token tk)
2610 struct token *new = malloc(sizeof(*new));
2615 ### The state stack.
2617 The core data structure for the parser is the stack. This tracks all
2618 the symbols that have been recognised or partially recognised.
2620 The stack usually won't grow very large - maybe a few tens of entries.
2621 So we dynamically grow an array as required but never bother to
2622 shrink it down again.
2624 We keep the stack as two separate allocations. One, `asn_stack`, stores
2625 the "abstract syntax nodes" which are created by each reduction. When
2626 we call `do_reduce` we need to pass an array of the `asn`s of the body
2627 of the production, and by keeping a separate `asn` stack, we can just
2628 pass a pointer into this stack.
2630 The other allocation stores all other stack fields of which there are
2631 two. The `state` is the most important one and guides the parsing
2632 process. The `sym` is nearly unnecessary as it is implicit in the
2633 state. However when we want to free entries from the `asn_stack`, it
2634 helps to know what type they are so we can call the right freeing
2635 function. The symbol leads us to the right free function through
2638 The stack is most properly seen as alternating states and symbols -
2639 states, like the 'DOT' in items, are between symbols. Each frame in
2640 our stack holds a state and the symbol that was before it. The
2641 bottom of stack holds the start state but no symbol, as nothing came
2642 before the beginning. As we need to store some value, `TK_eof` is used
2643 to mark the beginning of the file as well as the end.
2645 ###### parser functions
2661 Two operations are needed on the stack - shift (which is like push) and pop.
2663 Shift applies not only to terminals but also to non-terminals. When we
2664 reduce a production we will pop off frames corresponding to the body
2665 symbols, then push on a frame for the head of the production. This last
2666 is exactly the same process as shifting in a terminal so we use the same
2667 function for both. In both cases we provide the symbol. The state is
2668 deduced from the current top-of-stack state and the new symbol.
2670 To simplify other code we arrange for `shift` to fail if there is no `goto`
2671 state for the symbol. This is useful in basic parsing due to our design
2672 that we shift when we can, and reduce when we cannot. So the `shift`
2673 function reports if it could.
2675 `shift` is also used to push state zero onto the stack, so if the
2676 stack is empty, it always chooses zero as the next state.
2678 So `shift` finds the next state. If that succeeds it extends the
2679 allocations if needed and pushes all the information onto the stacks.
2681 An extra complication is added to `shift` by the `EOL` token. This
2682 token must be generated when a `NEWLINE` is seen, but an `EOL` is
2683 expected. When this happens, the `NEWLINE` is NOT consumed, so multiple
2684 EOL can appear before a NEWLINE. To indicate that the token was shifted
2685 but not consumed, `shift` can return the special value `2`.  The token
2686 number for `EOL` cannot be statically declared, so when the parser
2687 starts we need to look through the array of non-terminals to find the
2695 while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2697 p.tk_eol += TK_reserved + config->known_count;
2700 ###### parser functions
2702 static int shift(struct parser *p,
2703 short sym, void *asn,
2704 const struct state states[])
2706 struct frame next = {0};
2708 int newstate = p->tos
2709 ? search(&states[p->stack[p->tos-1].state],
2714 else if (sym != TK_newline)
2717 // have a NEWLINE, might need an EOL
2720 ? search(&states[p->stack[p->tos-1].state],
2726 asn = tok_copy(*(struct token*)asn);
2729 if (p->tos >= p->stack_size) {
2730 p->stack_size += 10;
2731 p->stack = realloc(p->stack, p->stack_size
2732 * sizeof(p->stack[0]));
2733 p->asn_stack = realloc(p->asn_stack, p->stack_size
2734 * sizeof(p->asn_stack[0]));
2737 next.state = newstate;
2739 p->stack[p->tos] = next;
2740 p->asn_stack[p->tos] = asn;
2745 `pop` primarily moves the top of stack (`tos`) back down the required
2746 amount and frees any `asn` entries that need to be freed. It is called
2747 _after_ we reduce a production, just before we `shift` the nonterminal
2750 ###### parser functions
2752 static void pop(struct parser *p, int num,
2753 void(*do_free)(short sym, void *asn))
2758 for (i = 0; i < num; i++)
2759 do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2762 ### The heart of the parser.
2764 Now we have the parser. For each token we might shift it, trigger a
2765 reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL)
2766 might also be ignored. Ignoring tokens is combined with shifting.
2770 struct parser p = { 0 };
2771 struct token *tk = NULL;
2774 ###### heart of parser
2776 shift(&p, TK_eof, NULL, states);
2777 while (!accepted && p.tos > 0) {
2778 struct frame *tos = &p.stack[p.tos-1];
2780 tk = tok_copy(token_next(tokens));
2781 parser_trace(trace, &p,
2782 tk, states, non_term, config->known_count);
2784 ## try shift or ignore
2789 Indents are ignored unless they can be shifted onto the stack
2790 immediately. The end of an indented section - the OUT token - is
2791 ignored precisely when the indent was ignored. To keep track of this we
2792 need a small stack of flags, which is easily stored as bits in an
2793 `unsigned long`. This will never overflow and the scanner only allows
2794 20 levels of indentation.
2797 unsigned long ignored_indents;
2799 NEWLINE/EOL is ignored when in an indented section of text which was not
2800 explicitly expected by the grammar. So if the most recent indent is
2801 ignored, so is any EOL token.
2803 For other tokens, we shift the next token if that is possible, otherwise
2804 we try to reduce a production.
2806 ###### try shift or ignore
2808 if ((tk->num == TK_newline || tk->num == TK_out) &&
2809 (p.ignored_indents & 1)) {
2810 /* indented, so ignore OUT and NEWLINE */
2811 if (tk->num == TK_out)
2812 p.ignored_indents >>= 1;
2815 parser_trace_action(trace, "Ignore");
2819 switch (shift(&p, tk->num, tk, states)) {
2821 if (tk->num == TK_out)
2822 p.ignored_indents >>= 1;
2823 if (tk->num == TK_in)
2824 p.ignored_indents <<= 1;
2829 parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
2834 if (tk->num == TK_in) {
2835 /* No indent expected here, so ignore IN */
2838 p.ignored_indents <<= 1;
2839 p.ignored_indents |= 1;
2840 parser_trace_action(trace, "Ignore");
2844 We have already discussed the bulk of the handling of a "reduce" action,
2845 with the `pop()` and `shift()` functions doing much of the work. There
2846 is a little more complexity needed to manage storage for the asn (Abstract
2847 Syntax Node), and also a test of whether the reduction is permitted.
2849 When we try to shift the result of reducing production-zero, it will
2850 fail because there is no next state. In this case the asn will not have
2851 been stored on the stack, so it gets stored in the `ret` variable, and we
2852 report that the input has been accepted.
2860 if (states[tos->state].reduce_prod >= 0) {
2863 const struct state *nextstate = &states[tos->state];
2864 int prod = nextstate->reduce_prod;
2865 int size = nextstate->reduce_size;
2866 int res_size = nextstate->result_size;
2868 body = p.asn_stack + (p.tos - size);
2869 res = res_size ? calloc(1, res_size) : NULL;
2870 res_size = do_reduce(prod, body, config, res);
2871 if (res_size != nextstate->result_size)
2873 pop(&p, size, do_free);
2874 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2877 parser_trace_action(trace, "Accept");
2879 parser_trace_action(trace, "Reduce");
2883 If we can neither shift nor reduce we have an error to handle. There
2884 are two possible responses to an error: we can pop single frames off the
2885 stack until we find a frame that can shift `TK_error`, or we can discard
2886 the current look-ahead symbol.
2888 When we first see an error we do the first of these and set a flag to
2889 record that we are processing an error. If the normal shift/reduce
2890 tests fail to find that anything can be done from that state, we start
2891 dropping tokens until either we manage to shift one, or reach end-of-file.
2904 parser_trace_action(trace, "DISCARD");
2905 if (tk->num == TK_eof)
2910 struct token *err_tk;
2912 parser_trace_action(trace, "ERROR");
2914 err_tk = tok_copy(*tk);
2915 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2916 // discard this state
2917 pop(&p, 1, do_free);
2920 // no state accepted TK_error
2926 ###### parser includes
2931 void *parser_run(struct token_state *tokens,
2932 const struct state states[],
2933 int (*do_reduce)(int, void**, struct token_config*, void*),
2934 void (*do_free)(short, void*),
2935 FILE *trace, const char *non_term[],
2936 struct token_config *config)
2945 pop(&p, p.tos, do_free);
2951 ###### exported functions
2952 void *parser_run(struct token_state *tokens,
2953 const struct state states[],
2954 int (*do_reduce)(int, void**, struct token_config*, void*),
2955 void (*do_free)(short, void*),
2956 FILE *trace, const char *non_term[],
2957 struct token_config *config);
2961 Being able to visualize the parser in action can be invaluable when
2962 debugging the parser code, or trying to understand how the parse of a
2963 particular grammar progresses. The stack contains all the important
2964 state, so just printing out the stack every time around the parse loop
2965 can make it possible to see exactly what is happening.
2967 This doesn't explicitly show each SHIFT and REDUCE action. However they
2968 are easily deduced from the change between consecutive lines, and the
2969 details of each state can be found by cross referencing the states list
2970 in the stack with the "report" that parsergen can generate.
2972 For terminal symbols, we just dump the token. For non-terminals we
2973 print the name of the symbol. The look ahead token is reported at the
2974 end inside square brackets.
2976 ###### parser functions
2978 static char *reserved_words[] = {
2979 [TK_error] = "ERROR",
2982 [TK_newline] = "NEWLINE",
2986 void parser_trace(FILE *trace, struct parser *p,
2987 struct token *tk, const struct state states[],
2988 const char *non_term[], int knowns)
2993 for (i = 0; i < p->tos; i++) {
2994 struct frame *f = &p->stack[i];
2997 if (sym < TK_reserved &&
2998 reserved_words[sym] != NULL)
2999 fputs(reserved_words[sym], trace);
3000 else if (sym < TK_reserved + knowns) {
3001 struct token *t = p->asn_stack[i];
3002 text_dump(trace, t->txt, 20);
3004 fputs(non_term[sym - TK_reserved - knowns],
3008 fprintf(trace, "(%d) ", f->state);
3010 fprintf(trace, "[");
3011 if (tk->num < TK_reserved &&
3012 reserved_words[tk->num] != NULL)
3013 fputs(reserved_words[tk->num], trace);
3015 text_dump(trace, tk->txt, 20);
3016 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3019 void parser_trace_action(FILE *trace, char *action)
3022 fprintf(trace, " - %s\n", action);
3027 The obvious example for a parser is a calculator.
3029 As `scanner` provides a number parsing function using `libgmp` it is not much
3030 work to perform arbitrary rational number calculations.
3032 This calculator takes one expression, or an equality test, per line.
3033 The results are printed and if any equality test fails, the program
3034 exits with an error.
3036 ###### File: parsergen.mk
3037 calc.c calc.h : parsergen parsergen.mdc
3038 ./parsergen --tag calc -o calc parsergen.mdc
3039 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3040 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3042 ./calc parsergen.mdc
3047 #include "parse_number.h"
3048 // what do we use for a demo-grammar? A calculator of course.
3060 #include <sys/mman.h>
3066 #include "scanner.h"
3071 static void free_number(struct number *n)
3077 static int text_is(struct text t, char *s)
3079 return (strlen(s) == t.len &&
3080 strncmp(s, t.txt, t.len) == 0);
3083 int main(int argc, char *argv[])
3085 int fd = open(argv[1], O_RDONLY);
3086 int len = lseek(fd, 0, 2);
3087 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3088 struct section *table = code_extract(file, file+len, NULL);
3090 struct token_config config = {
3091 .ignored = (1 << TK_line_comment)
3094 .number_chars = ".,_+-",
3098 for (s = table; s; s = s->next)
3099 if (text_is(s->section, "example: input"))
3100 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3102 struct section *t = table->next;
3103 code_free(table->code);
3115 Session -> Session Line
3118 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3119 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3120 gmp_printf(" or as a decimal: %Fg\n", fl);
3124 | Expression = Expression NEWLINE ${
3125 if (mpq_equal($1.val, $3.val))
3126 gmp_printf("Both equal %Qd\n", $1.val);
3128 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3133 | NEWLINE ${ printf("Blank line\n"); }$
3134 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3137 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3138 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3139 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3140 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3141 | Expression // Expression ${ {
3144 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3145 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3146 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3147 mpz_tdiv_q(z0, z1, z2);
3148 mpq_set_z($0.val, z0);
3149 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3151 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3152 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3157 3.1415926535 - 355/113
3159 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3161 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1