1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
8 Indent tokens (IN and OUT) and end-of-line (EOL) tokens can be used to
9 describe the grammar and the parser can selectively ignore these where
12 There are several distinct sections.
14 1. `grammar_read` will read a grammar from a literate-code file and
15 store the productions, symbols, and code fragments.
16 2. `grammar_analyse` will take that grammar and build LR parsing
17 states and look-ahead tables.
18 3. `grammar_report` will present the details of the analysis
19 in a readable format and will report any conflicts.
20 4. `parser_generate` will write out C code files with various
21 tables and with the code fragments arranged in useful places.
22 5. `parser_run` is a library function intended to be linked together
23 with the generated parser tables to complete the implementation of
25 6. Finally `calc` is a test grammar for a simple calculator. The
26 `parsergen` program built from the C code in this file can extract
27 that grammar directly from this file and process it.
29 ###### File: parsergen.c
34 ## forward declarations
45 ###### File: libparser.c
52 ###### File: parsergen.mk
55 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
58 ## Reading the grammar
60 The grammar must be stored in a markdown literate code file as parsed by
61 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
62 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
63 present, will be literally copied into the generated `.c` and `.h`
64 files. The third is required and contains the grammar which is
65 tokenised with "[scanner][]". `reduce` can provide variable
66 declarations and common initialization code for all the reduction code
67 fragments in the grammar.
69 If the `--tag` option is given, then any top level header that doesn't
70 start with the tag is ignored, and the tag is stripped from the rest. So
71 `--tag Foo` means that the four sections expected are `Foo: header`,
72 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
73 to extract the sample calculator from this file.
76 [scanner]: scanner.html
82 ###### parser includes
86 The grammar contains production sets, precedence/associativity
87 declarations, general terminal declarations, and data type declarations.
88 These are all parsed with _ad hoc_ parsing as we don't have a parser
91 The precedence and associativity can be set for each production, and
92 can be inherited from symbols. The data type (either structure or a
93 reference to a structure) is potentially defined for each non-terminal
94 and describes what C structure is used to store information about each
98 enum assoc {Left, Right, Non};
99 char *assoc_names[] = {"Left","Right","Non"};
102 struct text struct_name;
105 unsigned short precedence;
109 unsigned short precedence;
117 The strings reported by `mdcode` and `scanner` are `struct text` which
118 have length rather than being nul terminated. To help with printing and
119 comparing we define `text_is` and `prtxt`, which should possibly go in
120 `mdcode`. `scanner` does provide `text_dump` which is useful for
121 strings which might contain control characters.
123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
124 because that is what we need to detect tags.
127 static int text_is(struct text t, char *s)
129 return (strlen(s) == t.len &&
130 strncmp(s, t.txt, t.len) == 0);
132 static void prtxt(struct text t)
134 printf("%.*s", t.len, t.txt);
137 static int strip_tag(struct text *t, char *tag)
139 int skip = strlen(tag) + 1;
140 if (skip >= t->len ||
141 strncmp(t->txt, tag, skip-1) != 0 ||
142 t->txt[skip-1] != ':')
144 while (skip < t->len && t->txt[skip] == ' ')
153 Productions are comprised primarily of symbols - terminal and
154 non-terminal. We do not make a syntactic distinction between the two,
155 though non-terminals must be identifiers. Non-terminal symbols are
156 those which appear in the head of a production, terminal symbols are
157 those which don't. There are also "virtual" symbols used for precedence
158 marking discussed later, and sometimes we won't know what type a symbol
161 To help with code safety it is possible to declare the terminal symbols.
162 If this is done, then any symbol used in a production that does not
163 appear in the head of any production and is not declared as a terminal
164 is treated as an error.
166 ###### forward declarations
167 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
168 char *symtypes = "UVTN";
171 ###### grammar fields
172 int terminals_declared;
174 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
175 table of known symbols and the resulting parser will report them as
176 `TK_reserved + N`. A small set of identifiers are reserved for the
177 different token types that `scanner` can report, and an even smaller set
178 are reserved for a special token that the parser can generate (`EOL`) as
179 will be described later. This latter set cannot use predefined numbers,
180 so they are marked as `isspecial` for now and will get assigned a number
181 with the non-terminals later.
185 static struct { int num; char *name; } reserved_words[] = {
186 { TK_error, "ERROR" },
187 { TK_number, "NUMBER" },
188 { TK_ident, "IDENTIFIER" },
190 { TK_string, "STRING" },
191 { TK_multi_string, "MULTI_STRING" },
194 { TK_newline, "NEWLINE" },
201 unsigned int isspecial:1;
203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
204 recognised. The former is automatically expected at the end of the text
205 being parsed. The latter are always ignored by the parser.
207 All of these symbols are stored in a simple symbol table. We use the
208 `struct text` from `mdcode` to store the name and link them together
209 into a sorted list using an insertion sort.
211 We don't have separate `find` and `insert` functions as any symbol we
212 find needs to be remembered. We simply expect `find` to always return a
213 symbol, but its type might be `Unknown`.
222 ###### grammar fields
227 static struct symbol *sym_find(struct grammar *g, struct text s)
229 struct symbol **l = &g->syms;
234 (cmp = text_cmp((*l)->name, s)) < 0)
238 n = calloc(1, sizeof(*n));
247 static void symbols_init(struct grammar *g)
249 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
251 for (i = 0; i < entries; i++) {
254 t.txt = reserved_words[i].name;
255 t.len = strlen(t.txt);
258 s->num = reserved_words[i].num;
263 ### Data types and precedence.
265 Data type specification, precedence specification, and declaration of
266 terminals are all introduced by a dollar sign at the start of the line.
267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
268 precedence, if it is `TERM` then the line declares terminals without
269 precedence, otherwise it specifies a data type.
271 The data type name is simply stored and applied to the head of all
272 subsequent productions. It must be the name of a structure optionally
273 preceded by an asterisk which means a reference or "pointer". So
274 `$expression` maps to `struct expression` and `$*statement` maps to
275 `struct statement *`.
277 Any productions given before the first data type declaration will have
278 no data type associated with them and can carry no information. In
279 order to allow other non-terminals to have no type, the data type
280 `$void` can be given. This does *not* mean that `struct void` will be
281 used, but rather that no type will be associated with subsequent
284 The precedence line must contain a list of symbols, either terminal
285 symbols or virtual symbols. It can only contain symbols that have not
286 been seen yet, so precedence declaration must precede the productions
289 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
290 carry precedence information but are not included in the grammar. A
291 production can declare that it inherits the precedence of a given
294 This use for `$$` precludes it from being used as a symbol in the
295 described language. Two other symbols: `${` and `}$` are also
298 Each new precedence line introduces a new precedence level and
299 declares how it associates. This level is stored in each symbol
300 listed and may be inherited by any production which uses the symbol. A
301 production inherits from the last symbol which has a precedence.
303 The symbols on the first precedence line have the lowest precedence.
304 Subsequent lines introduce symbols with higher precedence and so bind
307 Note that a declaration line (or "dollar line") can start with either of
308 two different marks: "$" or "$*". The `dollar_line()` function defined
309 here is told which was found in the `isref` argument.
311 ###### grammar fields
312 struct text current_type;
317 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
318 static const char *known[] = { "$$", "${", "}$" };
321 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
323 struct token t = token_next(ts);
329 if (t.num != TK_ident) {
330 err = "type or assoc expected after '$'";
333 if (text_is(t.txt, "LEFT"))
335 else if (text_is(t.txt, "RIGHT"))
337 else if (text_is(t.txt, "NON"))
339 else if (text_is(t.txt, "TERM")) {
341 g->terminals_declared = 1;
343 g->current_type = t.txt;
344 g->type_isref = isref;
345 if (text_is(t.txt, "void"))
346 g->current_type.txt = NULL;
348 if (t.num != TK_newline) {
349 err = "Extra tokens after type name";
356 err = "$* cannot be followed by a precedence";
360 // This is a precedence or TERM line, need some symbols.
364 while (t.num != TK_newline) {
365 enum symtype type = Terminal;
367 if (t.num == TK_virtual) {
370 if (t.num != TK_ident) {
371 err = "$$ must be followed by a word";
375 err = "Virtual symbols not permitted on $TERM line";
378 } else if (t.num != TK_ident &&
380 err = "Illegal token in precedence line";
383 s = sym_find(g, t.txt);
384 if (s->type != Unknown) {
385 err = "Symbols in precedence/TERM line must not already be known.";
390 s->precedence = g->prec_levels;
397 err = "No symbols given on precedence/TERM line";
401 while (t.num != TK_newline && t.num != TK_eof)
408 A production either starts with an identifier which is the head
409 non-terminal, or a vertical bar (`|`) in which case this production
410 uses the same head as the previous one. The identifier must be
411 followed by a `->` mark. All productions for a given non-terminal must
412 be grouped together with the `nonterminal ->` given only once.
414 After this a (possibly empty) sequence of identifiers and marks form
415 the body of the production. A virtual symbol may be given after the
416 body by preceding it with `$$`. If a virtual symbol is given, the
417 precedence of the production is that for the virtual symbol. If none
418 is given, the precedence is inherited from the last symbol in the
419 production which has a precedence specified.
421 After the optional precedence may come the `${` mark. This indicates
422 the start of a code fragment. If present, this must be on the same
423 line as the start of the production.
425 All of the text from the `${` through to the matching `}$` is
426 collected and forms the code-fragment for the production. It must all
427 be in one `code_node` of the literate code. The `}$` must be
428 at the end of a line.
430 Text in the code fragment will undergo substitutions where `$N` or
431 `$<N`, for some numeric `N` (or non-numeric indicator as described
432 later), will be replaced with a variable holding the parse information
433 for the particular symbol in the production. `$0` is the head of the
434 production, `$1` is the first symbol of the body, etc. The type of `$N`
435 for a terminal symbol is `struct token`. For a non-terminal, it is
436 whatever has been declared for that symbol. The `<` may be included and
437 means that the value (usually a reference) is being moved out, so it
438 will not automatically be freed. The effect of using '<' is that the
439 variable is cleared to all-zeros.
441 While building productions we will need to add to an array which needs to
445 static void array_add(void *varray, int *cnt, void *new)
447 void ***array = varray;
450 current = ((*cnt-1) | (step-1))+1;
451 if (*cnt == current) {
454 *array = realloc(*array, current * sizeof(void*));
456 (*array)[*cnt] = new;
460 Collecting the code fragment simply involves reading tokens until we
461 hit the end token `}$`, and noting the character position of the start and
465 static struct text collect_code(struct token_state *state,
470 code.txt = start.txt.txt + start.txt.len;
472 t = token_next(state);
473 while (t.node == start.node &&
474 t.num != TK_close && t.num != TK_error &&
476 if (t.num == TK_close && t.node == start.node)
477 code.len = t.txt.txt - code.txt;
483 Now we have all the bits we need to parse a full production.
488 ###### grammar fields
489 struct production **productions;
490 int production_count;
492 ###### production fields
494 struct symbol **body;
500 int first_production;
503 static char *parse_production(struct grammar *g,
505 struct token_state *state)
507 /* Head has already been parsed. */
510 struct production p, *pp;
512 memset(&p, 0, sizeof(p));
514 tk = token_next(state);
515 while (tk.num == TK_ident || tk.num == TK_mark) {
516 struct symbol *bs = sym_find(g, tk.txt);
517 if (bs->type == Unknown) {
518 if (!g->terminals_declared)
521 if (bs->type == Virtual) {
522 err = "Virtual symbol not permitted in production";
525 if (bs->precedence) {
526 p.precedence = bs->precedence;
529 array_add(&p.body, &p.body_size, bs);
530 tk = token_next(state);
532 if (tk.num == TK_virtual) {
534 tk = token_next(state);
535 if (tk.num != TK_ident) {
536 err = "word required after $$";
539 vs = sym_find(g, tk.txt);
540 if (vs->precedence == 0) {
541 err = "symbol after $$ must have precedence";
544 p.precedence = vs->precedence;
547 tk = token_next(state);
549 if (tk.num == TK_open) {
550 p.code_line = tk.line;
551 p.code = collect_code(state, tk);
552 if (p.code.txt == NULL) {
553 err = "code fragment not closed properly";
556 tk = token_next(state);
559 if (tk.num != TK_newline && tk.num != TK_eof) {
560 err = "stray tokens at end of line";
563 pp = malloc(sizeof(*pp));
565 array_add(&g->productions, &g->production_count, pp);
568 while (tk.num != TK_newline && tk.num != TK_eof)
569 tk = token_next(state);
574 With the ability to parse productions and dollar-lines, we have nearly all
575 that we need to parse a grammar from a `code_node`.
577 The head of the first production will effectively be the `start` symbol
578 of the grammar. However it won't _actually_ be so. Processing the
579 grammar is greatly simplified if the real start symbol only has a single
580 production, and expects `$eof` as the final terminal. So when we find
581 the first explicit production we insert an extra implicit production as
582 production zero which looks like
584 ###### Example: production 0
587 where `START` is the first non-terminal given.
589 ###### create production zero
590 struct production *p = calloc(1,sizeof(*p));
591 struct text start = {"$start",6};
592 struct text eof = {"$eof",4};
593 struct text code = {"$0 = $<1;", 9};
594 p->head = sym_find(g, start);
595 p->head->type = Nonterminal;
596 p->head->struct_name = g->current_type;
597 p->head->isref = g->type_isref;
598 if (g->current_type.txt)
600 array_add(&p->body, &p->body_size, head);
601 array_add(&p->body, &p->body_size, sym_find(g, eof));
602 p->head->first_production = g->production_count;
603 array_add(&g->productions, &g->production_count, p);
605 Now we are ready to read in the grammar. We ignore comments
606 and strings so that the marks which introduce them can be
607 used as terminals in the grammar. We don't ignore numbers
608 even though we don't allow them as that causes the scanner
609 to produce errors that the parser is better positioned to handle.
610 We also ignore indentation, but we expect and use newlines.
612 To allow comments in the grammar, we explicitly check for "//" as the
613 first token on the line and those lines are skipped. "//" can still be
614 used as a terminal anywhere that a terminal is expected.
617 static struct grammar *grammar_read(struct code_node *code)
619 struct token_config conf = {
622 .words_marks = known,
623 .known_count = sizeof(known)/sizeof(known[0]),
625 .ignored = (1 << TK_line_comment)
626 | (1 << TK_block_comment)
629 | (1 << TK_multi_string)
634 struct token_state *state = token_open(code, &conf);
636 struct symbol *head = NULL;
640 g = calloc(1, sizeof(*g));
643 for (tk = token_next(state); tk.num != TK_eof;
644 tk = token_next(state)) {
645 if (tk.num == TK_newline)
647 if (tk.num == TK_ident) {
649 head = sym_find(g, tk.txt);
650 if (head->type == Nonterminal)
651 err = "This non-terminal has already be used.";
652 else if (head->type == Virtual)
653 err = "Virtual symbol not permitted in head of production";
655 head->type = Nonterminal;
656 head->struct_name = g->current_type;
657 head->isref = g->type_isref;
658 if (g->production_count == 0) {
659 ## create production zero
661 head->first_production = g->production_count;
662 tk = token_next(state);
663 if (tk.num == TK_mark &&
664 text_is(tk.txt, "->"))
665 err = parse_production(g, head, state);
667 err = "'->' missing in production";
669 } else if (tk.num == TK_mark
670 && text_is(tk.txt, "|")) {
671 // another production for same non-term
673 err = parse_production(g, head, state);
675 err = "First production must have a head";
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "$")) {
678 err = dollar_line(state, g, 0);
679 } else if (tk.num == TK_mark
680 && text_is(tk.txt, "$*")) {
681 err = dollar_line(state, g, 1);
682 } else if (tk.num == TK_mark
683 && text_is(tk.txt, "//")) {
684 while (tk.num != TK_newline &&
686 tk = token_next(state);
688 err = "Unrecognised token at start of line.";
694 if (g->terminals_declared) {
697 for (s = g->syms; s; s = s->next) {
698 if (s->type != Unknown)
701 fprintf(stderr, "Token %.*s not declared\n",
702 s->name.len, s->name.txt);
705 free(g); // FIXME free content
711 fprintf(stderr, "Error at line %d: %s\n",
714 free(g); // FIXME free content
718 ## Analysing the grammar
720 The central task in analysing the grammar is to determine a set of
721 states to drive the parsing process. This will require finding various
722 sets of symbols and of "items". Some of these sets will need to attach
723 information to each element in the set, so they are more like maps.
725 Each "item" may have a 'look-ahead' or `LA` set associated with it.
726 Many of these will be the same as each other. To avoid memory wastage,
727 and to simplify some comparisons of sets, these sets will be stored in a
728 list of unique sets, each assigned a number.
730 Once we have the data structures in hand to manage these sets and lists,
731 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
732 then create the item sets which define the various states.
736 Though we don't only store symbols in these sets, they are the main
737 things we store, so they are called symbol sets or "symsets".
739 A symset has a size, an array of shorts, and an optional array of data
740 values, which are also shorts. If the array of data is not present, we
741 store an impossible pointer, as `NULL` is used to indicate that no
742 memory has been allocated yet.
747 unsigned short *syms, *data;
749 #define NO_DATA ((unsigned short *)1)
750 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
751 const struct symset INIT_DATASET = { 0, NULL, NULL };
753 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
754 using an insertion sort. We don't explicitly record the amount of
755 allocated space as it can be derived directly from the current `cnt`
756 using `((cnt - 1) | 7) + 1`.
759 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
762 int current = ((s->cnt-1) | 7) + 1;
763 if (current == s->cnt) {
765 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
766 if (s->data != NO_DATA)
767 s->data = realloc(s->data, sizeof(*s->data) * current);
770 while (i > 0 && s->syms[i-1] > key) {
771 s->syms[i] = s->syms[i-1];
772 if (s->data != NO_DATA)
773 s->data[i] = s->data[i-1];
777 if (s->data != NO_DATA)
782 Finding a symbol (or item) in a `symset` uses a simple binary search.
783 We return the index where the value was found (so data can be accessed),
784 or `-1` to indicate failure.
786 static int symset_find(struct symset *ss, unsigned short key)
793 while (lo + 1 < hi) {
794 int mid = (lo + hi) / 2;
795 if (ss->syms[mid] <= key)
800 if (ss->syms[lo] == key)
805 We will often want to form the union of two symsets. When we do, we
806 will often want to know if anything changed (as that might mean there is
807 more work to do). So `symset_union` reports whether anything was added
808 to the first set. We use a slow quadratic approach as these sets are
809 rarely large. If profiling shows this to be a problem it can be
812 static int symset_union(struct symset *a, struct symset *b)
816 for (i = 0; i < b->cnt; i++)
817 if (symset_find(a, b->syms[i]) < 0) {
818 unsigned short data = 0;
819 if (b->data != NO_DATA)
821 symset_add(a, b->syms[i], data);
827 And of course we must be able to free a symset.
829 static void symset_free(struct symset ss)
832 if (ss.data != NO_DATA)
838 Some symsets are simply stored somewhere appropriate in the `grammar`
839 data structure, others need a bit of indirection. This applies
840 particularly to "LA" sets. These will be paired with "items" in an
841 "itemset". As itemsets will be stored in a symset, the "LA" set needs
842 to be stored in the `data` field, so we need a mapping from a small
843 (short) number to an LA `symset`.
845 As mentioned earlier this involves creating a list of unique symsets.
847 For now, we just use a linear list sorted by insertion. A skiplist
848 would be more efficient and may be added later.
855 struct setlist *next;
858 ###### grammar fields
859 struct setlist *sets;
864 static int ss_cmp(struct symset a, struct symset b)
867 int diff = a.cnt - b.cnt;
870 for (i = 0; i < a.cnt; i++) {
871 diff = (int)a.syms[i] - (int)b.syms[i];
878 static int save_set(struct grammar *g, struct symset ss)
880 struct setlist **sl = &g->sets;
884 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
891 s = malloc(sizeof(*s));
900 Finding a set by number is currently performed by a simple linear
901 search. If this turns out to hurt performance, we can store the sets in
902 a dynamic array like the productions.
904 static struct symset set_find(struct grammar *g, int num)
906 struct setlist *sl = g->sets;
907 while (sl && sl->num != num)
912 ### Setting `nullable`
914 We set `nullable` on the head symbol for any production for which all
915 the body symbols (if any) are nullable. As this is a recursive
916 definition, any change in the `nullable` setting means that we need to
917 re-evaluate where it needs to be set.
919 We simply loop around performing the same calculations until no more
926 static void set_nullable(struct grammar *g)
929 while (check_again) {
932 for (p = 0; p < g->production_count; p++) {
933 struct production *pr = g->productions[p];
936 if (pr->head->nullable)
938 for (s = 0; s < pr->body_size; s++)
939 if (! pr->body[s]->nullable)
941 if (s == pr->body_size) {
942 pr->head->nullable = 1;
949 ### Building the `first` sets
951 When calculating what can follow a particular non-terminal, we will need
952 to know what the "first" terminal in any subsequent non-terminal might
953 be. So we calculate the `first` set for every non-terminal and store
954 them in an array. We don't bother recording the "first" set for
955 terminals as they are trivial.
957 As the "first" for one symbol might depend on the "first" of another, we
958 repeat the calculations until no changes happen, just like with
959 `set_nullable`. This makes use of the fact that `symset_union` reports
960 if any change happens.
962 The core of this, which finds the "first" of part of a production body,
963 will be reused for computing the "follow" sets, so we split that out
964 into a separate function, `add_first`, which also reports if it got all
965 the way to the end of the production without finding a non-nullable
968 ###### grammar fields
969 struct symset *first;
973 static int add_first(struct production *pr, int start,
974 struct symset *target, struct grammar *g,
979 for (s = start; s < pr->body_size; s++) {
980 struct symbol *bs = pr->body[s];
981 if (bs->type == Terminal) {
982 if (symset_find(target, bs->num) < 0) {
983 symset_add(target, bs->num, 0);
987 } else if (symset_union(target, &g->first[bs->num]))
993 *to_end = (s == pr->body_size);
997 static void build_first(struct grammar *g)
1001 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1002 for (s = 0; s < g->num_syms; s++)
1003 g->first[s] = INIT_SYMSET;
1005 while (check_again) {
1008 for (p = 0; p < g->production_count; p++) {
1009 struct production *pr = g->productions[p];
1010 struct symset *head = &g->first[pr->head->num];
1012 if (add_first(pr, 0, head, g, NULL))
1018 ### Building the `follow` sets.
1020 There are two different situations when we will want to generate
1021 "follow" sets. If we are doing an SLR analysis, we want to generate a
1022 single "follow" set for each non-terminal in the grammar. That is what
1023 is happening here. If we are doing an LALR or LR analysis we will want
1024 to generate a separate "LA" set for each item. We do that later in
1027 There are two parts to generating a "follow" set. Firstly we look at
1028 every place that any non-terminal appears in the body of any production,
1029 and we find the set of possible "first" symbols after there. This is
1030 done using `add_first` above and only needs to be done once as the
1031 "first" sets are now stable and will not change.
1033 ###### grammar fields
1034 struct symset *follow;
1038 for (p = 0; p < g->production_count; p++) {
1039 struct production *pr = g->productions[p];
1042 for (b = 0; b < pr->body_size - 1; b++) {
1043 struct symbol *bs = pr->body[b];
1044 if (bs->type == Terminal)
1046 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1050 The second part is to add the "follow" set of the head of a production
1051 to the "follow" sets of the final symbol in the production, and any
1052 other symbol which is followed only by `nullable` symbols. As this
1053 depends on "follow" itself we need to repeatedly perform the process
1054 until no further changes happen.
1056 Rather than 2 nested loops, one that repeats the whole process until
1057 there is no change, and one that iterates over the set of productions, we
1058 combine these two functions into a single loop.
1062 for (check_again = 0, p = 0;
1063 p < g->production_count;
1064 p < g->production_count-1
1065 ? p++ : check_again ? (check_again = 0, p = 0)
1067 struct production *pr = g->productions[p];
1070 for (b = pr->body_size - 1; b >= 0; b--) {
1071 struct symbol *bs = pr->body[b];
1072 if (bs->type == Terminal)
1074 if (symset_union(&g->follow[bs->num],
1075 &g->follow[pr->head->num]))
1082 We now just need to create and initialise the `follow` list to get a
1086 static void build_follow(struct grammar *g)
1088 int s, check_again, p;
1089 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1090 for (s = 0; s < g->num_syms; s++)
1091 g->follow[s] = INIT_SYMSET;
1095 ### Building itemsets and states
1097 There are three different levels of detail that can be chosen for
1098 building the itemsets and states for the LR grammar. They are:
1100 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1101 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1103 2. LALR(1) where we build look-ahead sets with each item and merge
1104 the LA sets when we find two paths to the same "kernel" set of items.
1105 3. LR(1) where different look-ahead for any item in the set means
1106 a different item set must be created.
1108 ###### forward declarations
1109 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1111 We need to be able to look through existing states to see if a newly
1112 generated state already exists. For now we use a simple sorted linked
1115 An item is a pair of numbers: the production number and the position of
1116 "DOT", which is an index into the body of the production. As the
1117 numbers are not enormous we can combine them into a single "short" and
1118 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1119 production number (so 2000 productions with maximum of 31 symbols in the
1122 Comparing the itemsets will be a little different to comparing symsets
1123 as we want to do the lookup after generating the "kernel" of an
1124 itemset, so we need to ignore the offset=zero items which are added during
1127 To facilitate this, we modify the "DOT" number so that "0" sorts to
1128 the end of the list in the symset, and then only compare items before
1132 static inline unsigned short item_num(int production, int dot)
1134 return production | ((31-dot) << 11);
1136 static inline int item_prod(unsigned short item)
1138 return item & 0x7ff;
1140 static inline int item_dot(unsigned short item)
1142 return (31-(item >> 11)) & 0x1f;
1145 For LR(1) analysis we need to compare not just the itemset in a state
1146 but also the LA sets. As we assign each unique LA set a number, we
1147 can just compare the symset and the data values together.
1150 static int itemset_cmp(struct symset a, struct symset b,
1151 enum grammar_type type)
1157 i < a.cnt && i < b.cnt &&
1158 item_dot(a.syms[i]) > 0 &&
1159 item_dot(b.syms[i]) > 0;
1161 int diff = a.syms[i] - b.syms[i];
1165 diff = a.data[i] - b.data[i];
1170 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1174 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1180 if (type < LR1 || av == -1)
1183 a.data[i] - b.data[i];
1186 It will be helpful to know if an itemset has been "completed" or not,
1187 particularly for LALR where itemsets get merged, at which point they
1188 need to be considered for completion again. So a `completed` flag is
1191 And now we can build the list of itemsets. The lookup routine returns
1192 both a success flag and a pointer to where in the list an insert should
1193 happen, so we don't need to search a second time.
1197 struct itemset *next;
1199 struct symset items;
1200 struct symset go_to;
1202 unsigned short precedence;
1206 ###### grammar fields
1207 struct itemset *items;
1211 static int itemset_find(struct grammar *g, struct itemset ***where,
1212 struct symset kernel, enum grammar_type type)
1214 struct itemset **ip;
1216 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1217 struct itemset *i = *ip;
1219 diff = itemset_cmp(i->items, kernel, type);
1232 Adding an itemset may require merging the LA sets if LALR analysis is
1233 happening. If any new LA set adds any symbols that weren't in the old
1234 LA set, we clear the `completed` flag so that the dependants of this
1235 itemset will be recalculated and their LA sets updated.
1237 `add_itemset` must consume the symsets it is passed, either by adding
1238 them to a data structure, or freeing them.
1240 static int add_itemset(struct grammar *g, struct symset ss,
1241 enum assoc assoc, unsigned short precedence,
1242 enum grammar_type type)
1244 struct itemset **where, *is;
1246 int found = itemset_find(g, &where, ss, type);
1248 is = calloc(1, sizeof(*is));
1249 is->state = g->states;
1253 is->precedence = precedence;
1255 is->go_to = INIT_DATASET;
1264 for (i = 0; i < ss.cnt; i++) {
1265 struct symset temp = INIT_SYMSET, s;
1266 if (ss.data[i] == is->items.data[i])
1268 s = set_find(g, is->items.data[i]);
1269 symset_union(&temp, &s);
1270 s = set_find(g, ss.data[i]);
1271 if (symset_union(&temp, &s)) {
1272 is->items.data[i] = save_set(g, temp);
1283 To build all the itemsets, we first insert the initial itemset made from
1284 production zero, complete each itemset, and then generate new itemsets
1285 from old until no new ones can be made.
1287 Completing an itemset means finding all the items where "DOT" is
1288 followed by a nonterminal and adding "DOT=0" items for every production
1289 from that non-terminal - providing each item hasn't already been added.
1290 When we add such an item it might get inserted before the current
1291 one, so to make sure we process it we reset the loop counter to the
1294 If LA sets are needed, the LA set for each new item is found using
1295 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1296 may be supplemented by the LA set for the item which produced the new
1299 We also collect a set of all symbols which follow "DOT" (in `done`) as
1300 this is used in the next stage.
1302 When itemsets are created we assign a precedence to the itemset from the
1303 complete item, if there is one. We ignore the possibility of there
1304 being two and don't (currently) handle precedence in such grammars.
1305 When completing a grammar we ignore any item where DOT is followed by a
1306 terminal with a precedence lower than that for the itemset. Unless the
1307 terminal has right associativity, we also ignore items where the
1308 terminal has the same precedence. The result is that unwanted items are
1309 still in the itemset, but the terminal doesn't get into the go to set,
1310 so the item is ineffective.
1312 ###### complete itemset
1313 for (i = 0; i < is->items.cnt; i++) {
1314 int p = item_prod(is->items.syms[i]);
1315 int bs = item_dot(is->items.syms[i]);
1316 struct production *pr = g->productions[p];
1319 struct symset LA = INIT_SYMSET;
1320 unsigned short sn = 0;
1322 if (bs == pr->body_size)
1325 if (s->precedence && is->precedence &&
1326 is->precedence > s->precedence)
1327 /* This terminal has a low precedence and
1328 * shouldn't be shifted
1331 if (s->precedence && is->precedence &&
1332 is->precedence == s->precedence && s->assoc != Right)
1333 /* This terminal has a matching precedence and is
1334 * not Right-associative, so we mustn't shift it.
1337 if (symset_find(&done, s->num) < 0)
1338 symset_add(&done, s->num, 0);
1340 if (s->type != Nonterminal)
1346 add_first(pr, bs+1, &LA, g, &to_end);
1348 struct symset ss = set_find(g, is->items.data[i]);
1349 symset_union(&LA, &ss);
1351 sn = save_set(g, LA);
1352 LA = set_find(g, sn);
1355 /* Add productions for this symbol */
1356 for (p2 = s->first_production;
1357 p2 < g->production_count &&
1358 g->productions[p2]->head == s;
1360 int itm = item_num(p2, 0);
1361 int pos = symset_find(&is->items, itm);
1363 symset_add(&is->items, itm, sn);
1364 /* Will have re-ordered, so start
1365 * from beginning again */
1367 } else if (type >= LALR) {
1368 struct symset ss = set_find(g, is->items.data[pos]);
1369 struct symset tmp = INIT_SYMSET;
1370 struct symset *la = &LA;
1372 symset_union(&tmp, &ss);
1373 if (symset_union(&tmp, la)) {
1374 is->items.data[pos] = save_set(g, tmp);
1382 For each symbol we found (and placed in `done`) we collect all the items
1383 for which this symbol is next, and create a new itemset with "DOT"
1384 advanced over the symbol. This is then added to the collection of
1385 itemsets (or merged with a pre-existing itemset).
1387 ###### derive itemsets
1388 // Now we have a completed itemset, so we need to
1389 // compute all the 'next' itemsets and create them
1390 // if they don't exist.
1391 for (i = 0; i < done.cnt; i++) {
1393 unsigned short state;
1394 struct symbol *sym = g->symtab[done.syms[i]];
1395 enum assoc assoc = Non;
1396 unsigned short precedence = 0;
1397 struct symset newitemset = INIT_SYMSET;
1399 newitemset = INIT_DATASET;
1401 for (j = 0; j < is->items.cnt; j++) {
1402 int itm = is->items.syms[j];
1403 int p = item_prod(itm);
1404 int bp = item_dot(itm);
1405 struct production *pr = g->productions[p];
1406 unsigned short la = 0;
1409 if (bp == pr->body_size)
1411 if (pr->body[bp] != sym)
1416 la = is->items.data[j];
1417 if (bp == pr->body_size &&
1418 pr->precedence > 0 &&
1419 pr->precedence > precedence) {
1420 // new itemset is reducible and has a precedence.
1421 precedence = pr->precedence;
1424 pos = symset_find(&newitemset, item_num(p, bp));
1427 symset_add(&newitemset, item_num(p, bp), la);
1428 else if (type >= LALR) {
1429 // Need to merge la set.
1430 int la2 = newitemset.data[pos];
1432 struct symset ss = set_find(g, la2);
1433 struct symset LA = INIT_SYMSET;
1434 symset_union(&LA, &ss);
1435 ss = set_find(g, la);
1436 if (symset_union(&LA, &ss))
1437 newitemset.data[pos] = save_set(g, LA);
1443 state = add_itemset(g, newitemset, assoc, precedence, type);
1444 if (symset_find(&is->go_to, done.syms[i]) < 0)
1445 symset_add(&is->go_to, done.syms[i], state);
1448 All that is left is to create the initial itemset from production zero, and
1449 with `TK_eof` as the LA set.
1452 static void build_itemsets(struct grammar *g, enum grammar_type type)
1454 struct symset first = INIT_SYMSET;
1457 unsigned short la = 0;
1459 // LA set just has eof
1460 struct symset eof = INIT_SYMSET;
1461 symset_add(&eof, TK_eof, 0);
1462 la = save_set(g, eof);
1463 first = INIT_DATASET;
1465 // production 0, offset 0 (with no data)
1466 symset_add(&first, item_num(0, 0), la);
1467 add_itemset(g, first, Non, 0, type);
1468 for (check_again = 0, is = g->items;
1470 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1472 struct symset done = INIT_SYMSET;
1483 ### Completing the analysis.
1485 The exact process of analysis depends on which level was requested. For
1486 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1487 `LR1` we need first, but not follow. For `SLR` we need both.
1489 We don't build the "action" tables that you might expect as the parser
1490 doesn't use them. They will be calculated to some extent if needed for
1493 Once we have built everything we allocate arrays for the two lists:
1494 symbols and itemsets. This allows more efficient access during
1495 reporting. The symbols are grouped as terminals, then non-terminals,
1496 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1497 Special terminals -- meaning just EOL -- are included with the
1498 non-terminals so that they are not expected by the scanner.
1500 ###### grammar fields
1501 struct symbol **symtab;
1502 struct itemset **statetab;
1505 ###### grammar_analyse
1507 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1511 int snum = TK_reserved;
1512 for (s = g->syms; s; s = s->next)
1513 if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1517 g->first_nonterm = snum;
1518 for (s = g->syms; s; s = s->next)
1519 if (s->num < 0 && s->type != Virtual) {
1523 for (s = g->syms; s; s = s->next)
1529 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1530 for (s = g->syms; s; s = s->next)
1531 g->symtab[s->num] = s;
1540 build_itemsets(g, type);
1542 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1543 for (is = g->items; is ; is = is->next)
1544 g->statetab[is->state] = is;
1547 ## Reporting on the Grammar
1549 The purpose of the report is to give the grammar developer insight into
1550 how the grammar parser will work. It is basically a structured dump of
1551 all the tables that have been generated, plus a description of any conflicts.
1553 ###### grammar_report
1554 static int grammar_report(struct grammar *g, enum grammar_type type)
1560 return report_conflicts(g, type);
1563 Firstly we have the complete list of symbols, together with the
1564 "FIRST" set if that was generated. We add a mark to each symbol to
1565 show if it is nullable (`.`).
1569 static void report_symbols(struct grammar *g)
1573 printf("SYMBOLS + FIRST:\n");
1575 printf("SYMBOLS:\n");
1577 for (n = 0; n < g->num_syms; n++) {
1578 struct symbol *s = g->symtab[n];
1582 printf(" %c%3d%c: ",
1583 s->nullable ? '.':' ',
1584 s->num, symtypes[s->type]);
1587 printf(" (%d%s)", s->precedence,
1588 assoc_names[s->assoc]);
1590 if (g->first && s->type == Nonterminal) {
1593 for (j = 0; j < g->first[n].cnt; j++) {
1596 prtxt(g->symtab[g->first[n].syms[j]]->name);
1603 Then we have the follow sets if they were computed.
1605 static void report_follow(struct grammar *g)
1608 printf("FOLLOW:\n");
1609 for (n = 0; n < g->num_syms; n++)
1610 if (g->follow[n].cnt) {
1614 prtxt(g->symtab[n]->name);
1615 for (j = 0; j < g->follow[n].cnt; j++) {
1618 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1624 And finally the item sets. These include the GO TO tables and, for
1625 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1626 it up a bit. First the items, with production number and associativity.
1628 static void report_item(struct grammar *g, int itm)
1630 int p = item_prod(itm);
1631 int dot = item_dot(itm);
1632 struct production *pr = g->productions[p];
1636 prtxt(pr->head->name);
1638 for (i = 0; i < pr->body_size; i++) {
1639 printf(" %s", (dot == i ? ". ": ""));
1640 prtxt(pr->body[i]->name);
1642 if (dot == pr->body_size)
1645 if (pr->precedence && dot == pr->body_size)
1646 printf(" (%d%s)", pr->precedence,
1647 assoc_names[pr->assoc]);
1648 if (dot < pr->body_size &&
1649 pr->body[dot]->precedence) {
1650 struct symbol *s = pr->body[dot];
1651 printf(" [%d%s]", s->precedence,
1652 assoc_names[s->assoc]);
1657 The LA sets which are (possibly) reported with each item:
1659 static void report_la(struct grammar *g, int lanum)
1661 struct symset la = set_find(g, lanum);
1665 printf(" LOOK AHEAD(%d)", lanum);
1666 for (i = 0; i < la.cnt; i++) {
1669 prtxt(g->symtab[la.syms[i]]->name);
1674 Then the go to sets:
1676 static void report_goto(struct grammar *g, struct symset gt)
1681 for (i = 0; i < gt.cnt; i++) {
1683 prtxt(g->symtab[gt.syms[i]]->name);
1684 printf(" -> %d\n", gt.data[i]);
1688 Now we can report all the item sets complete with items, LA sets, and GO TO.
1690 static void report_itemsets(struct grammar *g)
1693 printf("ITEM SETS(%d)\n", g->states);
1694 for (s = 0; s < g->states; s++) {
1696 struct itemset *is = g->statetab[s];
1697 printf(" Itemset %d:", s);
1699 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1701 for (j = 0; j < is->items.cnt; j++) {
1702 report_item(g, is->items.syms[j]);
1703 if (is->items.data != NO_DATA)
1704 report_la(g, is->items.data[j]);
1706 report_goto(g, is->go_to);
1710 ### Reporting conflicts
1712 Conflict detection varies a lot among different analysis levels. However
1713 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1714 LR1 are also similar as they have FOLLOW or LA sets.
1718 ## conflict functions
1720 static int report_conflicts(struct grammar *g, enum grammar_type type)
1723 printf("Conflicts:\n");
1725 cnt = conflicts_lr0(g, type);
1727 cnt = conflicts_slr(g, type);
1729 printf(" - no conflicts\n");
1733 LR0 conflicts are any state which has both a reducible item and
1734 a shiftable item, or two reducible items.
1736 LR05 conflicts only occur if two possible reductions exist,
1737 as shifts always over-ride reductions.
1739 ###### conflict functions
1740 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1744 for (i = 0; i < g->states; i++) {
1745 struct itemset *is = g->statetab[i];
1746 int last_reduce = -1;
1747 int prev_reduce = -1;
1748 int last_shift = -1;
1752 for (j = 0; j < is->items.cnt; j++) {
1753 int itm = is->items.syms[j];
1754 int p = item_prod(itm);
1755 int bp = item_dot(itm);
1756 struct production *pr = g->productions[p];
1758 if (bp == pr->body_size) {
1759 prev_reduce = last_reduce;
1763 if (pr->body[bp]->type == Terminal)
1766 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1767 printf(" State %d has both SHIFT and REDUCE:\n", i);
1768 report_item(g, is->items.syms[last_shift]);
1769 report_item(g, is->items.syms[last_reduce]);
1772 if (prev_reduce >= 0) {
1773 printf(" State %d has 2 (or more) reducible items\n", i);
1774 report_item(g, is->items.syms[prev_reduce]);
1775 report_item(g, is->items.syms[last_reduce]);
1782 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1783 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1784 in the source of the look ahead set.
1786 We build two datasets to reflect the "action" table: one which maps
1787 terminals to items where that terminal could be shifted and another
1788 which maps terminals to items that could be reduced when the terminal
1789 is in look-ahead. We report when we get conflicts between the two.
1791 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1796 for (i = 0; i < g->states; i++) {
1797 struct itemset *is = g->statetab[i];
1798 struct symset shifts = INIT_DATASET;
1799 struct symset reduce = INIT_DATASET;
1803 /* First collect the shifts */
1804 for (j = 0; j < is->items.cnt; j++) {
1805 unsigned short itm = is->items.syms[j];
1806 int p = item_prod(itm);
1807 int bp = item_dot(itm);
1808 struct production *pr = g->productions[p];
1811 if (bp >= pr->body_size ||
1812 pr->body[bp]->type != Terminal)
1817 if (s->precedence && is->precedence)
1818 /* Precedence resolves this, so no conflict */
1821 if (symset_find(&shifts, s->num) < 0)
1822 symset_add(&shifts, s->num, itm);
1824 /* Now look for reductions and conflicts */
1825 for (j = 0; j < is->items.cnt; j++) {
1826 unsigned short itm = is->items.syms[j];
1827 int p = item_prod(itm);
1828 int bp = item_dot(itm);
1829 struct production *pr = g->productions[p];
1831 if (bp < pr->body_size)
1836 la = g->follow[pr->head->num];
1838 la = set_find(g, is->items.data[j]);
1840 for (k = 0; k < la.cnt; k++) {
1841 int pos = symset_find(&shifts, la.syms[k]);
1843 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1845 prtxt(g->symtab[la.syms[k]]->name);
1847 report_item(g, shifts.data[pos]);
1848 report_item(g, itm);
1850 pos = symset_find(&reduce, la.syms[k]);
1852 symset_add(&reduce, la.syms[k], itm);
1855 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1856 prtxt(g->symtab[la.syms[k]]->name);
1858 report_item(g, itm);
1859 report_item(g, reduce.data[pos]);
1863 symset_free(shifts);
1864 symset_free(reduce);
1870 ## Generating the parser
1872 The exported part of the parser is the `parse_XX` function, where the name
1873 `XX` is based on the name of the parser files.
1875 This takes a `code_node`, a partially initialized `token_config`, and an
1876 optional `FILE` to send tracing to. The `token_config` gets the list of
1877 known words added and then is used with the `code_node` to initialize the
1880 `parse_XX` then calls the library function `parser_run` to actually complete
1881 the parse. This needs the `states` table and functions to call the various
1882 pieces of code provided in the grammar file, so they are generated first.
1884 ###### parser_generate
1886 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1887 struct code_node *pre_reduce)
1893 gen_reduce(f, g, file, pre_reduce);
1896 fprintf(f, "#line 0 \"gen_parser\"\n");
1897 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1900 fprintf(f, "\tstruct token_state *tokens;\n");
1901 fprintf(f, "\tconfig->words_marks = known;\n");
1902 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1903 fprintf(f, "\ttokens = token_open(code, config);\n");
1904 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1905 fprintf(f, "\ttoken_close(tokens);\n");
1906 fprintf(f, "\treturn rv;\n");
1907 fprintf(f, "}\n\n");
1910 ### Known words table
1912 The known words table is simply an array of terminal symbols.
1913 The table of nonterminals used for tracing is a similar array.
1917 static void gen_known(FILE *f, struct grammar *g)
1920 fprintf(f, "#line 0 \"gen_known\"\n");
1921 fprintf(f, "static const char *known[] = {\n");
1922 for (i = TK_reserved;
1923 i < g->num_syms && g->symtab[i]->type == Terminal;
1925 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1926 g->symtab[i]->name.txt);
1927 fprintf(f, "};\n\n");
1930 static void gen_non_term(FILE *f, struct grammar *g)
1933 fprintf(f, "#line 0 \"gen_non_term\"\n");
1934 fprintf(f, "static const char *non_term[] = {\n");
1935 for (i = TK_reserved;
1938 if (g->symtab[i]->type == Nonterminal ||
1939 g->symtab[i]->isspecial)
1940 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1941 g->symtab[i]->name.txt);
1942 fprintf(f, "};\n\n");
1945 ### States and the goto tables.
1947 For each state we record the goto table and details of the reducible
1948 production if there is one.
1949 Some of the details of the reducible production are stored in the
1950 `do_reduce` function to come later. Here we store the production
1951 number, the body size (useful for stack management), and the resulting
1952 symbol (useful for knowing how to free data later).
1954 The go to table is stored in a simple array of `sym` and corresponding
1957 ###### exported types
1965 const struct lookup * go_to;
1974 static void gen_goto(FILE *f, struct grammar *g)
1977 fprintf(f, "#line 0 \"gen_goto\"\n");
1978 for (i = 0; i < g->states; i++) {
1979 struct symset gt = g->statetab[i]->go_to;
1984 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1986 for (j = 0; j < gt.cnt; j++)
1987 fprintf(f, "\t{ %d, %d },\n",
1988 gt.syms[j], gt.data[j]);
1993 static void gen_states(FILE *f, struct grammar *g)
1996 fprintf(f, "#line 0 \"gen_states\"\n");
1997 fprintf(f, "static const struct state states[] = {\n");
1998 for (i = 0; i < g->states; i++) {
1999 struct itemset *is = g->statetab[i];
2000 int j, prod = -1, prod_len;
2002 for (j = 0; j < is->items.cnt; j++) {
2003 int itm = is->items.syms[j];
2004 int p = item_prod(itm);
2005 int bp = item_dot(itm);
2006 struct production *pr = g->productions[p];
2008 if (bp < pr->body_size)
2010 /* This is what we reduce - choose longest */
2011 if (prod < 0 || prod_len < pr->body_size) {
2013 prod_len = pr->body_size;
2017 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2018 i, is->go_to.cnt, i);
2020 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2022 struct production *pr = g->productions[prod];
2023 struct symbol *hd = pr->head;
2024 fprintf(f, "%d, %d, %d, ",
2025 prod, pr->body_size, pr->head->num);
2026 if (hd->struct_name.txt == NULL)
2027 fprintf(f, "0 },\n");
2029 fprintf(f, "sizeof(struct %.*s%s) },\n",
2030 hd->struct_name.len,
2031 hd->struct_name.txt,
2032 hd->isref ? "*" : "");
2034 fprintf(f, "-1, -1, -1, -1 },\n");
2036 fprintf(f, "};\n\n");
2039 ### The `do_reduce` function and the code
2041 When the parser engine decides to reduce a production, it calls
2042 `do_reduce` which runs the code that was included with the production,
2045 This code needs to be able to store data somewhere. Rather than
2046 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2047 buffer and have `do_reduce` return the size to be saved.
2049 In order for the code to access "global" context, we pass in the
2050 "config" pointer that was passed to the parser function. If the `struct
2051 token_config` is embedded in some larger structure, the reducing code
2052 can access the larger structure using pointer manipulation. Performing
2053 that pointer manipulation, and any other common code or variables for
2054 all reduce actions, can be provided in code node called "reduce" which
2055 is passed around in `pre_reduce` which you might have already noticed.
2057 The code fragments require translation when written out. Any `$N` needs
2058 to be converted to a reference either to that buffer (if `$0`) or to the
2059 structure returned by a previous reduction. These pointers need to be
2060 cast to the appropriate type for each access. All this is handled in
2063 `gen_code` also allows symbol references to contain a '`<`' as in
2064 '`$<2`'. This is particularly useful for references (or pointers), but
2065 can be used with structures too. The `<` implies that the value is
2066 being moved out, so the object will not be automatically freed. It is
2067 equivalent to assigning `NULL` to the pointer or filling a structure
2070 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2071 and possibly a number. A number by itself (other than zero) selects a
2072 symbol from the body of the production. A sequence of letters selects
2073 the shortest symbol in the body which contains those letters in the
2074 given order. If a number follows the letters, then a later occurrence
2075 of that symbol is chosen. So "`$AB2`" will refer to the structure
2076 attached to the second occurrence of the shortest symbol which contains
2077 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2078 the number given is too large, then the symbol reference is not
2079 transformed, and will cause an error when the code is compiled.
2083 static int textchr(struct text t, char c, int s)
2086 for (i = s; i < t.len; i++)
2092 static int subseq_match(char *seq, int slen, struct text name)
2096 st = textchr(name, *seq, st);
2106 static int choose_sym(char **namep, int len, struct production *p)
2108 char *name = *namep;
2117 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2123 while (len > 0 && (c >= '0' && c <= '9')) {
2126 n = n * 10 + (c - '0');
2130 if (name == *namep || n > p->body_size)
2136 for (i = 0; i < p->body_size; i++) {
2137 if (!subseq_match(nam, namlen, p->body[i]->name))
2139 if (slen == 0 || p->body[i]->name.len < slen) {
2141 slen = p->body[i]->name.len;
2143 if (s >= 0 && p->body[i] != p->body[s] &&
2144 p->body[i]->name.len == p->body[s]->name.len)
2145 /* not unique, so s cannot be used */
2152 for (i = 0; i < p->body_size; i++)
2153 if (p->body[i] == p->body[s]) {
2158 if (n > 0 || i == p->body_size)
2164 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2167 char *used = calloc(1, p->body_size);
2170 fprintf(f, "\t\t\t");
2171 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2185 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2194 fprintf(f, "(*(struct %.*s*%s)ret)",
2195 p->head->struct_name.len,
2196 p->head->struct_name.txt,
2197 p->head->isref ? "*":"");
2198 else if (p->body[n-1]->type == Terminal)
2199 fprintf(f, "(*(struct token *)body[%d])",
2201 else if (p->body[n-1]->struct_name.txt == NULL)
2202 fprintf(f, "$%d", n);
2204 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2205 p->body[n-1]->struct_name.len,
2206 p->body[n-1]->struct_name.txt,
2207 p->body[n-1]->isref ? "*":"", n-1);
2213 for (i = 0; i < p->body_size; i++) {
2214 if (p->body[i]->struct_name.txt &&
2216 // assume this has been copied out
2217 if (p->body[i]->isref)
2218 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2220 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2228 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2229 struct code_node *pre_reduce)
2232 fprintf(f, "#line 1 \"gen_reduce\"\n");
2233 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2235 fprintf(f, "\tint ret_size = 0;\n");
2237 code_node_print(f, pre_reduce, file);
2239 fprintf(f, "#line 4 \"gen_reduce\"\n");
2240 fprintf(f, "\tswitch(prod) {\n");
2241 for (i = 0; i < g->production_count; i++) {
2242 struct production *p = g->productions[i];
2243 fprintf(f, "\tcase %d:\n", i);
2246 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2250 if (p->head->struct_name.txt)
2251 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2252 p->head->struct_name.len,
2253 p->head->struct_name.txt,
2254 p->head->isref ? "*":"");
2256 fprintf(f, "\t\tbreak;\n");
2258 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2263 As each non-terminal can potentially cause a different type of data
2264 structure to be allocated and filled in, we need to be able to free it when
2267 It is particularly important to have fine control over freeing during error
2268 recovery where individual stack frames might need to be freed.
2270 For this, the grammar author is required to defined a `free_XX` function for
2271 each structure that is used by a non-terminal. `do_free` will call whichever
2272 is appropriate given a symbol number, and will call `free` (as is
2273 appropriate for tokens) on any terminal symbol.
2277 static void gen_free(FILE *f, struct grammar *g)
2281 fprintf(f, "#line 0 \"gen_free\"\n");
2282 fprintf(f, "static void do_free(short sym, void *asn)\n");
2284 fprintf(f, "\tif (!asn) return;\n");
2285 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2286 /* Need to handle special terminals too */
2287 for (i = 0; i < g->num_syms; i++) {
2288 struct symbol *s = g->symtab[i];
2289 if (i >= g->first_nonterm && s->type == Terminal &&
2291 fprintf(f, " || sym == %d", s->num);
2293 fprintf(f, ") {\n");
2294 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2295 fprintf(f, "\tswitch(sym) {\n");
2297 for (i = 0; i < g->num_syms; i++) {
2298 struct symbol *s = g->symtab[i];
2300 s->type != Nonterminal ||
2301 s->struct_name.txt == NULL)
2304 fprintf(f, "\tcase %d:\n", s->num);
2306 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2308 s->struct_name.txt);
2309 fprintf(f, "\t\tfree(asn);\n");
2311 fprintf(f, "\t\tfree_%.*s(asn);\n",
2313 s->struct_name.txt);
2314 fprintf(f, "\t\tbreak;\n");
2316 fprintf(f, "\t}\n}\n\n");
2319 ## The main routine.
2321 There are three key parts to the "main" function of parsergen: processing
2322 the arguments, loading the grammar file, and dealing with the grammar.
2324 ### Argument processing.
2326 Command line options allow the selection of analysis mode, name of output
2327 file, and whether or not a report should be generated. By default we create
2328 a report only if no code output was requested.
2330 The `parse_XX` function name uses the basename of the output file which
2331 should not have a suffix (`.c`). `.c` is added to the given name for the
2332 code, and `.h` is added for the header (if header text is specified in the
2339 static const struct option long_options[] = {
2340 { "LR0", 0, NULL, '0' },
2341 { "LR05", 0, NULL, '5' },
2342 { "SLR", 0, NULL, 'S' },
2343 { "LALR", 0, NULL, 'L' },
2344 { "LR1", 0, NULL, '1' },
2345 { "tag", 1, NULL, 't' },
2346 { "report", 0, NULL, 'R' },
2347 { "output", 1, NULL, 'o' },
2348 { NULL, 0, NULL, 0 }
2350 const char *options = "05SL1t:Ro:";
2352 ###### process arguments
2354 char *outfile = NULL;
2359 enum grammar_type type = LR05;
2360 while ((opt = getopt_long(argc, argv, options,
2361 long_options, NULL)) != -1) {
2376 outfile = optarg; break;
2378 tag = optarg; break;
2380 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2385 infile = argv[optind++];
2387 fprintf(stderr, "No input file given\n");
2390 if (outfile && report == 1)
2393 if (name && strchr(name, '/'))
2394 name = strrchr(name, '/')+1;
2396 if (optind < argc) {
2397 fprintf(stderr, "Excess command line arguments\n");
2401 ### Loading the grammar file
2403 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2406 Once we have extracted the code (with `mdcode`) we expect to find three
2407 or four sections: header, code, grammar, reduce.
2408 Anything else that is not excluded by the `--tag` option is an error.
2410 "header", "code", and "reduce" are optional, though it is hard to build
2411 a working parser without the first two. "grammar" must be provided.
2415 #include <sys/mman.h>
2420 static void pr_err(char *msg)
2423 fprintf(stderr, "%s\n", msg);
2427 struct section *table;
2431 fd = open(infile, O_RDONLY);
2433 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2434 infile, strerror(errno));
2437 len = lseek(fd, 0, 2);
2438 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2439 table = code_extract(file, file+len, pr_err);
2441 struct code_node *hdr = NULL;
2442 struct code_node *code = NULL;
2443 struct code_node *gram = NULL;
2444 struct code_node *pre_reduce = NULL;
2445 for (s = table; s; s = s->next) {
2446 struct text sec = s->section;
2447 if (tag && !strip_tag(&sec, tag))
2449 if (text_is(sec, "header"))
2451 else if (text_is(sec, "code"))
2453 else if (text_is(sec, "grammar"))
2455 else if (text_is(sec, "reduce"))
2456 pre_reduce = s->code;
2458 fprintf(stderr, "Unknown content section: %.*s\n",
2459 s->section.len, s->section.txt);
2464 ### Processing the input
2466 As we need to append an extension to a filename and then open it for
2467 writing, and we need to do this twice, it helps to have a separate function.
2471 static FILE *open_ext(char *base, char *ext)
2473 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2475 strcat(strcpy(fn, base), ext);
2481 If we can read the grammar, then we analyse and optionally report on it. If
2482 the report finds conflicts we will exit with an error status.
2484 ###### process input
2485 struct grammar *g = NULL;
2487 fprintf(stderr, "No grammar section provided\n");
2490 g = grammar_read(gram);
2492 fprintf(stderr, "Failure to parse grammar\n");
2497 grammar_analyse(g, type);
2499 if (grammar_report(g, type))
2503 If a "headers" section is defined, we write it out and include a declaration
2504 for the `parse_XX` function so it can be used from separate code.
2506 if (rv == 0 && hdr && outfile) {
2507 FILE *f = open_ext(outfile, ".h");
2509 code_node_print(f, hdr, infile);
2510 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2514 fprintf(stderr, "Cannot create %s.h\n",
2520 And if all goes well and an output file was provided, we create the `.c`
2521 file with the code section (if any) and the parser tables and function.
2523 if (rv == 0 && outfile) {
2524 FILE *f = open_ext(outfile, ".c");
2527 code_node_print(f, code, infile);
2528 gen_parser(f, g, infile, name, pre_reduce);
2531 fprintf(stderr, "Cannot create %s.c\n",
2537 And that about wraps it up. We need to set the locale so that UTF-8 is
2538 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2540 ###### File: parsergen.mk
2541 parsergen : parsergen.o libscanner.o libmdcode.o
2542 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2549 int main(int argc, char *argv[])
2554 setlocale(LC_ALL,"");
2556 ## process arguments
2563 ## The SHIFT/REDUCE parser
2565 Having analysed the grammar and generated all the tables, we only need
2566 the shift/reduce engine to bring it all together.
2568 ### Goto table lookup
2570 The parser generator has nicely provided us with goto tables sorted by
2571 symbol number. We need a binary search function to find a symbol in the
2574 ###### parser functions
2576 static int search(const struct state *l, int sym)
2579 int hi = l->go_to_cnt;
2583 while (lo + 1 < hi) {
2584 int mid = (lo + hi) / 2;
2585 if (l->go_to[mid].sym <= sym)
2590 if (l->go_to[lo].sym == sym)
2591 return l->go_to[lo].state;
2596 ### Memory allocation
2598 The `scanner` returns tokens in a local variable - we want them in allocated
2599 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2600 make an allocated copy as required.
2602 ###### parser includes
2605 ###### parser functions
2607 static struct token *tok_copy(struct token tk)
2609 struct token *new = malloc(sizeof(*new));
2614 ### The state stack.
2616 The core data structure for the parser is the stack. This tracks all
2617 the symbols that have been recognised or partially recognised.
2619 The stack usually won't grow very large - maybe a few tens of entries.
2620 So we dynamically grow an array as required but never bother to
2621 shrink it down again.
2623 We keep the stack as two separate allocations. One, `asn_stack` stores
2624 the "abstract syntax nodes" which are created by each reduction. When
2625 we call `do_reduce` we need to pass an array of the `asn`s of the body
2626 of the production, and by keeping a separate `asn` stack, we can just
2627 pass a pointer into this stack.
2629 The other allocation stores all other stack fields of which there are
2630 several. The `state` is the most important one and guides the parsing
2631 process. The `sym` is nearly unnecessary as it is implicit in the
2632 state. However when we want to free entries from the `asn_stack`, it
2633 helps to know what type they are so we can call the right freeing
2634 function. The symbol leads us to the right free function through
2637 The stack is most properly seen as alternating states and symbols -
2638 states, like the 'DOT' in items, are between symbols. Each frame in
2639 our stack holds a state and the symbol that was before it. The
2640 bottom of stack holds the start state but no symbol, as nothing came
2641 before the beginning. As we need to store some value, `TK_eof` is used
2642 to mark the beginning of the file as well as the end.
2644 Indents (IN) are sometimes shifted and sometimes only accounted.
2645 Whatever decision is made must apply equally to the matching OUT. To
2646 ensure this we keep a stack of bits in `ignored_indents` and
2647 `indent_depth`. When we process an IN, we record if it was ignored.
2648 When we see an OUT, we pop off the relevant bit and use it to decide how
2651 ###### parser functions
2667 Two operations are needed on the stack - shift (which is like push) and pop.
2669 Shift applies not only to terminals but also to non-terminals. When we
2670 reduce a production we will pop off frames corresponding to the body
2671 symbols, then push on a frame for the head of the production. This last
2672 is exactly the same process as shifting in a terminal so we use the same
2673 function for both. In both cases we provide the symbol. The state is
2674 deduced from the current top-of-stack state and the new symbol.
2676 To simplify other code we arrange for `shift` to fail if there is no `goto`
2677 state for the symbol. This is useful in basic parsing due to our design
2678 that we shift when we can, and reduce when we cannot. So the `shift`
2679 function reports if it could.
2681 `shift` is also used to push state zero onto the stack, so if the
2682 stack is empty, it always chooses zero as the next state.
2684 So `shift` finds the next state. If that succeeds it extends the
2685 allocations if needed and pushes all the information onto the stacks.
2687 An extra complication is added to `shift` by the `EOL` token. This
2688 token must be generated when a `NEWLINE` is seen, but an `EOL` is
2689 expected. When this happens, the `NEWLINE` is NOT consumed, so multiple
2690 EOL can appear before a NEWLINE. To indicate that the token was shifted
2691 but not consumed, `shift` can return the special value `2`. The token
2692 number for `EOL` cannot be statically declared, so when the parser
2693 starts we need to look through the array of non-terminals to find the
2701 while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2703 p.tk_eol += TK_reserved + config->known_count;
2706 ###### parser functions
2708 static int shift(struct parser *p,
2709 short sym, void *asn,
2710 const struct state states[])
2712 struct frame next = {0};
2714 int newstate = p->tos
2715 ? search(&states[p->stack[p->tos-1].state],
2720 else if (sym != TK_newline)
2723 // have a NEWLINE, might need an EOL
2726 ? search(&states[p->stack[p->tos-1].state],
2732 asn = tok_copy(*(struct token*)asn);
2735 if (p->tos >= p->stack_size) {
2736 p->stack_size += 10;
2737 p->stack = realloc(p->stack, p->stack_size
2738 * sizeof(p->stack[0]));
2739 p->asn_stack = realloc(p->asn_stack, p->stack_size
2740 * sizeof(p->asn_stack[0]));
2743 next.state = newstate;
2745 p->stack[p->tos] = next;
2746 p->asn_stack[p->tos] = asn;
2751 `pop` primarily moves the top of stack (`tos`) back down the required
2752 amount and frees any `asn` entries that need to be freed. It is called
2753 _after_ we reduce a production, just before we `shift` the nonterminal
2756 ###### parser functions
2758 static void pop(struct parser *p, int num,
2759 void(*do_free)(short sym, void *asn))
2764 for (i = 0; i < num; i++)
2765 do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2768 ### The heart of the parser.
2770 Now we have the parser. For each token we might shift it, trigger a
2771 reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL)
2772 might also be ignored. Ignoring tokens is combined with shifting.
2776 struct parser p = { 0 };
2777 struct token *tk = NULL;
2780 ###### heart of parser
2782 shift(&p, TK_eof, NULL, states);
2783 while (!accepted && p.tos > 0) {
2784 struct frame *tos = &p.stack[p.tos-1];
2786 tk = tok_copy(token_next(tokens));
2787 parser_trace(trace, &p,
2788 tk, states, non_term, config->known_count);
2790 ## try shift or ignore
2795 Indents are ignored unless they can be shifted onto the stack
2796 immediately. The end of an indented section - the OUT token - is
2797 ignored precisely when the indent was ignored. To keep track of this we
2798 need a small stack of flags, which is easily stored as bits in an
2799 `unsigned long`. This will never overflow as the scanner only allows
2800 20 levels of indentation.
2803 unsigned long ignored_indents;
2806 NEWLINE/EOL is ignored when in an indented section of text which was not
2807 explicitly expected by the grammar. So if the most recent indent is
2808 ignored, so is any EOL token.
2810 For other tokens, we shift the next token if that is possible, otherwise
2811 we try to reduce a production.
2813 ###### try shift or ignore
2815 if ((tk->num == TK_newline || tk->num == TK_out) &&
2816 (p.ignored_indents & (1 << p.indent_depth))) {
2817 /* indented, so ignore OUT and NEWLINE */
2818 if (tk->num == TK_out)
2819 p.indent_depth -= 1;
2822 parser_trace_action(trace, "Ignore");
2826 switch (shift(&p, tk->num, tk, states)) {
2828 if (tk->num == TK_out)
2829 p.indent_depth -= 1;
2830 if (tk->num == TK_in) {
2831 p.indent_depth += 1;
2832 p.ignored_indents &= ~(1 << p.indent_depth);
2837 parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
2842 if (tk->num == TK_in) {
2843 /* No indent expected here, so ignore IN */
2846 p.indent_depth += 1;
2847 p.ignored_indents |= (1 << p.indent_depth);
2848 parser_trace_action(trace, "Ignore");
2852 We have already discussed the bulk of the handling of a "reduce" action,
2853 with the `pop()` and `shift()` functions doing much of the work. There
2854 is a little more complexity needed to manage storage for the asn (Abstract
2855 Syntax Node), and also a test of whether the reduction is permitted.
2857 When we try to shift the result of reducing production-zero, it will
2858 fail because there is no next state. In this case the asn will not have
2859 been stored on the stack, so it gets stored in the `ret` variable, and we
2860 report that the input has been accepted.
2868 if (states[tos->state].reduce_prod >= 0) {
2871 const struct state *nextstate = &states[tos->state];
2872 int prod = nextstate->reduce_prod;
2873 int size = nextstate->reduce_size;
2874 int res_size = nextstate->result_size;
2876 body = p.asn_stack + (p.tos - size);
2877 res = res_size ? calloc(1, res_size) : NULL;
2878 res_size = do_reduce(prod, body, config, res);
2879 if (res_size != nextstate->result_size)
2881 pop(&p, size, do_free);
2882 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2885 parser_trace_action(trace, "Accept");
2887 parser_trace_action(trace, "Reduce");
2891 If we can neither shift nor reduce we have an error to handle. There
2892 are two possible responses to an error: we can pop single frames off the
2893 stack until we find a frame that can shift `TK_error`, or we can discard
2894 the current look-ahead symbol.
2896 When we first see an error we do the first of these and set a flag to
2897 record that we are processing an error. If the normal shift/reduce
2898 tests fail to find that anything can be done from that state, we start
2899 dropping tokens until either we manage to shift one, or reach end-of-file.
2912 parser_trace_action(trace, "DISCARD");
2913 if (tk->num == TK_eof)
2918 struct token *err_tk;
2920 parser_trace_action(trace, "ERROR");
2922 err_tk = tok_copy(*tk);
2923 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2924 // discard this state
2925 pop(&p, 1, do_free);
2928 // no state accepted TK_error
2934 ###### parser includes
2939 void *parser_run(struct token_state *tokens,
2940 const struct state states[],
2941 int (*do_reduce)(int, void**, struct token_config*, void*),
2942 void (*do_free)(short, void*),
2943 FILE *trace, const char *non_term[],
2944 struct token_config *config)
2953 pop(&p, p.tos, do_free);
2959 ###### exported functions
2960 void *parser_run(struct token_state *tokens,
2961 const struct state states[],
2962 int (*do_reduce)(int, void**, struct token_config*, void*),
2963 void (*do_free)(short, void*),
2964 FILE *trace, const char *non_term[],
2965 struct token_config *config);
2969 Being able to visualize the parser in action can be invaluable when
2970 debugging the parser code, or trying to understand how the parse of a
2971 particular grammar progresses. The stack contains all the important
2972 state, so just printing out the stack every time around the parse loop
2973 can make it possible to see exactly what is happening.
2975 This doesn't explicitly show each SHIFT and REDUCE action. However they
2976 are easily deduced from the change between consecutive lines, and the
2977 details of each state can be found by cross referencing the states list
2978 in the stack with the "report" that parsergen can generate.
2980 For terminal symbols, we just dump the token. For non-terminals we
2981 print the name of the symbol. The look ahead token is reported at the
2982 end inside square brackets.
2984 ###### parser functions
2986 static char *reserved_words[] = {
2987 [TK_error] = "ERROR",
2990 [TK_newline] = "NEWLINE",
2994 void parser_trace(FILE *trace, struct parser *p,
2995 struct token *tk, const struct state states[],
2996 const char *non_term[], int knowns)
3001 for (i = 0; i < p->tos; i++) {
3002 struct frame *f = &p->stack[i];
3005 if (sym < TK_reserved &&
3006 reserved_words[sym] != NULL)
3007 fputs(reserved_words[sym], trace);
3008 else if (sym < TK_reserved + knowns) {
3009 struct token *t = p->asn_stack[i];
3010 text_dump(trace, t->txt, 20);
3012 fputs(non_term[sym - TK_reserved - knowns],
3016 fprintf(trace, "(%d) ", f->state);
3018 fprintf(trace, "[");
3019 if (tk->num < TK_reserved &&
3020 reserved_words[tk->num] != NULL)
3021 fputs(reserved_words[tk->num], trace);
3023 text_dump(trace, tk->txt, 20);
3024 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3027 void parser_trace_action(FILE *trace, char *action)
3030 fprintf(trace, " - %s\n", action);
3035 The obvious example for a parser is a calculator.
3037 As `scanner` provides a number parsing function using `libgmp`, it is not much
3038 work to perform arbitrary rational number calculations.
3040 This calculator takes one expression, or an equality test, per line.
3041 The results are printed and if any equality test fails, the program
3042 exits with an error.
3044 ###### File: parsergen.mk
3045 calc.c calc.h : parsergen parsergen.mdc
3046 ./parsergen --tag calc -o calc parsergen.mdc
3047 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3048 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3050 ./calc parsergen.mdc
3055 #include "parse_number.h"
3056 // what do we use for a demo-grammar? A calculator of course.
3068 #include <sys/mman.h>
3074 #include "scanner.h"
3079 static void free_number(struct number *n)
3085 static int text_is(struct text t, char *s)
3087 return (strlen(s) == t.len &&
3088 strncmp(s, t.txt, t.len) == 0);
3091 int main(int argc, char *argv[])
3093 int fd = open(argv[1], O_RDONLY);
3094 int len = lseek(fd, 0, 2);
3095 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3096 struct section *table = code_extract(file, file+len, NULL);
3098 struct token_config config = {
3099 .ignored = (1 << TK_line_comment)
3102 .number_chars = ".,_+-",
3106 for (s = table; s; s = s->next)
3107 if (text_is(s->section, "example: input"))
3108 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3110 struct section *t = table->next;
3111 code_free(table->code);
3123 Session -> Session Line
3126 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3127 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3128 gmp_printf(" or as a decimal: %Fg\n", fl);
3132 | Expression = Expression NEWLINE ${
3133 if (mpq_equal($1.val, $3.val))
3134 gmp_printf("Both equal %Qd\n", $1.val);
3136 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3141 | NEWLINE ${ printf("Blank line\n"); }$
3142 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3145 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3146 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3147 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3148 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3149 | Expression // Expression ${ {
3152 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3153 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3154 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3155 mpz_tdiv_q(z0, z1, z2);
3156 mpq_set_z($0.val, z0);
3157 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3159 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3160 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3165 3.1415926535 - 355/113
3167 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3169 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1