1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
8 Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
9 be used to describe the grammar and the parser can selectively ignore
10 these where they aren't relevant.
12 There are several distinct sections.
14 1. `grammar_read` will read a grammar from a literate-code file and
15 store the productions, symbols, and code fragments.
16 2. `grammar_analyse` will take that grammar and build LR parsing
17 states and look-ahead tables.
18 3. `grammar_report` will present the details of the analysis
19 in a readable format and will report any conflicts.
20 4. `parser_generate` will write out C code files with various
21 tables and with the code fragments arranged in useful places.
22 5. `parser_run` is a library function intended to be linked together
23 with the generated parser tables to complete the implementation of a parser.
25 6. Finally `calc` is a test grammar for a simple calculator. The
26 `parsergen` program built from the C code in this file can extract
27 that grammar directly from this file and process it.
29 ###### File: parsergen.c
34 ## forward declarations
45 ###### File: libparser.c
52 ###### File: parsergen.mk
55 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
58 ## Reading the grammar
60 The grammar must be stored in a markdown literate code file as parsed by
61 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
62 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
63 present, will be literally copied into the generated `.c` and `.h`
64 files. The third is required and contains the grammar which is
65 tokenised with "[scanner][]". `reduce` can provide variable
66 declarations and common initialization code for all the reduction code
67 fragments in the grammar.
69 If the `--tag` option is given, then any top level header that doesn't
70 start with the tag is ignored, and the tag is stripped from the rest. So
71 `--tag Foo` means that the four sections expected are `Foo: header`,
72 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
73 to extract the sample calculator from this file.
76 [scanner]: scanner.html
82 ###### parser includes
86 The grammar contains production sets, precedence/associativity
87 declarations, general terminal declarations, and data type declarations.
88 These are all parsed with _ad hoc_ parsing as we don't have a parser
91 The precedence and associativity can be set for each production, and
92 can be inherited from symbols. The data type (either structure or a
93 reference to a structure) is potentially defined for each non-terminal
94 and describes what C structure is used to store information about each
98 enum assoc {Left, Right, Non};
99 char *assoc_names[] = {"Left","Right","Non"};
102 struct text struct_name;
105 unsigned short precedence;
109 unsigned short precedence;
117 The strings reported by `mdcode` and `scanner` are `struct text` which
118 have length rather than being nul terminated. To help with printing and
119 comparing we define `text_is` and `prtxt`, which should possibly go in
120 `mdcode`. `scanner` does provide `text_dump` which is useful for
121 strings which might contain control characters.
123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
124 because that is what we need to detect tags.
127 static int text_is(struct text t, char *s)
129 return (strlen(s) == t.len &&
130 strncmp(s, t.txt, t.len) == 0);
132 static void prtxt(struct text t)
134 printf("%.*s", t.len, t.txt);
137 static int strip_tag(struct text *t, char *tag)
139 int skip = strlen(tag) + 1;
140 if (skip >= t->len ||
141 strncmp(t->txt, tag, skip-1) != 0 ||
142 t->txt[skip-1] != ':')
144 while (skip < t->len && t->txt[skip] == ' ')
153 Productions are comprised primarily of symbols - terminal and
154 non-terminal. We do not make a syntactic distinction between the two,
155 though non-terminals must be identifiers while terminals can also be
156 marks. Non-terminal symbols are those which appear in the head of a
157 production, terminal symbols are those which don't. There are also
158 "virtual" symbols used for precedence marking discussed later, and
159 sometimes we won't know what type a symbol is yet.
161 To help with code safety it is possible to declare the terminal symbols.
162 If this is done, then any symbol used in a production that does not
163 appear in the head of any production and is not declared as a terminal
164 is treated as an error.
166 ###### forward declarations
167 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
168 char *symtypes = "UVTN";
171 ###### grammar fields
172 int terminals_declared;
174 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
175 table of known symbols and the resulting parser will report them as
176 `TK_reserved + N`. A small set of identifiers are reserved for the
177 different token types that `scanner` can report, and an even smaller set
178 are reserved for a special token that the parser can generate (`EOL`) as
179 will be described later. This latter set cannot use predefined numbers,
180 so they are marked as `isspecial` for now and will get assigned a number
181 with the non-terminals later.
185 static struct { int num; char *name; } reserved_words[] = {
186 { TK_error, "ERROR" },
187 { TK_number, "NUMBER" },
188 { TK_ident, "IDENTIFIER" },
190 { TK_string, "STRING" },
191 { TK_multi_string, "MULTI_STRING" },
194 { TK_newline, "NEWLINE" },
201 unsigned int isspecial:1;
203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
204 recognised. The former is automatically expected at the end of the text
205 being parsed. The latter are always ignored by the parser.
207 All of these symbols are stored in a simple symbol table. We use the
208 `struct text` from `mdcode` to store the name and link them together
209 into a sorted list using an insertion sort.
211 We don't have separate `find` and `insert` functions as any symbol we
212 find needs to be remembered. We simply expect `find` to always return a
213 symbol, but its type might be `Unknown`.
222 ###### grammar fields
227 static struct symbol *sym_find(struct grammar *g, struct text s)
229 struct symbol **l = &g->syms;
234 (cmp = text_cmp((*l)->name, s)) < 0)
238 n = calloc(1, sizeof(*n));
247 static void symbols_init(struct grammar *g)
249 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
251 for (i = 0; i < entries; i++) {
254 t.txt = reserved_words[i].name;
255 t.len = strlen(t.txt);
258 s->num = reserved_words[i].num;
263 ### Data types and precedence.
265 Data type specification, precedence specification, and declaration of
266 terminals are all introduced by a dollar sign at the start of the line.
267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
268 precedence, if it is `TERM` then the line declares terminals without
269 precedence, otherwise it specifies a data type.
271 The data type name is simply stored and applied to the head of all
272 subsequent productions. It must be the name of a structure optionally
273 preceded by an asterisk which means a reference or "pointer". So
274 `$expression` maps to `struct expression` and `$*statement` maps to
275 `struct statement *`.
277 Any productions given before the first data type declaration will have
278 no data type associated with them and can carry no information. In
279 order to allow other non-terminals to have no type, the data type
280 `$void` can be given. This does *not* mean that `struct void` will be
281 used, but rather that no type will be associated with subsequent
284 The precedence line must contain a list of symbols, either terminal
285 symbols or virtual symbols. It can only contain symbols that have not
286 been seen yet, so precedence declaration must precede the productions
289 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
290 carry precedence information but are not included in the grammar. A
291 production can declare that it inherits the precedence of a given
294 This use of `$$` precludes it from being used as a symbol in the
295 described language. Two other symbols: `${` and `}$` are also
298 Each new precedence line introduces a new precedence level and
299 declares how it associates. This level is stored in each symbol
300 listed and may be inherited by any production which uses the symbol. A
301 production inherits from the last symbol which has a precedence.
303 The symbols on the first precedence line have the lowest precedence.
304 Subsequent lines introduce symbols with higher precedence and so bind
307 Note that a declaration line (or "dollar line") can start with either of
308 two different marks: "$" or "$*". The `dollar_line()` function defined
309 here is told which was found in the `isref` argument.
311 ###### grammar fields
312 struct text current_type;
317 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
318 static const char *known[] = { "$$", "${", "}$" };
321 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
323 struct token t = token_next(ts);
329 if (t.num != TK_ident) {
330 err = "type or assoc expected after '$'";
333 if (text_is(t.txt, "LEFT"))
335 else if (text_is(t.txt, "RIGHT"))
337 else if (text_is(t.txt, "NON"))
339 else if (text_is(t.txt, "TERM")) {
341 g->terminals_declared = 1;
343 g->current_type = t.txt;
344 g->type_isref = isref;
345 if (text_is(t.txt, "void"))
346 g->current_type.txt = NULL;
348 if (t.num != TK_newline) {
349 err = "Extra tokens after type name";
356 err = "$* cannot be followed by a precedence";
360 // This is a precedence or TERM line, need some symbols.
364 while (t.num != TK_newline) {
365 enum symtype type = Terminal;
367 if (t.num == TK_virtual) {
370 if (t.num != TK_ident) {
371 err = "$$ must be followed by a word";
375 err = "Virtual symbols not permitted on $TERM line";
378 } else if (t.num != TK_ident &&
380 err = "Illegal token in precedence line";
383 s = sym_find(g, t.txt);
384 if (s->type != Unknown) {
385 err = "Symbols in precedence/TERM line must not already be known.";
390 s->precedence = g->prec_levels;
397 err = "No symbols given on precedence/TERM line";
402 while (t.num != TK_newline && t.num != TK_eof)
409 A production either starts with an identifier which is the head
410 non-terminal, or a vertical bar (`|`) in which case this production
411 uses the same head as the previous one. The identifier must be
412 followed by a `->` mark. All productions for a given non-terminal must
413 be grouped together with the `nonterminal ->` given only once.
415 After this a (possibly empty) sequence of identifiers and marks form
416 the body of the production. A virtual symbol may be given after the
417 body by preceding it with `$$`. If a virtual symbol is given, the
418 precedence of the production is that for the virtual symbol. If none
419 is given, the precedence is inherited from the last symbol in the
420 production which has a precedence specified.
422 After the optional precedence may come the `${` mark. This indicates
423 the start of a code fragment. If present, this must be on the same
424 line as the start of the production.
426 All of the text from the `${` through to the matching `}$` is
427 collected and forms the code-fragment for the production. It must all
428 be in one `code_node` of the literate code. The `}$` must be
429 at the end of a line.
431 Text in the code fragment will undergo substitutions where `$N` or
432 `$<N`, for some numeric `N` (or non-numeric indicator as described
433 later), will be replaced with a variable holding the parse information
434 for the particular symbol in the production. `$0` is the head of the
435 production, `$1` is the first symbol of the body, etc. The type of `$N`
436 for a terminal symbol is `struct token`. For a non-terminal, it is
437 whatever has been declared for that symbol. The `<` may be included and
438 means that the value (usually a reference) is being moved out, so it
439 will not automatically be freed. The effect of using '<' is that the
440 variable is cleared to all-zeros.
442 While building productions we will need to add to an array which needs to
446 static void array_add(void *varray, int *cnt, void *new)
448 void ***array = varray;
451 current = ((*cnt-1) | (step-1))+1;
452 if (*cnt == current) {
455 *array = realloc(*array, current * sizeof(void*));
457 (*array)[*cnt] = new;
461 Collecting the code fragment simply involves reading tokens until we
462 hit the end token `}$`, and noting the character position of the start and
466 static struct text collect_code(struct token_state *state,
471 code.txt = start.txt.txt + start.txt.len;
473 t = token_next(state);
474 while (t.node == start.node &&
475 t.num != TK_close && t.num != TK_error &&
477 if (t.num == TK_close && t.node == start.node)
478 code.len = t.txt.txt - code.txt;
484 Now we have all the bits we need to parse a full production.
489 ###### grammar fields
490 struct production **productions;
491 int production_count;
493 ###### production fields
495 struct symbol **body;
501 int first_production;
504 static char *parse_production(struct grammar *g,
506 struct token_state *state)
508 /* Head has already been parsed. */
511 struct production p, *pp;
513 memset(&p, 0, sizeof(p));
515 tk = token_next(state);
516 while (tk.num == TK_ident || tk.num == TK_mark) {
517 struct symbol *bs = sym_find(g, tk.txt);
518 if (bs->type == Virtual) {
519 err = "Virtual symbol not permitted in production";
522 if (bs->precedence) {
523 p.precedence = bs->precedence;
526 array_add(&p.body, &p.body_size, bs);
527 tk = token_next(state);
529 if (tk.num == TK_virtual) {
531 tk = token_next(state);
532 if (tk.num != TK_ident) {
533 err = "word required after $$";
536 vs = sym_find(g, tk.txt);
537 if (vs->precedence == 0) {
538 err = "symbol after $$ must have precedence";
541 p.precedence = vs->precedence;
544 tk = token_next(state);
546 if (tk.num == TK_open) {
547 p.code_line = tk.line;
548 p.code = collect_code(state, tk);
549 if (p.code.txt == NULL) {
550 err = "code fragment not closed properly";
553 tk = token_next(state);
556 if (tk.num != TK_newline && tk.num != TK_eof) {
557 err = "stray tokens at end of line";
560 pp = malloc(sizeof(*pp));
562 array_add(&g->productions, &g->production_count, pp);
565 while (tk.num != TK_newline && tk.num != TK_eof)
566 tk = token_next(state);
571 With the ability to parse productions and dollar-lines, we have nearly all
572 that we need to parse a grammar from a `code_node`.
574 The head of the first production will effectively be the `start` symbol
575 of the grammar. However it won't _actually_ be so. Processing the
576 grammar is greatly simplified if the real start symbol only has a single
577 production, and expects `$eof` as the final terminal. So when we find
578 the first explicit production we insert an extra implicit production as
579 production zero which looks like
581 ###### Example: production 0
584 where `START` is the first non-terminal given.
586 ###### create production zero
587 struct production *p = calloc(1,sizeof(*p));
588 struct text start = {"$start",6};
589 struct text eof = {"$eof",4};
590 struct text code = {"$0 = $<1;", 9};
591 p->head = sym_find(g, start);
592 p->head->type = Nonterminal;
593 p->head->struct_name = g->current_type;
594 p->head->isref = g->type_isref;
595 if (g->current_type.txt)
597 array_add(&p->body, &p->body_size, head);
598 array_add(&p->body, &p->body_size, sym_find(g, eof));
599 p->head->first_production = g->production_count;
600 array_add(&g->productions, &g->production_count, p);
602 Now we are ready to read in the grammar. We ignore comments
603 and strings so that the marks which introduce them can be
604 used as terminals in the grammar. We don't ignore numbers
605 even though we don't allow them as that causes the scanner
606 to produce errors that the parser is better positioned to handle.
607 We also ignore indentation, but we expect and use newlines.
609 To allow comments in the grammar, we explicitly check for "//" as the
610 first token on the line and those lines are skipped. "//" can still be
611 used as a terminal anywhere that a terminal is expected.
614 static struct grammar *grammar_read(struct code_node *code)
616 struct token_config conf = {
619 .words_marks = known,
620 .known_count = sizeof(known)/sizeof(known[0]),
622 .ignored = (1 << TK_line_comment)
623 | (1 << TK_block_comment)
626 | (1 << TK_multi_string)
631 struct token_state *state = token_open(code, &conf);
633 struct symbol *head = NULL;
637 g = calloc(1, sizeof(*g));
640 for (tk = token_next(state); tk.num != TK_eof;
641 tk = token_next(state)) {
642 if (tk.num == TK_newline)
644 if (tk.num == TK_ident) {
646 head = sym_find(g, tk.txt);
647 if (head->type == Nonterminal)
648 err = "This non-terminal has already be used.";
649 else if (head->type == Virtual)
650 err = "Virtual symbol not permitted in head of production";
652 head->type = Nonterminal;
653 head->struct_name = g->current_type;
654 head->isref = g->type_isref;
655 if (g->production_count == 0) {
656 ## create production zero
658 head->first_production = g->production_count;
659 tk = token_next(state);
660 if (tk.num == TK_mark &&
661 text_is(tk.txt, "->"))
662 err = parse_production(g, head, state);
664 err = "'->' missing in production";
666 } else if (tk.num == TK_mark
667 && text_is(tk.txt, "|")) {
668 // another production for same non-term
670 err = parse_production(g, head, state);
672 err = "First production must have a head";
673 } else if (tk.num == TK_mark
674 && text_is(tk.txt, "$")) {
675 err = dollar_line(state, g, 0);
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "$*")) {
678 err = dollar_line(state, g, 1);
679 } else if (tk.num == TK_mark
680 && text_is(tk.txt, "//")) {
681 while (tk.num != TK_newline &&
683 tk = token_next(state);
685 err = "Unrecognised token at start of line.";
693 for (s = g->syms; s; s = s->next) {
694 if (s->type != Unknown)
696 if (!g->terminals_declared) {
700 err = "not declared";
701 fprintf(stderr, "Token %.*s not declared\n",
702 s->name.len, s->name.txt);
705 free(g); // FIXME free content
710 fprintf(stderr, "Error at line %d: %s\n",
713 free(g); // FIXME free content
717 ## Analysing the grammar
719 The central task in analysing the grammar is to determine a set of
720 states to drive the parsing process. This will require finding various
721 sets of symbols and of "items". Some of these sets will need to attach
722 information to each element in the set, so they are more like maps.
724 Each "item" may have a 'look-ahead' or `LA` set associated with it.
725 Many of these will be the same as each other. To avoid memory wastage,
726 and to simplify some comparisons of sets, these sets will be stored in a
727 list of unique sets, each assigned a number.
729 Once we have the data structures in hand to manage these sets and lists,
730 we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
731 sets, and then create the item sets which define the various states.
735 Though we don't only store symbols in these sets, they are the main
736 things we store, so they are called symbol sets or "symsets".
738 A symset has a size, an array of shorts, and an optional array of data
739 values, which are also shorts. If the array of data is not present, we
740 store an impossible pointer, as `NULL` is used to indicate that no
741 memory has been allocated yet.
746 unsigned short *syms, *data;
748 #define NO_DATA ((unsigned short *)1)
749 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
750 const struct symset INIT_DATASET = { 0, NULL, NULL };
752 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
753 using an insertion sort. We don't explicitly record the amount of
754 allocated space as it can be derived directly from the current `cnt`
755 using `((cnt - 1) | 7) + 1`.
758 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
761 int current = ((s->cnt-1) | 7) + 1;
762 if (current == s->cnt) {
764 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
765 if (s->data != NO_DATA)
766 s->data = realloc(s->data, sizeof(*s->data) * current);
769 while (i > 0 && s->syms[i-1] > key) {
770 s->syms[i] = s->syms[i-1];
771 if (s->data != NO_DATA)
772 s->data[i] = s->data[i-1];
776 if (s->data != NO_DATA)
781 Finding a symbol (or item) in a `symset` uses a simple binary search.
782 We return the index where the value was found (so data can be accessed),
783 or `-1` to indicate failure.
785 static int symset_find(struct symset *ss, unsigned short key)
792 while (lo + 1 < hi) {
793 int mid = (lo + hi) / 2;
794 if (ss->syms[mid] <= key)
799 if (ss->syms[lo] == key)
804 We will often want to form the union of two symsets. When we do, we
805 will often want to know if anything changed (as that might mean there is
806 more work to do). So `symset_union` reports whether anything was added
807 to the first set. We use a slow quadratic approach as these sets are
808 rarely large. If profiling shows this to be a problem it can be
811 static int symset_union(struct symset *a, struct symset *b)
815 for (i = 0; i < b->cnt; i++)
816 if (symset_find(a, b->syms[i]) < 0) {
817 unsigned short data = 0;
818 if (b->data != NO_DATA)
820 symset_add(a, b->syms[i], data);
826 And of course we must be able to free a symset.
828 static void symset_free(struct symset ss)
831 if (ss.data != NO_DATA)
837 Some symsets are simply stored somewhere appropriate in the `grammar`
838 data structure, others need a bit of indirection. This applies
839 particularly to "LA" sets. These will be paired with "items" in an
840 "itemset". As itemsets will be stored in a symset, the "LA" set needs
841 to be stored in the `data` field, so we need a mapping from a small
842 (short) number to an LA `symset`.
844 As mentioned earlier this involves creating a list of unique symsets.
846 For now, we just use a linear list sorted by insertion. A skiplist
847 would be more efficient and may be added later.
854 struct setlist *next;
857 ###### grammar fields
858 struct setlist *sets;
863 static int ss_cmp(struct symset a, struct symset b)
866 int diff = a.cnt - b.cnt;
869 for (i = 0; i < a.cnt; i++) {
870 diff = (int)a.syms[i] - (int)b.syms[i];
877 static int save_set(struct grammar *g, struct symset ss)
879 struct setlist **sl = &g->sets;
883 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
890 s = malloc(sizeof(*s));
899 Finding a set by number is currently performed by a simple linear
900 search. If this turns out to hurt performance, we can store the sets in
901 a dynamic array like the productions.
903 static struct symset set_find(struct grammar *g, int num)
905 struct setlist *sl = g->sets;
906 while (sl && sl->num != num)
911 ### Setting `nullable`
913 We set `nullable` on the head symbol for any production for which all
914 the body symbols (if any) are nullable. As this is a recursive
915 definition, any change in the `nullable` setting means that we need to
916 re-evaluate where it needs to be set.
918 We simply loop around performing the same calculations until no more
925 static void set_nullable(struct grammar *g)
928 while (check_again) {
931 for (p = 0; p < g->production_count; p++) {
932 struct production *pr = g->productions[p];
935 if (pr->head->nullable)
937 for (s = 0; s < pr->body_size; s++)
938 if (! pr->body[s]->nullable)
940 if (s == pr->body_size) {
941 pr->head->nullable = 1;
948 ### Building the `first` sets
950 When calculating what can follow a particular non-terminal, we will need
951 to know what the "first" terminal in any subsequent non-terminal might
952 be. So we calculate the `first` set for every non-terminal and store
953 them in an array. We don't bother recording the "first" set for
954 terminals as they are trivial.
956 As the "first" for one symbol might depend on the "first" of another, we
957 repeat the calculations until no changes happen, just like with
958 `set_nullable`. This makes use of the fact that `symset_union` reports
959 if any change happens.
961 The core of this, which finds the "first" of part of a production body,
962 will be reused for computing the "follow" sets, so we split that out
963 into a separate function, `add_first`, which also reports if it got all
964 the way to the end of the production without finding a non-nullable
967 ###### grammar fields
968 struct symset *first;
972 static int add_first(struct production *pr, int start,
973 struct symset *target, struct grammar *g,
978 for (s = start; s < pr->body_size; s++) {
979 struct symbol *bs = pr->body[s];
980 if (bs->type == Terminal) {
981 if (symset_find(target, bs->num) < 0) {
982 symset_add(target, bs->num, 0);
986 } else if (symset_union(target, &g->first[bs->num]))
992 *to_end = (s == pr->body_size);
996 static void build_first(struct grammar *g)
1000 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1001 for (s = 0; s < g->num_syms; s++)
1002 g->first[s] = INIT_SYMSET;
1004 while (check_again) {
1007 for (p = 0; p < g->production_count; p++) {
1008 struct production *pr = g->productions[p];
1009 struct symset *head = &g->first[pr->head->num];
1011 if (add_first(pr, 0, head, g, NULL))
1017 ### Building the `follow` sets.
1019 There are two different situations when we will want to generate
1020 "follow" sets. If we are doing an SLR analysis, we want to generate a
1021 single "follow" set for each non-terminal in the grammar. That is what
1022 is happening here. If we are doing an LALR or LR analysis we will want
1023 to generate a separate "LA" set for each item. We do that later in
1026 There are two parts to generating a "follow" set. Firstly we look at
1027 every place that any non-terminal appears in the body of any production,
1028 and we find the set of possible "first" symbols after there. This is
1029 done using `add_first` above and only needs to be done once as the
1030 "first" sets are now stable and will not change.
1032 ###### grammar fields
1033 struct symset *follow;
1037 for (p = 0; p < g->production_count; p++) {
1038 struct production *pr = g->productions[p];
1041 for (b = 0; b < pr->body_size - 1; b++) {
1042 struct symbol *bs = pr->body[b];
1043 if (bs->type == Terminal)
1045 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1049 The second part is to add the "follow" set of the head of a production
1050 to the "follow" sets of the final symbol in the production, and any
1051 other symbol which is followed only by `nullable` symbols. As this
1052 depends on "follow" itself we need to repeatedly perform the process
1053 until no further changes happen.
1055 Rather than 2 nested loops, one that repeats the whole process until
1056 there is no change, and one that iterates over the set of productions, we
1057 combine these two functions into a single loop.
1061 for (check_again = 0, p = 0;
1062 p < g->production_count;
1063 p < g->production_count-1
1064 ? p++ : check_again ? (check_again = 0, p = 0)
1066 struct production *pr = g->productions[p];
1069 for (b = pr->body_size - 1; b >= 0; b--) {
1070 struct symbol *bs = pr->body[b];
1071 if (bs->type == Terminal)
1073 if (symset_union(&g->follow[bs->num],
1074 &g->follow[pr->head->num]))
1081 We now just need to create and initialise the `follow` list to get a
1085 static void build_follow(struct grammar *g)
1087 int s, check_again, p;
1088 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1089 for (s = 0; s < g->num_syms; s++)
1090 g->follow[s] = INIT_SYMSET;
1094 ### Building itemsets and states
1096 There are three different levels of detail that can be chosen for
1097 building the itemsets and states for the LR grammar. They are:
1099 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1100 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1102 2. LALR(1) where we build look-ahead sets with each item and merge
1103 the LA sets when we find two paths to the same "kernel" set of items.
1104 3. LR(1) where different look-ahead for any item in the set means
1105 a different item set must be created.
1107 ###### forward declarations
1108 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1110 We need to be able to look through existing states to see if a newly
1111 generated state already exists. For now we use a simple sorted linked
1114 An item is a pair of numbers: the production number and the position of
1115 "DOT", which is an index into the body of the production. As the
1116 numbers are not enormous we can combine them into a single "short" and
1117 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1118 production number (so 2000 productions with maximum of 31 symbols in the
1121 Comparing the itemsets will be a little different to comparing symsets
1122 as we want to do the lookup after generating the "kernel" of an
1123 itemset, so we need to ignore the offset=zero items which are added during
1126 To facilitate this, we modify the "DOT" number so that "0" sorts to
1127 the end of the list in the symset, and then only compare items before
1131 static inline unsigned short item_num(int production, int dot)
1133 return production | ((31-dot) << 11);
1135 static inline int item_prod(unsigned short item)
1137 return item & 0x7ff;
1139 static inline int item_dot(unsigned short item)
1141 return (31-(item >> 11)) & 0x1f;
1144 For LR(1) analysis we need to compare not just the itemset in a state
1145 but also the LA sets. As we assign each unique LA set a number, we
1146 can just compare the symset and the data values together.
1149 static int itemset_cmp(struct symset a, struct symset b,
1150 enum grammar_type type)
1156 i < a.cnt && i < b.cnt &&
1157 item_dot(a.syms[i]) > 0 &&
1158 item_dot(b.syms[i]) > 0;
1160 int diff = a.syms[i] - b.syms[i];
1164 diff = a.data[i] - b.data[i];
1169 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1173 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1179 if (type < LR1 || av == -1)
1182 a.data[i] - b.data[i];
1185 It will be helpful to know if an itemset has been "completed" or not,
1186 particularly for LALR where itemsets get merged, at which point they
1187 need to be considered for completion again. So a `completed` flag is
1190 And now we can build the list of itemsets. The lookup routine returns
1191 both a success flag and a pointer to where in the list an insert should
1192 happen, so we don't need to search a second time.
1196 struct itemset *next;
1198 struct symset items;
1199 struct symset go_to;
1201 unsigned short precedence;
1205 ###### grammar fields
1206 struct itemset *items;
1210 static int itemset_find(struct grammar *g, struct itemset ***where,
1211 struct symset kernel, enum grammar_type type)
1213 struct itemset **ip;
1215 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1216 struct itemset *i = *ip;
1218 diff = itemset_cmp(i->items, kernel, type);
1231 Adding an itemset may require merging the LA sets if LALR analysis is
1232 happening. If any new LA set adds any symbols that weren't in the old
1233 LA set, we clear the `completed` flag so that the dependants of this
1234 itemset will be recalculated and their LA sets updated.
1236 `add_itemset` must consume the symsets it is passed, either by adding
1237 them to a data structure, or freeing them.
1239 static int add_itemset(struct grammar *g, struct symset ss,
1240 enum assoc assoc, unsigned short precedence,
1241 enum grammar_type type)
1243 struct itemset **where, *is;
1245 int found = itemset_find(g, &where, ss, type);
1247 is = calloc(1, sizeof(*is));
1248 is->state = g->states;
1252 is->precedence = precedence;
1254 is->go_to = INIT_DATASET;
1263 for (i = 0; i < ss.cnt; i++) {
1264 struct symset temp = INIT_SYMSET, s;
1265 if (ss.data[i] == is->items.data[i])
1267 s = set_find(g, is->items.data[i]);
1268 symset_union(&temp, &s);
1269 s = set_find(g, ss.data[i]);
1270 if (symset_union(&temp, &s)) {
1271 is->items.data[i] = save_set(g, temp);
1282 To build all the itemsets, we first insert the initial itemset made from
1283 production zero, complete each itemset, and then generate new itemsets
1284 from old until no new ones can be made.
1286 Completing an itemset means finding all the items where "DOT" is
1287 followed by a nonterminal and adding "DOT=0" items for every production
1288 from that non-terminal - providing each item hasn't already been added.
1289 When we add such an item it might get inserted before the current
1290 one, so to make sure we process it we reset the loop counter to the
1293 If LA sets are needed, the LA set for each new item is found using
1294 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1295 may be supplemented by the LA set for the item which produced the new
1298 We also collect a set of all symbols which follow "DOT" (in `done`) as
1299 this is used in the next stage.
1301 When itemsets are created we assign a precedence to the itemset from the
1302 complete item, if there is one. We ignore the possibility of there
1303 being two and don't (currently) handle precedence in such grammars.
1304 When completing a grammar we ignore any item where DOT is followed by a
1305 terminal with a precedence lower than that for the itemset. Unless the
1306 terminal has right associativity, we also ignore items where the
1307 terminal has the same precedence. The result is that unwanted items are
1308 still in the itemset, but the terminal doesn't get into the go to set,
1309 so the item is ineffective.
1311 ###### complete itemset
1312 for (i = 0; i < is->items.cnt; i++) {
1313 int p = item_prod(is->items.syms[i]);
1314 int bs = item_dot(is->items.syms[i]);
1315 struct production *pr = g->productions[p];
1318 struct symset LA = INIT_SYMSET;
1319 unsigned short sn = 0;
1321 if (bs == pr->body_size)
1324 if (s->precedence && is->precedence &&
1325 is->precedence > s->precedence)
1326 /* This terminal has a low precedence and
1327 * shouldn't be shifted
1330 if (s->precedence && is->precedence &&
1331 is->precedence == s->precedence && s->assoc != Right)
1332 /* This terminal has a matching precedence and is
1333 * not Right-associative, so we mustn't shift it.
1336 if (symset_find(&done, s->num) < 0)
1337 symset_add(&done, s->num, 0);
1339 if (s->type != Nonterminal)
1345 add_first(pr, bs+1, &LA, g, &to_end);
1347 struct symset ss = set_find(g, is->items.data[i]);
1348 symset_union(&LA, &ss);
1350 sn = save_set(g, LA);
1351 LA = set_find(g, sn);
1354 /* Add productions for this symbol */
1355 for (p2 = s->first_production;
1356 p2 < g->production_count &&
1357 g->productions[p2]->head == s;
1359 int itm = item_num(p2, 0);
1360 int pos = symset_find(&is->items, itm);
1362 symset_add(&is->items, itm, sn);
1363 /* Will have re-ordered, so start
1364 * from beginning again */
1366 } else if (type >= LALR) {
1367 struct symset ss = set_find(g, is->items.data[pos]);
1368 struct symset tmp = INIT_SYMSET;
1369 struct symset *la = &LA;
1371 symset_union(&tmp, &ss);
1372 if (symset_union(&tmp, la)) {
1373 is->items.data[pos] = save_set(g, tmp);
1381 For each symbol we found (and placed in `done`) we collect all the items
1382 for which this symbol is next, and create a new itemset with "DOT"
1383 advanced over the symbol. This is then added to the collection of
1384 itemsets (or merged with a pre-existing itemset).
1386 ###### derive itemsets
1387 // Now we have a completed itemset, so we need to
1388 // compute all the 'next' itemsets and create them
1389 // if they don't exist.
1390 for (i = 0; i < done.cnt; i++) {
1392 unsigned short state;
1393 struct symbol *sym = g->symtab[done.syms[i]];
1394 enum assoc assoc = Non;
1395 unsigned short precedence = 0;
1396 struct symset newitemset = INIT_SYMSET;
1398 newitemset = INIT_DATASET;
1400 for (j = 0; j < is->items.cnt; j++) {
1401 int itm = is->items.syms[j];
1402 int p = item_prod(itm);
1403 int bp = item_dot(itm);
1404 struct production *pr = g->productions[p];
1405 unsigned short la = 0;
1408 if (bp == pr->body_size)
1410 if (pr->body[bp] != sym)
1415 la = is->items.data[j];
1416 if (bp == pr->body_size &&
1417 pr->precedence > 0 &&
1418 pr->precedence > precedence) {
1419 // new itemset is reducible and has a precedence.
1420 precedence = pr->precedence;
1423 pos = symset_find(&newitemset, item_num(p, bp));
1426 symset_add(&newitemset, item_num(p, bp), la);
1427 else if (type >= LALR) {
1428 // Need to merge la set.
1429 int la2 = newitemset.data[pos];
1431 struct symset ss = set_find(g, la2);
1432 struct symset LA = INIT_SYMSET;
1433 symset_union(&LA, &ss);
1434 ss = set_find(g, la);
1435 if (symset_union(&LA, &ss))
1436 newitemset.data[pos] = save_set(g, LA);
1442 state = add_itemset(g, newitemset, assoc, precedence, type);
1443 if (symset_find(&is->go_to, done.syms[i]) < 0)
1444 symset_add(&is->go_to, done.syms[i], state);
1447 All that is left is to create the initial itemset from production zero, and
1448 with `TK_eof` as the LA set.
1451 static void build_itemsets(struct grammar *g, enum grammar_type type)
1453 struct symset first = INIT_SYMSET;
1456 unsigned short la = 0;
1458 // LA set just has eof
1459 struct symset eof = INIT_SYMSET;
1460 symset_add(&eof, TK_eof, 0);
1461 la = save_set(g, eof);
1462 first = INIT_DATASET;
1464 // production 0, offset 0 (with no data)
1465 symset_add(&first, item_num(0, 0), la);
1466 add_itemset(g, first, Non, 0, type);
1467 for (check_again = 0, is = g->items;
1469 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1471 struct symset done = INIT_SYMSET;
1482 ### Completing the analysis.
1484 The exact process of analysis depends on which level was requested. For
1485 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1486 `LR1` we need first, but not follow. For `SLR` we need both.
1488 We don't build the "action" tables that you might expect as the parser
1489 doesn't use them. They will be calculated to some extent if needed for
1492 Once we have built everything we allocate arrays for the two lists:
1493 symbols and itemsets. This allows more efficient access during
1494 reporting. The symbols are grouped as terminals, then non-terminals,
1495 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1496 Special terminals -- meaning just EOL -- are included with the
1497 non-terminals so that they are not expected by the scanner.
1499 ###### grammar fields
1500 struct symbol **symtab;
1501 struct itemset **statetab;
1504 ###### grammar_analyse
1506 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1510 int snum = TK_reserved;
1511 for (s = g->syms; s; s = s->next)
1512 if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1516 g->first_nonterm = snum;
1517 for (s = g->syms; s; s = s->next)
1518 if (s->num < 0 && s->type != Virtual) {
1522 for (s = g->syms; s; s = s->next)
1528 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1529 for (s = g->syms; s; s = s->next)
1530 g->symtab[s->num] = s;
1539 build_itemsets(g, type);
1541 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1542 for (is = g->items; is ; is = is->next)
1543 g->statetab[is->state] = is;
1546 ## Reporting on the Grammar
1548 The purpose of the report is to give the grammar developer insight into
1549 how the grammar parser will work. It is basically a structured dump of
1550 all the tables that have been generated, plus a description of any conflicts.
1552 ###### grammar_report
1553 static int grammar_report(struct grammar *g, enum grammar_type type)
1559 return report_conflicts(g, type);
1562 Firstly we have the complete list of symbols, together with the
1563 "FIRST" set if that was generated. We add a mark to each symbol to
1564 show if it is nullable (`.`).
1568 static void report_symbols(struct grammar *g)
1572 printf("SYMBOLS + FIRST:\n");
1574 printf("SYMBOLS:\n");
1576 for (n = 0; n < g->num_syms; n++) {
1577 struct symbol *s = g->symtab[n];
1581 printf(" %c%3d%c: ",
1582 s->nullable ? '.':' ',
1583 s->num, symtypes[s->type]);
1586 printf(" (%d%s)", s->precedence,
1587 assoc_names[s->assoc]);
1589 if (g->first && s->type == Nonterminal) {
1592 for (j = 0; j < g->first[n].cnt; j++) {
1595 prtxt(g->symtab[g->first[n].syms[j]]->name);
1602 Then we have the follow sets if they were computed.
1604 static void report_follow(struct grammar *g)
1607 printf("FOLLOW:\n");
1608 for (n = 0; n < g->num_syms; n++)
1609 if (g->follow[n].cnt) {
1613 prtxt(g->symtab[n]->name);
1614 for (j = 0; j < g->follow[n].cnt; j++) {
1617 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1623 And finally the item sets. These include the GO TO tables and, for
1624 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1625 it up a bit. First the items, with production number and associativity.
1627 static void report_item(struct grammar *g, int itm)
1629 int p = item_prod(itm);
1630 int dot = item_dot(itm);
1631 struct production *pr = g->productions[p];
1635 prtxt(pr->head->name);
1637 for (i = 0; i < pr->body_size; i++) {
1638 printf(" %s", (dot == i ? ". ": ""));
1639 prtxt(pr->body[i]->name);
1641 if (dot == pr->body_size)
1644 if (pr->precedence && dot == pr->body_size)
1645 printf(" (%d%s)", pr->precedence,
1646 assoc_names[pr->assoc]);
1647 if (dot < pr->body_size &&
1648 pr->body[dot]->precedence) {
1649 struct symbol *s = pr->body[dot];
1650 printf(" [%d%s]", s->precedence,
1651 assoc_names[s->assoc]);
1656 The LA sets which are (possibly) reported with each item:
1658 static void report_la(struct grammar *g, int lanum)
1660 struct symset la = set_find(g, lanum);
1664 printf(" LOOK AHEAD(%d)", lanum);
1665 for (i = 0; i < la.cnt; i++) {
1668 prtxt(g->symtab[la.syms[i]]->name);
1673 Then the go to sets:
1675 static void report_goto(struct grammar *g, struct symset gt)
1680 for (i = 0; i < gt.cnt; i++) {
1682 prtxt(g->symtab[gt.syms[i]]->name);
1683 printf(" -> %d\n", gt.data[i]);
1687 Now we can report all the item sets complete with items, LA sets, and GO TO.
1689 static void report_itemsets(struct grammar *g)
1692 printf("ITEM SETS(%d)\n", g->states);
1693 for (s = 0; s < g->states; s++) {
1695 struct itemset *is = g->statetab[s];
1696 printf(" Itemset %d:", s);
1698 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1700 for (j = 0; j < is->items.cnt; j++) {
1701 report_item(g, is->items.syms[j]);
1702 if (is->items.data != NO_DATA)
1703 report_la(g, is->items.data[j]);
1705 report_goto(g, is->go_to);
1709 ### Reporting conflicts
1711 Conflict detection varies a lot among different analysis levels. However
1712 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1713 LR1 are also similar as they have FOLLOW or LA sets.
1717 ## conflict functions
1719 static int report_conflicts(struct grammar *g, enum grammar_type type)
1722 printf("Conflicts:\n");
1724 cnt = conflicts_lr0(g, type);
1726 cnt = conflicts_slr(g, type);
1728 printf(" - no conflicts\n");
1732 LR0 conflicts are any states which have both a reducible item and
1733 a shiftable item, or two reducible items.
1735 LR05 conflicts only occur if two possible reductions exist,
1736 as shifts always override reductions.
1738 ###### conflict functions
1739 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1743 for (i = 0; i < g->states; i++) {
1744 struct itemset *is = g->statetab[i];
1745 int last_reduce = -1;
1746 int prev_reduce = -1;
1747 int last_shift = -1;
1751 for (j = 0; j < is->items.cnt; j++) {
1752 int itm = is->items.syms[j];
1753 int p = item_prod(itm);
1754 int bp = item_dot(itm);
1755 struct production *pr = g->productions[p];
1757 if (bp == pr->body_size) {
1758 prev_reduce = last_reduce;
1762 if (pr->body[bp]->type == Terminal)
1765 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1766 printf(" State %d has both SHIFT and REDUCE:\n", i);
1767 report_item(g, is->items.syms[last_shift]);
1768 report_item(g, is->items.syms[last_reduce]);
1771 if (prev_reduce >= 0) {
1772 printf(" State %d has 2 (or more) reducible items\n", i);
1773 report_item(g, is->items.syms[prev_reduce]);
1774 report_item(g, is->items.syms[last_reduce]);
1781 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1782 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1783 in the source of the look ahead set.
1785 We build two datasets to reflect the "action" table: one which maps
1786 terminals to items where that terminal could be shifted and another
1787 which maps terminals to items that could be reduced when the terminal
1788 is in look-ahead. We report when we get conflicts between the two.
1790 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1795 for (i = 0; i < g->states; i++) {
1796 struct itemset *is = g->statetab[i];
1797 struct symset shifts = INIT_DATASET;
1798 struct symset reduce = INIT_DATASET;
1802 /* First collect the shifts */
1803 for (j = 0; j < is->items.cnt; j++) {
1804 unsigned short itm = is->items.syms[j];
1805 int p = item_prod(itm);
1806 int bp = item_dot(itm);
1807 struct production *pr = g->productions[p];
1810 if (bp >= pr->body_size ||
1811 pr->body[bp]->type != Terminal)
1816 if (s->precedence && is->precedence)
1817 /* Precedence resolves this, so no conflict */
1820 if (symset_find(&shifts, s->num) < 0)
1821 symset_add(&shifts, s->num, itm);
1823 /* Now look for reductions and conflicts */
1824 for (j = 0; j < is->items.cnt; j++) {
1825 unsigned short itm = is->items.syms[j];
1826 int p = item_prod(itm);
1827 int bp = item_dot(itm);
1828 struct production *pr = g->productions[p];
1830 if (bp < pr->body_size)
1835 la = g->follow[pr->head->num];
1837 la = set_find(g, is->items.data[j]);
1839 for (k = 0; k < la.cnt; k++) {
1840 int pos = symset_find(&shifts, la.syms[k]);
1842 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1844 prtxt(g->symtab[la.syms[k]]->name);
1846 report_item(g, shifts.data[pos]);
1847 report_item(g, itm);
1849 pos = symset_find(&reduce, la.syms[k]);
1851 symset_add(&reduce, la.syms[k], itm);
1854 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1855 prtxt(g->symtab[la.syms[k]]->name);
1857 report_item(g, itm);
1858 report_item(g, reduce.data[pos]);
1862 symset_free(shifts);
1863 symset_free(reduce);
1869 ## Generating the parser
1871 The exported part of the parser is the `parse_XX` function, where the name
1872 `XX` is based on the name of the parser files.
1874 This takes a `code_node`, a partially initialized `token_config`, and an
1875 optional `FILE` to send tracing to. The `token_config` gets the list of
1876 known words added and then is used with the `code_node` to initialize the
1879 `parse_XX` then calls the library function `parser_run` to actually
1880 complete the parse. This needs the `states` table, the `reductions`
1881 table and functions to call the various pieces of code provided in the
1882 grammar file, so they are generated first.
1884 ###### parser_generate
1886 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1887 struct code_node *pre_reduce)
1893 gen_reductions(f, g);
1894 gen_reduce(f, g, file, pre_reduce);
1897 fprintf(f, "#line 0 \"gen_parser\"\n");
1898 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1901 fprintf(f, "\tstruct token_state *tokens;\n");
1902 fprintf(f, "\tconfig->words_marks = known;\n");
1903 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1904 fprintf(f, "\ttokens = token_open(code, config);\n");
1905 fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n");
1906 fprintf(f, "\ttoken_close(tokens);\n");
1907 fprintf(f, "\treturn rv;\n");
1908 fprintf(f, "}\n\n");
1911 ### Known words table
1913 The known words table is simply an array of terminal symbols.
1914 The table of nonterminals used for tracing is a similar array.
1918 static void gen_known(FILE *f, struct grammar *g)
1921 fprintf(f, "#line 0 \"gen_known\"\n");
1922 fprintf(f, "static const char *known[] = {\n");
1923 for (i = TK_reserved;
1924 i < g->num_syms && g->symtab[i]->type == Terminal;
1926 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1927 g->symtab[i]->name.txt);
1928 fprintf(f, "};\n\n");
1931 static void gen_non_term(FILE *f, struct grammar *g)
1934 fprintf(f, "#line 0 \"gen_non_term\"\n");
1935 fprintf(f, "static const char *non_term[] = {\n");
1936 for (i = TK_reserved;
1939 if (g->symtab[i]->type == Nonterminal ||
1940 g->symtab[i]->isspecial)
1941 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1942 g->symtab[i]->name.txt);
1943 fprintf(f, "};\n\n");
1946 ### States, reductions, and the go to tables.
1948 For each state we record the go to table and the reducible production if
1949 there is one, the details of which are in a separate table of
1950 reductions. Some of the details of the reducible production are stored
1951 in the `do_reduce` function to come later. In the go to table we store
1952 the production number and in the reductions table: the body size (useful
1953 for stack management), the resulting symbol (useful for knowing how to
1954 free data later), and the size of the resulting asn object (useful for
1955 preallocating space).
1957 The go to table is stored in a simple array of `sym` and corresponding
1960 ###### exported types
1974 const struct lookup * go_to;
1979 static void gen_goto(FILE *f, struct grammar *g)
1982 fprintf(f, "#line 0 \"gen_goto\"\n");
1983 for (i = 0; i < g->states; i++) {
1984 struct symset gt = g->statetab[i]->go_to;
1989 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1991 for (j = 0; j < gt.cnt; j++)
1992 fprintf(f, "\t{ %d, %d },\n",
1993 gt.syms[j], gt.data[j]);
1998 static void gen_reductions(FILE *f, struct grammar *g)
2001 fprintf(f, "#line 0 \"gen_reductions\"\n");
2002 fprintf(f, "static const struct reduction reductions[] = {\n");
2003 for (i = 0; i < g->production_count; i++) {
2004 struct production *pr = g->productions[i];
2005 struct symbol *hd = pr->head;
2006 fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num);
2007 if (hd->struct_name.txt == NULL)
2008 fprintf(f, "0 },\n");
2010 fprintf(f, "sizeof(struct %.*s%s) },\n",
2011 hd->struct_name.len,
2012 hd->struct_name.txt,
2013 hd->isref ? "*" : "");
2015 fprintf(f, "};\n\n");
2018 static void gen_states(FILE *f, struct grammar *g)
2021 fprintf(f, "#line 0 \"gen_states\"\n");
2022 fprintf(f, "static const struct state states[] = {\n");
2023 for (i = 0; i < g->states; i++) {
2024 struct itemset *is = g->statetab[i];
2025 int j, prod = -1, prod_len;
2027 for (j = 0; j < is->items.cnt; j++) {
2028 int itm = is->items.syms[j];
2029 int p = item_prod(itm);
2030 int bp = item_dot(itm);
2031 struct production *pr = g->productions[p];
2033 if (bp < pr->body_size)
2035 /* This is what we reduce - choose longest */
2036 if (prod < 0 || prod_len < pr->body_size) {
2038 prod_len = pr->body_size;
2042 fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
2043 i, prod, is->go_to.cnt, i);
2045 fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
2047 fprintf(f, "};\n\n");
2050 ### The `do_reduce` function and the code
2052 When the parser engine decides to reduce a production, it calls
2053 `do_reduce` which runs the code that was included with the production,
2056 This code needs to be able to store data somewhere so we record the size
2057 of the data expected with each state so it can be allocated before
2058 `do_reduce` is called.
2060 In order for the code to access "global" context, we pass in the
2061 "config" pointer that was passed to the parser function. If the `struct
2062 token_config` is embedded in some larger structure, the reducing code
2063 can access the larger structure using pointer manipulation. Performing
2064 that pointer manipulation, and any other common code or variables for
2065 all reduce actions, can be provided in a code node called "reduce" which
2066 is passed around in `pre_reduce` which you might have already noticed.
2068 The code fragments require translation when written out. Any `$N` needs
2069 to be converted to a reference either to that buffer (if `$0`) or to the
2070 structure returned by a previous reduction. These pointers need to be
2071 cast to the appropriate type for each access. All this is handled in
2074 `gen_code` also allows symbol references to contain a '`<`' as in
2075 '`$<2`'. This is particularly useful for references (or pointers), but
2076 can be used with structures too. The `<` implies that the value is
2077 being moved out, so the object will not be automatically freed. It is
2078 equivalent to assigning `NULL` to the pointer or filling a structure
2081 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2082 and possibly a number. A number by itself (other than zero) selects a
2083 symbol from the body of the production. A sequence of letters selects
2084 the shortest symbol in the body which contains those letters in the
2085 given order. If a number follows the letters, then a later occurrence
2086 of that symbol is chosen. So "`$AB2`" will refer to the structure
2087 attached to the second occurrence of the shortest symbol which contains
2088 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2089 the number given is too large, then the symbol reference is not
2090 transformed, and will cause an error when the code is compiled.
2094 static int textchr(struct text t, char c, int s)
2097 for (i = s; i < t.len; i++)
2103 static int subseq_match(char *seq, int slen, struct text name)
2107 st = textchr(name, *seq, st);
2117 static int choose_sym(char **namep, int len, struct production *p)
2119 char *name = *namep;
2128 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2134 while (len > 0 && (c >= '0' && c <= '9')) {
2137 n = n * 10 + (c - '0');
2141 if (name == *namep || n > p->body_size)
2147 for (i = 0; i < p->body_size; i++) {
2148 if (!subseq_match(nam, namlen, p->body[i]->name))
2150 if (slen == 0 || p->body[i]->name.len < slen) {
2152 slen = p->body[i]->name.len;
2154 if (s >= 0 && p->body[i] != p->body[s] &&
2155 p->body[i]->name.len == p->body[s]->name.len)
2156 /* not unique, so s cannot be used */
2163 for (i = 0; i < p->body_size; i++)
2164 if (p->body[i] == p->body[s]) {
2169 if (n > 0 || i == p->body_size)
2175 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2178 char *used = calloc(1, p->body_size);
2181 fprintf(f, "\t\t\t");
2182 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2196 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2205 fprintf(f, "(*(struct %.*s*%s)ret)",
2206 p->head->struct_name.len,
2207 p->head->struct_name.txt,
2208 p->head->isref ? "*":"");
2209 else if (p->body[n-1]->type == Terminal)
2210 fprintf(f, "(*(struct token *)body[%d])",
2212 else if (p->body[n-1]->struct_name.txt == NULL)
2213 fprintf(f, "$%d", n);
2215 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2216 p->body[n-1]->struct_name.len,
2217 p->body[n-1]->struct_name.txt,
2218 p->body[n-1]->isref ? "*":"", n-1);
2224 for (i = 0; i < p->body_size; i++) {
2225 if (p->body[i]->struct_name.txt &&
2227 // assume this has been copied out
2228 if (p->body[i]->isref)
2229 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2231 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2239 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2240 struct code_node *pre_reduce)
2243 fprintf(f, "#line 1 \"gen_reduce\"\n");
2244 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2246 fprintf(f, "\tint ret_size = 0;\n");
2248 code_node_print(f, pre_reduce, file);
2250 fprintf(f, "#line 4 \"gen_reduce\"\n");
2251 fprintf(f, "\tswitch(prod) {\n");
2252 for (i = 0; i < g->production_count; i++) {
2253 struct production *p = g->productions[i];
2254 fprintf(f, "\tcase %d:\n", i);
2257 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2261 if (p->head->struct_name.txt)
2262 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2263 p->head->struct_name.len,
2264 p->head->struct_name.txt,
2265 p->head->isref ? "*":"");
2267 fprintf(f, "\t\tbreak;\n");
2269 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2274 As each non-terminal can potentially cause a different type of data
2275 structure to be allocated and filled in, we need to be able to free it when
2278 It is particularly important to have fine control over freeing during error
2279 recovery where individual stack frames might need to be freed.
2281 For this, the grammar author is required to define a `free_XX` function for
2282 each structure that is used by a non-terminal. `do_free` will call whichever
2283 is appropriate given a symbol number, and will call `free` (as is
2284 appropriate for tokens) on any terminal symbol.
2288 static void gen_free(FILE *f, struct grammar *g)
2292 fprintf(f, "#line 0 \"gen_free\"\n");
2293 fprintf(f, "static void do_free(short sym, void *asn)\n");
2295 fprintf(f, "\tif (!asn) return;\n");
2296 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2297 /* Need to handle special terminals too */
2298 for (i = 0; i < g->num_syms; i++) {
2299 struct symbol *s = g->symtab[i];
2300 if (i >= g->first_nonterm && s->type == Terminal &&
2302 fprintf(f, " || sym == %d", s->num);
2304 fprintf(f, ") {\n");
2305 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2306 fprintf(f, "\tswitch(sym) {\n");
2308 for (i = 0; i < g->num_syms; i++) {
2309 struct symbol *s = g->symtab[i];
2311 s->type != Nonterminal ||
2312 s->struct_name.txt == NULL)
2315 fprintf(f, "\tcase %d:\n", s->num);
2317 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2319 s->struct_name.txt);
2320 fprintf(f, "\t\tfree(asn);\n");
2322 fprintf(f, "\t\tfree_%.*s(asn);\n",
2324 s->struct_name.txt);
2325 fprintf(f, "\t\tbreak;\n");
2327 fprintf(f, "\t}\n}\n\n");
2330 ## The main routine.
2332 There are three key parts to the "main" function of parsergen: processing
2333 the arguments, loading the grammar file, and dealing with the grammar.
2335 ### Argument processing.
2337 Command line options allow the selection of analysis mode, name of output
2338 file, and whether or not a report should be generated. By default we create
2339 a report only if no code output was requested.
2341 The `parse_XX` function name uses the basename of the output file which
2342 should not have a suffix (`.c`). `.c` is added to the given name for the
2343 code, and `.h` is added for the header (if header text is specified in the
2350 static const struct option long_options[] = {
2351 { "LR0", 0, NULL, '0' },
2352 { "LR05", 0, NULL, '5' },
2353 { "SLR", 0, NULL, 'S' },
2354 { "LALR", 0, NULL, 'L' },
2355 { "LR1", 0, NULL, '1' },
2356 { "tag", 1, NULL, 't' },
2357 { "report", 0, NULL, 'R' },
2358 { "output", 1, NULL, 'o' },
2359 { NULL, 0, NULL, 0 }
2361 const char *options = "05SL1t:Ro:";
2363 ###### process arguments
2365 char *outfile = NULL;
2370 enum grammar_type type = LR05;
2371 while ((opt = getopt_long(argc, argv, options,
2372 long_options, NULL)) != -1) {
2387 outfile = optarg; break;
2389 tag = optarg; break;
2391 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2396 infile = argv[optind++];
2398 fprintf(stderr, "No input file given\n");
2401 if (outfile && report == 1)
2404 if (name && strchr(name, '/'))
2405 name = strrchr(name, '/')+1;
2407 if (optind < argc) {
2408 fprintf(stderr, "Excess command line arguments\n");
2412 ### Loading the grammar file
2414 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2417 Once we have extracted the code (with `mdcode`) we expect to find three
2418 or four sections: header, code, grammar, reduce.
2419 Anything else that is not excluded by the `--tag` option is an error.
2421 "header", "code", and "reduce" are optional, though it is hard to build
2422 a working parser without the first two. "grammar" must be provided.
2426 #include <sys/mman.h>
2431 static void pr_err(char *msg)
2434 fprintf(stderr, "%s\n", msg);
2438 struct section *table;
2442 fd = open(infile, O_RDONLY);
2444 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2445 infile, strerror(errno));
2448 len = lseek(fd, 0, 2);
2449 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2450 table = code_extract(file, file+len, pr_err);
2452 struct code_node *hdr = NULL;
2453 struct code_node *code = NULL;
2454 struct code_node *gram = NULL;
2455 struct code_node *pre_reduce = NULL;
2456 for (s = table; s; s = s->next) {
2457 struct text sec = s->section;
2458 if (tag && !strip_tag(&sec, tag))
2460 if (text_is(sec, "header"))
2462 else if (text_is(sec, "code"))
2464 else if (text_is(sec, "grammar"))
2466 else if (text_is(sec, "reduce"))
2467 pre_reduce = s->code;
2469 fprintf(stderr, "Unknown content section: %.*s\n",
2470 s->section.len, s->section.txt);
2475 ### Processing the input
2477 As we need to append an extension to a filename and then open it for
2478 writing, and we need to do this twice, it helps to have a separate function.
2482 static FILE *open_ext(char *base, char *ext)
2484 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2486 strcat(strcpy(fn, base), ext);
2492 If we can read the grammar, then we analyse and optionally report on it. If
2493 the report finds conflicts we will exit with an error status.
2495 ###### process input
2496 struct grammar *g = NULL;
2498 fprintf(stderr, "No grammar section provided\n");
2501 g = grammar_read(gram);
2503 fprintf(stderr, "Failure to parse grammar\n");
2508 grammar_analyse(g, type);
2510 if (grammar_report(g, type))
2514 If a "headers" section is defined, we write it out and include a declaration
2515 for the `parse_XX` function so it can be used from separate code.
2517 if (rv == 0 && hdr && outfile) {
2518 FILE *f = open_ext(outfile, ".h");
2520 code_node_print(f, hdr, infile);
2521 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2525 fprintf(stderr, "Cannot create %s.h\n",
2531 And if all goes well and an output file was provided, we create the `.c`
2532 file with the code section (if any) and the parser tables and function.
2534 if (rv == 0 && outfile) {
2535 FILE *f = open_ext(outfile, ".c");
2538 code_node_print(f, code, infile);
2539 gen_parser(f, g, infile, name, pre_reduce);
2542 fprintf(stderr, "Cannot create %s.c\n",
2548 And that about wraps it up. We need to set the locale so that UTF-8 is
2549 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2551 ###### File: parsergen.mk
2552 parsergen : parsergen.o libscanner.o libmdcode.o
2553 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2560 int main(int argc, char *argv[])
2565 setlocale(LC_ALL,"");
2567 ## process arguments
2574 ## The SHIFT/REDUCE parser
2576 Having analysed the grammar and generated all the tables, we only need
2577 the shift/reduce engine to bring it all together.
2579 ### Go to table lookup
2581 The parser generator has nicely provided us with go to tables sorted by
2582 symbol number. We need a binary search function to find a symbol in the
2585 ###### parser functions
2587 static int search(const struct state *l, int sym)
2590 int hi = l->go_to_cnt;
2594 while (lo + 1 < hi) {
2595 int mid = (lo + hi) / 2;
2596 if (l->go_to[mid].sym <= sym)
2601 if (l->go_to[lo].sym == sym)
2602 return l->go_to[lo].state;
2607 ### Memory allocation
2609 The `scanner` returns tokens in a local variable - we want them in allocated
2610 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2611 make an allocated copy as required.
2613 ###### parser includes
2616 ###### parser functions
2618 static struct token *tok_copy(struct token tk)
2620 struct token *new = malloc(sizeof(*new));
2625 ### The state stack.
2627 The core data structure for the parser is the stack. This tracks all
2628 the symbols that have been recognised or partially recognised.
2630 The stack usually won't grow very large - maybe a few tens of entries.
2631 So we dynamically grow an array as required but never bother to
2632 shrink it down again.
2634 We keep the stack as two separate allocations. One, `asn_stack`, stores
2635 the "abstract syntax nodes" which are created by each reduction. When
2636 we call `do_reduce` we need to pass an array of the `asn`s of the body
2637 of the production, and by keeping a separate `asn` stack, we can just
2638 pass a pointer into this stack.
2640 The other allocation stores all other stack fields of which there are
2641 two. The `state` is the most important one and guides the parsing
2642 process. The `sym` is nearly unnecessary as it is implicit in the
2643 state. However when we want to free entries from the `asn_stack`, it
2644 helps to know what type they are so we can call the right freeing
2645 function. The symbol leads us to the right free function through
2648 The stack is most properly seen as alternating states and symbols -
2649 states, like the 'DOT' in items, are between symbols. Each frame in
2650 our stack holds a state and the symbol that was before it. The
2651 bottom of stack holds the start state but no symbol, as nothing came
2652 before the beginning. As we need to store some value, `TK_eof` is used
2653 to mark the beginning of the file as well as the end.
2655 ###### parser functions
2671 Two operations are needed on the stack - shift (which is like push) and pop.
2673 Shift applies not only to terminals but also to non-terminals. When we
2674 reduce a production we will pop off frames corresponding to the body
2675 symbols, then push on a frame for the head of the production. This last
2676 is exactly the same process as shifting in a terminal so we use the same
2677 function for both. In both cases we provide the symbol. The state is
2678 deduced from the current top-of-stack state and the new symbol.
2680 To simplify other code we arrange for `shift` to fail if there is no `go to`
2681 state for the symbol. This is useful in basic parsing due to our design
2682 that we shift when we can, and reduce when we cannot. So the `shift`
2683 function reports if it could.
2685 `shift` is also used to push state zero onto the stack, so if the
2686 stack is empty, it always chooses zero as the next state.
2688 So `shift` finds the next state. If that succeeds it extends the
2689 allocations if needed and pushes all the information onto the stacks.
2691 ###### parser functions
2693 static int shift(struct parser *p,
2694 short sym, void *asn,
2695 const struct state states[])
2697 struct frame next = {0};
2698 int newstate = p->tos
2699 ? search(&states[p->stack[p->tos-1].state],
2705 if (p->tos >= p->stack_size) {
2706 p->stack_size += 10;
2707 p->stack = realloc(p->stack, p->stack_size
2708 * sizeof(p->stack[0]));
2709 p->asn_stack = realloc(p->asn_stack, p->stack_size
2710 * sizeof(p->asn_stack[0]));
2713 next.state = newstate;
2715 p->stack[p->tos] = next;
2716 p->asn_stack[p->tos] = asn;
2721 `pop` primarily moves the top of stack (`tos`) back down the required
2722 amount and frees any `asn` entries that need to be freed. It is called
2723 _after_ we reduce a production, just before we `shift` the nonterminal
2726 ###### parser functions
2728 static void pop(struct parser *p, int num,
2729 void(*do_free)(short sym, void *asn))
2734 for (i = 0; i < num; i++)
2735 do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2738 ### The heart of the parser.
2740 Now we have the parser. For each token we might shift it, trigger a
2741 reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL)
2742 might also be ignored. Ignoring tokens is combined with shifting.
2746 struct parser p = { 0 };
2747 struct token *tk = NULL;
2750 ###### heart of parser
2752 shift(&p, TK_eof, NULL, states);
2753 while (!accepted && p.tos > 0) {
2754 struct frame *tos = &p.stack[p.tos-1];
2756 tk = tok_copy(token_next(tokens));
2757 parser_trace(trace, &p,
2758 tk, states, non_term, config->known_count);
2760 ## try shift or ignore
2765 Indents are ignored unless they can be shifted onto the stack
2766 immediately or nothing can be shifted (in which case we reduce, and try
2767 again). The end of an indented section - the OUT token - is ignored
2768 precisely when the indent was ignored. To keep track of this we need a
2769 small stack of flags, which is easily stored as bits in an `unsigned
2770 long`. This will never overflow and the scanner only allows 20 levels
2774 unsigned long ignored_indents;
2776 NEWLINE is ignored when in an indented section of text which was not
2777 explicitly expected by the grammar. So if the most recent indent is
2778 ignored, so is any NEWLINE token.
2780 If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL
2781 token instead. If that succeeds, we make a new copy of the NEWLINE
2782 token and continue. This allows a NEWLINE to appear to be preceded by
2783 an indefinite number of EOL tokens.
2785 The token number for `EOL` cannot be statically declared, so when the
2786 parser starts we need to look through the array of non-terminals to find
2794 while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2796 p.tk_eol += TK_reserved + config->known_count;
2798 For other tokens, we shift the next token if that is possible, otherwise
2799 we try to reduce a production.
2801 ###### try shift or ignore
2803 if ((tk->num == TK_newline || tk->num == TK_out) &&
2804 (p.ignored_indents & 1)) {
2805 /* indented, so ignore OUT and NEWLINE */
2806 if (tk->num == TK_out)
2807 p.ignored_indents >>= 1;
2810 parser_trace_action(trace, "Ignore");
2814 if (shift(&p, tk->num, tk, states)) {
2815 if (tk->num == TK_out)
2816 p.ignored_indents >>= 1;
2817 if (tk->num == TK_in)
2818 p.ignored_indents <<= 1;
2820 parser_trace_action(trace, "Shift");
2824 } else if (tk->num == TK_newline &&
2825 shift(&p, p.tk_eol, tk, states)) {
2827 parser_trace_action(trace, "ShiftEOL");
2831 if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
2832 /* No indent expected here and reduce is not mandatory, so ignore IN */
2835 p.ignored_indents <<= 1;
2836 p.ignored_indents |= 1;
2837 parser_trace_action(trace, "Ignore");
2841 We have already discussed the bulk of the handling of a "reduce" action,
2842 with the `pop()` and `shift()` functions doing much of the work. There
2843 is a little more complexity needed to manage storage for the asn (Abstract
2844 Syntax Node), and also a test of whether the reduction is permitted.
2846 When we try to shift the result of reducing production-zero, it will
2847 fail because there is no next state. In this case the asn will not have
2848 been stored on the stack, so it gets stored in the `ret` variable, and we
2849 report that the input has been accepted.
2857 if (states[tos->state].reduce_prod >= 0) {
2860 const struct state *nextstate = &states[tos->state];
2861 int prod = nextstate->reduce_prod;
2862 int size = reductions[prod].size;
2863 int res_size = reductions[prod].result_size;
2865 body = p.asn_stack + (p.tos - size);
2866 res = res_size ? calloc(1, res_size) : NULL;
2867 res_size = do_reduce(prod, body, config, res);
2868 if (res_size != reductions[prod].result_size)
2870 pop(&p, size, do_free);
2871 if (!shift(&p, reductions[prod].sym, res, states)) {
2874 parser_trace_action(trace, "Accept");
2876 parser_trace_action(trace, "Reduce");
2880 If we can neither shift nor reduce we have an error to handle. There
2881 are two possible responses to an error: we can pop single frames off the
2882 stack until we find a frame that can shift `TK_error`, or we can discard
2883 the current look-ahead symbol.
2885 When we first see an error we do the first of these and set a flag to
2886 record that we are processing an error. If the normal shift/reduce
2887 tests fail to find that anything can be done from that state, we start
2888 dropping tokens until either we manage to shift one, or reach end-of-file.
2901 parser_trace_action(trace, "DISCARD");
2902 if (tk->num == TK_eof)
2907 struct token *err_tk;
2909 parser_trace_action(trace, "ERROR");
2911 err_tk = tok_copy(*tk);
2912 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2913 // discard this state
2914 pop(&p, 1, do_free);
2917 // no state accepted TK_error
2923 ###### parser includes
2928 void *parser_run(struct token_state *tokens,
2929 const struct state states[],
2930 const struct reduction reductions[],
2931 int (*do_reduce)(int, void**, struct token_config*, void*),
2932 void (*do_free)(short, void*),
2933 FILE *trace, const char *non_term[],
2934 struct token_config *config)
2943 pop(&p, p.tos, do_free);
2949 ###### exported functions
2950 void *parser_run(struct token_state *tokens,
2951 const struct state states[],
2952 const struct reduction reductions[],
2953 int (*do_reduce)(int, void**, struct token_config*, void*),
2954 void (*do_free)(short, void*),
2955 FILE *trace, const char *non_term[],
2956 struct token_config *config);
2960 Being able to visualize the parser in action can be invaluable when
2961 debugging the parser code, or trying to understand how the parse of a
2962 particular grammar progresses. The stack contains all the important
2963 state, so just printing out the stack every time around the parse loop
2964 can make it possible to see exactly what is happening.
2966 This doesn't explicitly show each SHIFT and REDUCE action. However they
2967 are easily deduced from the change between consecutive lines, and the
2968 details of each state can be found by cross referencing the states list
2969 in the stack with the "report" that parsergen can generate.
2971 For terminal symbols, we just dump the token. For non-terminals we
2972 print the name of the symbol. The look ahead token is reported at the
2973 end inside square brackets.
2975 ###### parser functions
2977 static char *reserved_words[] = {
2978 [TK_error] = "ERROR",
2981 [TK_newline] = "NEWLINE",
2985 void parser_trace(FILE *trace, struct parser *p,
2986 struct token *tk, const struct state states[],
2987 const char *non_term[], int knowns)
2992 for (i = 0; i < p->tos; i++) {
2993 struct frame *f = &p->stack[i];
2996 if (sym < TK_reserved &&
2997 reserved_words[sym] != NULL)
2998 fputs(reserved_words[sym], trace);
2999 else if (sym < TK_reserved + knowns) {
3000 struct token *t = p->asn_stack[i];
3001 text_dump(trace, t->txt, 20);
3003 fputs(non_term[sym - TK_reserved - knowns],
3007 fprintf(trace, "(%d) ", f->state);
3009 fprintf(trace, "[");
3010 if (tk->num < TK_reserved &&
3011 reserved_words[tk->num] != NULL)
3012 fputs(reserved_words[tk->num], trace);
3014 text_dump(trace, tk->txt, 20);
3015 fprintf(trace, ":%d:%d]", tk->line, tk->col);
/* Report the action (e.g. "Shift", "Reduce", "Accept") taken for the
 * current step of the parse, completing the stack line printed by
 * parser_trace().
 * `trace` may be NULL to disable tracing - callers such as the calc
 * demo pass NULL when tracing is off - so guard before printing.
 */
void parser_trace_action(FILE *trace, char *action)
{
	if (trace)
		fprintf(trace, " - %s\n", action);
}
3026 The obvious example for a parser is a calculator.
3028 As `scanner` provides a number parsing function using `libgmp`, it is not much
3029 work to perform arbitrary rational number calculations.
3031 This calculator takes one expression, or an equality test, per line.
3032 The results are printed and if any equality test fails, the program
3033 exits with an error.
3035 ###### File: parsergen.mk
3036 calc.c calc.h : parsergen parsergen.mdc
3037 ./parsergen --tag calc -o calc parsergen.mdc
3038 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3039 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3041 ./calc parsergen.mdc
3046 #include "parse_number.h"
3047 // what do we use for a demo-grammar? A calculator of course.
3059 #include <sys/mman.h>
3065 #include "scanner.h"
3070 static void free_number(struct number *n)
3076 static int text_is(struct text t, char *s)
3078 return (strlen(s) == t.len &&
3079 strncmp(s, t.txt, t.len) == 0);
3082 int main(int argc, char *argv[])
3084 int fd = open(argv[1], O_RDONLY);
3085 int len = lseek(fd, 0, 2);
3086 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3087 struct section *table = code_extract(file, file+len, NULL);
3089 struct token_config config = {
3090 .ignored = (1 << TK_line_comment)
3093 .number_chars = ".,_+-",
3097 for (s = table; s; s = s->next)
3098 if (text_is(s->section, "example: input"))
3099 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3101 struct section *t = table->next;
3102 code_free(table->code);
3114 Session -> Session Line
3117 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3118 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3119 gmp_printf(" or as a decimal: %Fg\n", fl);
3123 | Expression = Expression NEWLINE ${
3124 if (mpq_equal($1.val, $3.val))
3125 gmp_printf("Both equal %Qd\n", $1.val);
3127 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3132 | NEWLINE ${ printf("Blank line\n"); }$
3133 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3136 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3137 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3138 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3139 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3140 | Expression // Expression ${ {
3143 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3144 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3145 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3146 mpz_tdiv_q(z0, z1, z2);
3147 mpq_set_z($0.val, z0);
3148 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3150 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3151 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3156 3.1415926535 - 355/113
3158 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3160 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1