1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h` files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
66 and `Foo: grammar`. The tag `calc` is used to extract the same calculator
70 [scanner]: scanner.html
76 ###### parser includes
80 The grammar contains production sets, precedence/associativity
81 declarations, and data type declarations. These are all parsed with
82 _ad hoc_ parsing as we don't have a parser generator yet.
84 The precedence and associativity can be set for each production, but
85 can be inherited from symbols. The data type (either structure or a
86 reference to a structure) is potentially defined for each non-terminal
87 and describes what C structure is used to store information about each
91 enum assoc {Left, Right, Non};
92 char *assoc_names[] = {"Left","Right","Non"};
95 struct text struct_name;
98 unsigned short precedence;
102 unsigned short precedence;
110 The strings reported by `mdcode` and `scanner` are `struct text` which have
111 length rather than being null terminated. To help with printing and
112 comparing we define `text_is` and `prtxt`, which should possibly go in
113 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
114 which might contain control characters.
116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
117 because that is what we need to detect tags.
120 static int text_is(struct text t, char *s)
122 return (strlen(s) == t.len &&
123 strncmp(s, t.txt, t.len) == 0);
125 static void prtxt(struct text t)
127 printf("%.*s", t.len, t.txt);
130 static int strip_tag(struct text *t, char *tag)
132 int skip = strlen(tag) + 1;
133 if (skip >= t->len ||
134 strncmp(t->txt, tag, skip-1) != 0 ||
135 t->txt[skip-1] != ':')
137 while (skip < t->len && t->txt[skip] == ' ')
146 Productions are comprised primarily of symbols - terminal and
147 non-terminal. We do not make a syntactic distinction between the two,
148 though non-terminals must be identifiers. Non-terminal symbols are
149 those which appear in the head of a production, terminal symbols are
150 those which don't. There are also "virtual" symbols used for precedence
151 marking discussed later, and sometimes we won't know what type a symbol
154 ###### forward declarations
155 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
156 char *symtypes = "UVTN";
160 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
161 table of known symbols and the resulting parser will report them as
162 `TK_reserved + N`. A small set of identifiers are reserved for the
163 different token types that `scanner` can report.
166 static char *reserved_words[] = {
167 [TK_error] = "ERROR",
168 [TK_number] = "NUMBER",
169 [TK_ident] = "IDENTIFIER",
171 [TK_string] = "STRING",
172 [TK_multi_string] = "MULTI_STRING",
175 [TK_newline] = "NEWLINE",
181 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
182 recognised. The former is automatically expected at the end of the text
183 being parsed. The latter are always ignored by the parser.
185 All of these symbols are stored in a simple symbol table. We use the
186 `struct text` from `mdcode` to store the name and link them together
187 into a sorted list using an insertion sort.
189 We don't have separate `find` and `insert` functions as any symbol we
190 find needs to be remembered. We simply expect `find` to always return a
191 symbol, but its type might be `Unknown`.
200 ###### grammar fields
205 static struct symbol *sym_find(struct grammar *g, struct text s)
207 struct symbol **l = &g->syms;
212 (cmp = text_cmp((*l)->name, s)) < 0)
216 n = calloc(1, sizeof(*n));
225 static void symbols_init(struct grammar *g)
227 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
229 for (i = 0; i < entries; i++) {
232 t.txt = reserved_words[i];
235 t.len = strlen(t.txt);
242 ### Data types and precedence.
244 Data type specification and precedence specification are both
245 introduced by a dollar sign at the start of the line. If the next
246 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
247 precedence, otherwise it specifies a data type.
249 The data type name is simply stored and applied to the head of all
250 subsequent productions. It must be the name of a structure optionally
251 preceded by an asterisk which means a reference or "pointer". So
252 `$expression` maps to `struct expression` and `$*statement` maps to
253 `struct statement *`.
255 Any productions given before the first data type declaration will have
256 no data type associated with them and can carry no information. In
257 order to allow other non-terminals to have no type, the data type
258 `$void` can be given. This does *not* mean that `struct void` will be
259 used, but rather that no type will be associated with future
262 The precedence line must contain a list of symbols - typically
263 terminal symbols, but not necessarily. It can only contain symbols
264 that have not been seen yet, so precedence declaration must precede
265 the productions that they affect.
267 A precedence line may also contain a "virtual" symbol which is an
268 identifier preceded by `$$`. Virtual terminals carry precedence
269 information but are not included in the grammar. A production can
270 declare that it inherits the precedence of a given virtual symbol.
272 This use for `$$` precludes it from being used as a symbol in the
273 described language. Two other symbols: `${` and `}$` are also
276 Each new precedence line introduces a new precedence level and
277 declares how it associates. This level is stored in each symbol
278 listed and may be inherited by any production which uses the symbol. A
279 production inherits from the last symbol which has a precedence.
281 ###### grammar fields
282 struct text current_type;
287 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
288 static const char *known[] = { "$$", "${", "}$" };
291 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
293 struct token t = token_next(ts);
298 if (t.num != TK_ident) {
299 err = "type or assoc expected after '$'";
302 if (text_is(t.txt, "LEFT"))
304 else if (text_is(t.txt, "RIGHT"))
306 else if (text_is(t.txt, "NON"))
309 g->current_type = t.txt;
310 g->type_isref = isref;
311 if (text_is(t.txt, "void"))
312 g->current_type.txt = NULL;
314 if (t.num != TK_newline) {
315 err = "Extra tokens after type name";
322 err = "$* cannot be followed by a precedence";
326 // This is a precedence line, need some symbols.
330 while (t.num != TK_newline) {
331 enum symtype type = Terminal;
333 if (t.num == TK_virtual) {
336 if (t.num != TK_ident) {
337 err = "$$ must be followed by a word";
340 } else if (t.num != TK_ident &&
342 err = "Illegal token in precedence line";
345 s = sym_find(g, t.txt);
346 if (s->type != Unknown) {
347 err = "Symbols in precedence line must not already be known.";
351 s->precedence = g->prec_levels;
357 err = "No symbols given on precedence line";
361 while (t.num != TK_newline && t.num != TK_eof)
368 A production either starts with an identifier which is the head
369 non-terminal, or a vertical bar (`|`) in which case this production
370 uses the same head as the previous one. The identifier must be
371 followed by a `->` mark. All productions for a given non-terminal must
372 be grouped together with the `nonterminal ->` given only once.
374 After this a (possibly empty) sequence of identifiers and marks form
375 the body of the production. A virtual symbol may be given after the
376 body by preceding it with `$$`. If a virtual symbol is given, the
377 precedence of the production is that for the virtual symbol. If none
378 is given, the precedence is inherited from the last symbol in the
379 production which has a precedence specified.
381 After the optional precedence may come the `${` mark. This indicates
382 the start of a code fragment. If present, this must be on the same
383 line as the start of the production.
385 All of the text from the `${` through to the matching `}$` is
386 collected and forms the code-fragment for the production. It must all
387 be in one `code_node` of the literate code. The `}$` must be
388 at the end of a line.
390 Text in the code fragment will undergo substitutions where `$N` or
391 `$<N`, for some numeric `N`, will be replaced with a variable holding
392 the parse information for the particular symbol in the production.
393 `$0` is the head of the production, `$1` is the first symbol of the
394 body, etc. The type of `$N` for a terminal symbol is `struct token`.
395 For a non-terminal, it is whatever has been declared for that symbol.
396 The `<` may be included for symbols declared as storing a reference
397 (not a structure) and means that the reference is being moved out, so
398 it will not automatically be freed.
400 While building productions we will need to add to an array which needs to
404 static void array_add(void *varray, int *cnt, void *new)
406 void ***array = varray;
409 current = ((*cnt-1) | (step-1))+1;
410 if (*cnt == current) {
413 *array = realloc(*array, current * sizeof(void*));
415 (*array)[*cnt] = new;
419 Collecting the code fragment simply involves reading tokens until we
420 hit the end token `}$`, and noting the character position of the start and
424 static struct text collect_code(struct token_state *state,
429 code.txt = start.txt.txt + start.txt.len;
431 t = token_next(state);
432 while (t.node == start.node &&
433 t.num != TK_close && t.num != TK_error &&
435 if (t.num == TK_close && t.node == start.node)
436 code.len = t.txt.txt - code.txt;
442 Now we have all the bits we need to parse a full production.
447 ###### grammar fields
448 struct production **productions;
449 int production_count;
451 ###### production fields
453 struct symbol **body;
459 int first_production;
462 static char *parse_production(struct grammar *g,
464 struct token_state *state)
466 /* Head has already been parsed. */
469 struct production p, *pp;
471 memset(&p, 0, sizeof(p));
473 tk = token_next(state);
474 while (tk.num == TK_ident || tk.num == TK_mark) {
475 struct symbol *bs = sym_find(g, tk.txt);
476 if (bs->type == Unknown)
478 if (bs->type == Virtual) {
479 err = "Virtual symbol not permitted in production";
482 if (bs->precedence) {
483 p.precedence = bs->precedence;
486 array_add(&p.body, &p.body_size, bs);
487 tk = token_next(state);
489 if (tk.num == TK_virtual) {
491 tk = token_next(state);
492 if (tk.num != TK_ident) {
493 err = "word required after $$";
496 vs = sym_find(g, tk.txt);
497 if (vs->type != Virtual) {
498 err = "symbol after $$ must be virtual";
501 p.precedence = vs->precedence;
503 tk = token_next(state);
505 if (tk.num == TK_open) {
506 p.code_line = tk.line;
507 p.code = collect_code(state, tk);
508 if (p.code.txt == NULL) {
509 err = "code fragment not closed properly";
512 tk = token_next(state);
514 if (tk.num != TK_newline && tk.num != TK_eof) {
515 err = "stray tokens at end of line";
518 pp = malloc(sizeof(*pp));
520 array_add(&g->productions, &g->production_count, pp);
523 while (tk.num != TK_newline && tk.num != TK_eof)
524 tk = token_next(state);
528 With the ability to parse production and dollar-lines, we have nearly all
529 that we need to parse a grammar from a `code_node`.
531 The head of the first production will effectively be the `start` symbol of
532 the grammar. However it won't _actually_ be so. Processing the grammar is
533 greatly simplified if the real start symbol only has a single production,
534 and expects `$eof` as the final terminal. So when we find the first
535 explicit production we insert an extra production as production zero which
538 ###### Example: production 0
541 where `START` is the first non-terminal given.
543 ###### create production zero
544 struct production *p = calloc(1,sizeof(*p));
545 struct text start = {"$start",6};
546 struct text eof = {"$eof",4};
547 struct text code = {"$0 = $<1;", 9};
548 p->head = sym_find(g, start);
549 p->head->type = Nonterminal;
550 p->head->struct_name = g->current_type;
551 p->head->isref = g->type_isref;
552 if (g->current_type.txt)
554 array_add(&p->body, &p->body_size, head);
555 array_add(&p->body, &p->body_size, sym_find(g, eof));
556 p->head->first_production = g->production_count;
557 array_add(&g->productions, &g->production_count, p);
559 Now we are ready to read in the grammar. We ignore comments
560 and strings so that the marks which introduce them can be
561 used as terminals in the grammar. We don't ignore numbers
562 even though we don't allow them as that causes the scanner
563 to produce errors that the parser is better positioned to handle.
566 static struct grammar *grammar_read(struct code_node *code)
568 struct token_config conf = {
571 .words_marks = known,
572 .known_count = sizeof(known)/sizeof(known[0]),
574 .ignored = (1 << TK_line_comment)
575 | (1 << TK_block_comment)
578 | (1 << TK_multi_string)
583 struct token_state *state = token_open(code, &conf);
585 struct symbol *head = NULL;
589 g = calloc(1, sizeof(*g));
592 for (tk = token_next(state); tk.num != TK_eof;
593 tk = token_next(state)) {
594 if (tk.num == TK_newline)
596 if (tk.num == TK_ident) {
598 head = sym_find(g, tk.txt);
599 if (head->type == Nonterminal)
600 err = "This non-terminal has already be used.";
601 else if (head->type == Virtual)
602 err = "Virtual symbol not permitted in head of production";
604 head->type = Nonterminal;
605 head->struct_name = g->current_type;
606 head->isref = g->type_isref;
607 if (g->production_count == 0) {
608 ## create production zero
610 head->first_production = g->production_count;
611 tk = token_next(state);
612 if (tk.num == TK_mark &&
613 text_is(tk.txt, "->"))
614 err = parse_production(g, head, state);
616 err = "'->' missing in production";
618 } else if (tk.num == TK_mark
619 && text_is(tk.txt, "|")) {
620 // another production for same non-term
622 err = parse_production(g, head, state);
624 err = "First production must have a head";
625 } else if (tk.num == TK_mark
626 && text_is(tk.txt, "$")) {
627 err = dollar_line(state, g, 0);
628 } else if (tk.num == TK_mark
629 && text_is(tk.txt, "$*")) {
630 err = dollar_line(state, g, 1);
632 err = "Unrecognised token at start of line.";
640 fprintf(stderr, "Error at line %d: %s\n",
647 ## Analysing the grammar
649 The central task in analysing the grammar is to determine a set of
650 states to drive the parsing process. This will require finding
651 various sets of symbols and of "items". Some of these sets will need
652 to attach information to each element in the set, so they are more
655 Each "item" may have a 'look-ahead' or `LA` set associated with
656 it. Many of these will be the same as each other. To avoid memory
657 wastage, and to simplify some comparisons of sets, these sets will be
658 stored in a list of unique sets, each assigned a number.
660 Once we have the data structures in hand to manage these sets and
661 lists, we can start setting the 'nullable' flag, build the 'FIRST'
662 sets, and then create the item sets which define the various states.
666 Though we don't only store symbols in these sets, they are the main
667 things we store, so they are called symbol sets or "symsets".
669 A symset has a size, an array of shorts, and an optional array of data
670 values, which are also shorts. If the array of data is not present,
671 we store an impossible pointer, as `NULL` is used to indicate that no
672 memory has been allocated yet.
677 unsigned short *syms, *data;
679 #define NO_DATA ((unsigned short *)1)
680 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
681 const struct symset INIT_DATASET = { 0, NULL, NULL };
683 The arrays of shorts are allocated in blocks of 8 and are kept sorted
684 by using an insertion sort. We don't explicitly record the amount of
685 allocated space as it can be derived directly from the current `cnt` using
686 `((cnt - 1) | 7) + 1`.
689 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
692 int current = ((s->cnt-1) | 7) + 1;
693 if (current == s->cnt) {
695 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
696 if (s->data != NO_DATA)
697 s->data = realloc(s->data, sizeof(*s->data) * current);
700 while (i > 0 && s->syms[i-1] > key) {
701 s->syms[i] = s->syms[i-1];
702 if (s->data != NO_DATA)
703 s->data[i] = s->data[i-1];
707 if (s->data != NO_DATA)
712 Finding a symbol (or item) in a `symset` uses a simple binary search.
713 We return the index where the value was found (so data can be accessed),
714 or `-1` to indicate failure.
716 static int symset_find(struct symset *ss, unsigned short key)
723 while (lo + 1 < hi) {
724 int mid = (lo + hi) / 2;
725 if (ss->syms[mid] <= key)
730 if (ss->syms[lo] == key)
735 We will often want to form the union of two symsets. When we do, we
736 will often want to know if anything changed (as that might mean there
737 is more work to do). So `symset_union` reports whether anything was
738 added to the first set. We use a slow quadratic approach as these
739 sets don't really get very big. If profiling shows this to be a problem it
740 can be optimised later.
742 static int symset_union(struct symset *a, struct symset *b)
746 for (i = 0; i < b->cnt; i++)
747 if (symset_find(a, b->syms[i]) < 0) {
748 unsigned short data = 0;
749 if (b->data != NO_DATA)
751 symset_add(a, b->syms[i], data);
757 And of course we must be able to free a symset.
759 static void symset_free(struct symset ss)
762 if (ss.data != NO_DATA)
768 Some symsets are simply stored somewhere appropriate in the `grammar`
769 data structure, others need a bit of indirection. This applies
770 particularly to "LA" sets. These will be paired with "items" in an
771 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
772 stored in the `data` field, so we need a mapping from a small (short)
773 number to an LA `symset`.
775 As mentioned earlier this involves creating a list of unique symsets.
777 For now, we just use a linear list sorted by insertion. A skiplist
778 would be more efficient and may be added later.
785 struct setlist *next;
788 ###### grammar fields
789 struct setlist *sets;
794 static int ss_cmp(struct symset a, struct symset b)
797 int diff = a.cnt - b.cnt;
800 for (i = 0; i < a.cnt; i++) {
801 diff = (int)a.syms[i] - (int)b.syms[i];
808 static int save_set(struct grammar *g, struct symset ss)
810 struct setlist **sl = &g->sets;
814 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
821 s = malloc(sizeof(*s));
830 Finding a set by number is currently performed by a simple linear search.
831 If this turns out to hurt performance, we can store the sets in a dynamic
832 array like the productions.
834 static struct symset set_find(struct grammar *g, int num)
836 struct setlist *sl = g->sets;
837 while (sl && sl->num != num)
843 ### Setting `nullable`
845 We set `nullable` on the head symbol for any production for which all
846 the body symbols (if any) are nullable. As this is a recursive
847 definition, any change in the `nullable` setting means that we need to
848 re-evaluate where it needs to be set.
850 We simply loop around performing the same calculations until no more
857 static void set_nullable(struct grammar *g)
860 while (check_again) {
863 for (p = 0; p < g->production_count; p++) {
864 struct production *pr = g->productions[p];
867 if (pr->head->nullable)
869 for (s = 0; s < pr->body_size; s++)
870 if (! pr->body[s]->nullable)
872 if (s == pr->body_size) {
873 pr->head->nullable = 1;
880 ### Setting `can_eol` and `line_like`
882 In order to be able to ignore newline tokens when not relevant, but
883 still include them in the parse when needed, we will need to know
884 which states can start a "line-like" section of code. We ignore
885 newlines when there is an indent since the most recent start of a
888 To know which symbols are line-like, we first need to know which
889 symbols start with a NEWLINE token. Any symbol which is followed by a
890 NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
891 Certainly when trying to parse one of these we must take note of NEWLINEs.
893 Clearly the `TK_newline` token can start with a NEWLINE. Any symbol
894 which is the head of a production that contains a starts-with-NEWLINE
895 symbol preceded only by nullable symbols is also a
896 starts-with-NEWLINE symbol. We use a new field `can_eol` to record
897 this attribute of symbols, and compute it in a repetitive manner
898 similar to `set_nullable`.
900 Once we have that, we can determine which symbols are `line_like` by
901 seeing which are followed by a `can_eol` symbol in any production.
908 static void set_can_eol(struct grammar *g)
911 g->symtab[TK_newline]->can_eol = 1;
912 while (check_again) {
915 for (p = 0; p < g->production_count; p++) {
916 struct production *pr = g->productions[p];
919 if (pr->head->can_eol)
922 for (s = 0 ; s < pr->body_size; s++) {
923 if (pr->body[s]->can_eol) {
924 pr->head->can_eol = 1;
928 if (!pr->body[s]->nullable)
935 static void set_line_like(struct grammar *g)
938 for (p = 0; p < g->production_count; p++) {
939 struct production *pr = g->productions[p];
942 for (s = 1; s < pr->body_size; s++)
943 if (pr->body[s]->can_eol)
944 pr->body[s-1]->line_like = 1;
948 ### Building the `first` sets
950 When calculating what can follow a particular non-terminal, we will need to
951 know what the "first" terminal in any subsequent non-terminal might be. So
952 we calculate the `first` set for every non-terminal and store them in an
953 array. We don't bother recording the "first" set for terminals as they are
956 As the "first" for one symbol might depend on the "first" of another,
957 we repeat the calculations until no changes happen, just like with
958 `set_nullable`. This makes use of the fact that `symset_union`
959 reports if any change happens.
961 The core of this, which finds the "first" of part of a production body,
962 will be reused for computing the "follow" sets, so we split it out
963 into a separate function.
965 ###### grammar fields
966 struct symset *first;
970 static int add_first(struct production *pr, int start,
971 struct symset *target, struct grammar *g,
976 for (s = start; s < pr->body_size; s++) {
977 struct symbol *bs = pr->body[s];
978 if (bs->type == Terminal) {
979 if (symset_find(target, bs->num) < 0) {
980 symset_add(target, bs->num, 0);
984 } else if (symset_union(target, &g->first[bs->num]))
990 *to_end = (s == pr->body_size);
994 static void build_first(struct grammar *g)
998 g->first = calloc(g->num_syms, sizeof(g->first[0]));
999 for (s = 0; s < g->num_syms; s++)
1000 g->first[s] = INIT_SYMSET;
1002 while (check_again) {
1005 for (p = 0; p < g->production_count; p++) {
1006 struct production *pr = g->productions[p];
1007 struct symset *head = &g->first[pr->head->num];
1009 if (add_first(pr, 0, head, g, NULL))
1015 ### Building the `follow` sets.
1017 There are two different situations when we will want to generate "follow"
1018 sets. If we are doing an SLR analysis, we want to generate a single
1019 "follow" set for each non-terminal in the grammar. That is what is
1020 happening here. If we are doing an LALR or LR analysis we will want
1021 to generate a separate "LA" set for each item. We do that later
1022 in state generation.
1024 There are two parts to generating a "follow" set. Firstly we look at
1025 every place that any non-terminal appears in the body of any
1026 production, and we find the set of possible "first" symbols after
1027 there. This is done using `add_first` above and only needs to be done
1028 once as the "first" sets are now stable and will not change.
1032 for (p = 0; p < g->production_count; p++) {
1033 struct production *pr = g->productions[p];
1036 for (b = 0; b < pr->body_size - 1; b++) {
1037 struct symbol *bs = pr->body[b];
1038 if (bs->type == Terminal)
1040 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1044 The second part is to add the "follow" set of the head of a production
1045 to the "follow" sets of the final symbol in the production, and any
1046 other symbol which is followed only by `nullable` symbols. As this
1047 depends on "follow" itself we need to repeatedly perform the process
1048 until no further changes happen.
1052 for (again = 0, p = 0;
1053 p < g->production_count;
1054 p < g->production_count-1
1055 ? p++ : again ? (again = 0, p = 0)
1057 struct production *pr = g->productions[p];
1060 for (b = pr->body_size - 1; b >= 0; b--) {
1061 struct symbol *bs = pr->body[b];
1062 if (bs->type == Terminal)
1064 if (symset_union(&g->follow[bs->num],
1065 &g->follow[pr->head->num]))
1072 We now just need to create and initialise the `follow` list to get a
1075 ###### grammar fields
1076 struct symset *follow;
1079 static void build_follow(struct grammar *g)
1082 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1083 for (s = 0; s < g->num_syms; s++)
1084 g->follow[s] = INIT_SYMSET;
1088 ### Building itemsets and states
1090 There are three different levels of detail that can be chosen for
1091 building the itemsets and states for the LR grammar. They are:
1093 1. LR(0) or SLR(1), where no look-ahead is considered.
1094 2. LALR(1) where we build look-ahead sets with each item and merge
1095 the LA sets when we find two paths to the same "kernel" set of items.
1096 3. LR(1) where different look-ahead for any item in the set means
1097 a different state must be created.
1099 ###### forward declarations
1100 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1102 We need to be able to look through existing states to see if a newly
1103 generated state already exists. For now we use a simple sorted linked
1106 An item is a pair of numbers: the production number and the position of
1107 "DOT", which is an index into the body of the production.
1108 As the numbers are not enormous we can combine them into a single "short"
1109 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1110 production number (so 4000 productions with maximum of 15 symbols in the
1113 Comparing the itemsets will be a little different to comparing symsets
1114 as we want to do the lookup after generating the "kernel" of an
1115 itemset, so we need to ignore the offset=zero items which are added during
1118 To facilitate this, we modify the "DOT" number so that "0" sorts to
1119 the end of the list in the symset, and then only compare items before
1123 static inline unsigned short item_num(int production, int index)
1125 return production | ((31-index) << 11);
1127 static inline int item_prod(unsigned short item)
1129 return item & 0x7ff;
1131 static inline int item_index(unsigned short item)
1133 return (31-(item >> 11)) & 0x1f;
1136 For LR(1) analysis we need to compare not just the itemset in a state
1137 but also the LA sets. As we assign each unique LA set a number, we
1138 can just compare the symset and the data values together.
1141 static int itemset_cmp(struct symset a, struct symset b,
1142 enum grammar_type type)
1148 i < a.cnt && i < b.cnt &&
1149 item_index(a.syms[i]) > 0 &&
1150 item_index(b.syms[i]) > 0;
1152 int diff = a.syms[i] - b.syms[i];
1156 diff = a.data[i] - b.data[i];
1161 if (i == a.cnt || item_index(a.syms[i]) == 0)
1165 if (i == b.cnt || item_index(b.syms[i]) == 0)
1171 if (type < LR1 || av == -1)
1174 a.data[i] - b.data[i];
1177 And now we can build the list of itemsets. The lookup routine returns
1178 both a success flag and a pointer to where in the list an insert
1179 should happen, so we don't need to search a second time.
1181 FIXME: document min_prefix
1185 struct itemset *next;
1187 struct symset items;
1188 struct symset go_to;
1190 unsigned short precedence;
1196 ###### grammar fields
1197 struct itemset *items;
1201 static int itemset_find(struct grammar *g, struct itemset ***where,
1202 struct symset kernel, enum grammar_type type)
1204 struct itemset **ip;
1206 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1207 struct itemset *i = *ip;
1209 diff = itemset_cmp(i->items, kernel, type);
1222 Adding an itemset may require merging the LA sets if LALR analysis is
1223 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1224 clear the `completed` flag so that the dependants of this itemset will be
1225 recalculated and their LA sets updated.
1227 `add_itemset` must consume the symsets it is passed, either by adding
1228 them to a data structure, or freeing them.
1230 static int add_itemset(struct grammar *g, struct symset ss,
1231 enum assoc assoc, unsigned short precedence,
1232 enum grammar_type type)
1234 struct itemset **where, *is;
1236 int found = itemset_find(g, &where, ss, type);
1238 is = calloc(1, sizeof(*is));
1239 is->state = g->states;
1243 is->precedence = precedence;
1245 is->go_to = INIT_DATASET;
1254 for (i = 0; i < ss.cnt; i++) {
1255 struct symset temp = INIT_SYMSET, s;
1256 if (ss.data[i] == is->items.data[i])
1258 s = set_find(g, is->items.data[i]);
1259 symset_union(&temp, &s);
1260 s = set_find(g, ss.data[i]);
1261 if (symset_union(&temp, &s)) {
1262 is->items.data[i] = save_set(g, temp);
1273 To build all the itemsets, we first insert the initial itemset made
1274 from production zero, complete each itemset, and then generate new
1275 itemsets from old until no new ones can be made.
1277 Completing an itemset means finding all the items where "DOT" is followed by
1278 a nonterminal and adding "DOT=0" items for every production from that
1279 non-terminal - providing each item hasn't already been added.
1281 If LA sets are needed, the LA set for each new item is found using
1282 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1283 be supplemented by the LA set for the item which produces the new item.
1285 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1286 is used in the next stage.
1287 If any of these symbols are flagged as starting a line, then this
1288 state must be a `starts_line` state so now is a good time to record that.
1290 When itemsets are created we assign a precedence to the itemset from
1291 the complete item, if there is one. We ignore the possibility of
1292 there being two and don't (currently) handle precedence in such
1293 grammars. When completing a grammar we ignore any item where DOT is
1294 followed by a terminal with a precedence lower (numerically higher)
1295 than that for the itemset. Unless the terminal has right
1296 associativity, we also ignore items where the terminal has the same
1297 precedence. The result is that unwanted items are still in the
1298 itemset, but the terminal doesn't get into the go to set, so the item
1301 ###### complete itemset
1302 for (i = 0; i < is->items.cnt; i++) {
1303 int p = item_prod(is->items.syms[i]);
1304 int bs = item_index(is->items.syms[i]);
1305 struct production *pr = g->productions[p];
1308 struct symset LA = INIT_SYMSET;
1309 unsigned short sn = 0;
1311 if (is->min_prefix == 0 ||
1312 (bs > 0 && bs < is->min_prefix))
1313 is->min_prefix = bs;
1314 if (bs == pr->body_size)
1317 if (s->precedence && is->precedence &&
1318 is->precedence < s->precedence)
1319 /* This terminal has a low precedence and
1320 * shouldn't be shifted
1323 if (s->precedence && is->precedence &&
1324 is->precedence == s->precedence && s->assoc != Right)
1325 /* This terminal has a matching precedence and is
1326 * not Right-associative, so we mustn't shift it.
1329 if (symset_find(&done, s->num) < 0) {
1330 symset_add(&done, s->num, 0);
1332 is->starts_line = 1;
1334 if (s->type != Nonterminal)
1340 add_first(pr, bs+1, &LA, g, &to_end);
1342 struct symset ss = set_find(g, is->items.data[i]);
1343 symset_union(&LA, &ss);
1345 sn = save_set(g, LA);
1346 LA = set_find(g, sn);
1349 /* Add productions for this symbol */
1350 for (p2 = s->first_production;
1351 p2 < g->production_count &&
1352 g->productions[p2]->head == s;
1354 int itm = item_num(p2, 0);
1355 int pos = symset_find(&is->items, itm);
1357 symset_add(&is->items, itm, sn);
1358 /* Will have re-ordered, so start
1359 * from beginning again */
1361 } else if (type >= LALR) {
1362 struct symset ss = set_find(g, is->items.data[pos]);
1363 struct symset tmp = INIT_SYMSET;
1365 symset_union(&tmp, &ss);
1366 if (symset_union(&tmp, &LA)) {
1367 is->items.data[pos] = save_set(g, tmp);
1375 For each symbol we found (and placed in `done`) we collect all the items for
1376 which this symbol is next, and create a new itemset with "DOT" advanced over
1377 the symbol. This is then added to the collection of itemsets (or merged
1378 with a pre-existing itemset).
1380 ###### derive itemsets
1381 // Now we have a completed itemset, so we need to
1382 // compute all the 'next' itemsets and create them
1383 // if they don't exist.
1384 for (i = 0; i < done.cnt; i++) {
1386 unsigned short state;
1387 struct symbol *sym = g->symtab[done.syms[i]];
1388 enum assoc assoc = Non;
1389 unsigned short precedence = 0;
1390 struct symset newitemset = INIT_SYMSET;
1392 newitemset = INIT_DATASET;
1394 for (j = 0; j < is->items.cnt; j++) {
1395 int itm = is->items.syms[j];
1396 int p = item_prod(itm);
1397 int bp = item_index(itm);
1398 struct production *pr = g->productions[p];
1399 unsigned short la = 0;
1402 if (bp == pr->body_size)
1404 if (pr->body[bp] != sym)
1407 la = is->items.data[j];
1408 pos = symset_find(&newitemset, pr->head->num);
1409 if (bp + 1 == pr->body_size &&
1410 pr->precedence > 0 &&
1412 pr->precedence < precedence)) {
1413 // new itemset is reducible and has a precedence.
1414 precedence = pr->precedence;
1418 symset_add(&newitemset, item_num(p, bp+1), la);
1419 else if (type >= LALR) {
1420 // Need to merge la set.
1421 int la2 = newitemset.data[pos];
1423 struct symset ss = set_find(g, la2);
1424 struct symset LA = INIT_SYMSET;
1425 symset_union(&LA, &ss);
1426 ss = set_find(g, la);
1427 if (symset_union(&LA, &ss))
1428 newitemset.data[pos] = save_set(g, LA);
1434 state = add_itemset(g, newitemset, assoc, precedence, type);
1435 if (symset_find(&is->go_to, done.syms[i]) < 0)
1436 symset_add(&is->go_to, done.syms[i], state);
1439 All that is left is to create the initial itemset from production zero, and
1440 with `TK_eof` as the LA set.
1443 static void build_itemsets(struct grammar *g, enum grammar_type type)
1445 struct symset first = INIT_SYMSET;
1448 unsigned short la = 0;
1450 // LA set just has eof
1451 struct symset eof = INIT_SYMSET;
1452 symset_add(&eof, TK_eof, 0);
1453 la = save_set(g, eof);
1454 first = INIT_DATASET;
1456 // production 0, offset 0 (with no data)
1457 symset_add(&first, item_num(0, 0), la);
1458 add_itemset(g, first, Non, 0, type);
1459 for (again = 0, is = g->items;
1461 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1463 struct symset done = INIT_SYMSET;
1474 ### Completing the analysis.
1476 The exact process of analysis depends on which level was requested. For
1477 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1478 `LR1` we need first, but not follow. For `SLR` we need both.
1480 We don't build the "action" tables that you might expect as the parser
1481 doesn't use them. They will be calculated to some extent if needed for
1484 Once we have built everything we allocate arrays for the two lists:
1485 symbols and itemsets. This allows more efficient access during reporting.
1486 The symbols are grouped as terminals and non-terminals and we record the
1487 changeover point in `first_nonterm`.
1489 ###### grammar fields
1490 struct symbol **symtab;
1491 struct itemset **statetab;
1494 ###### grammar_analyse
1496 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1500 int snum = TK_reserved;
1501 for (s = g->syms; s; s = s->next)
1502 if (s->num < 0 && s->type == Terminal) {
1506 g->first_nonterm = snum;
1507 for (s = g->syms; s; s = s->next)
1513 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1514 for (s = g->syms; s; s = s->next)
1515 g->symtab[s->num] = s;
1526 build_itemsets(g, type);
1528 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1529 for (is = g->items; is ; is = is->next)
1530 g->statetab[is->state] = is;
1533 ## Reporting on the Grammar
1535 The purpose of the report is to give the grammar developer insight into
1536 how the grammar parser will work. It is basically a structured dump of
1537 all the tables that have been generated, plus a description of any conflicts.
1539 ###### grammar_report
1540 static int grammar_report(struct grammar *g, enum grammar_type type)
1546 return report_conflicts(g, type);
1549 Firstly we have the complete list of symbols, together with the
1550 "FIRST" set if that was generated. We add a mark to each symbol to
1551 show if it can end in a newline (`>`), if it is considered to be
1552 "line-like" (`<`), or if it is nullable (`.`).
1556 static void report_symbols(struct grammar *g)
1560 printf("SYMBOLS + FIRST:\n");
1562 printf("SYMBOLS:\n");
1564 for (n = 0; n < g->num_syms; n++) {
1565 struct symbol *s = g->symtab[n];
1569 printf(" %c%c%c%3d%c: ",
1570 s->nullable ? '.':' ',
1571 s->can_eol ? '>':' ',
1572 s->line_like ? '<':' ',
1573 s->num, symtypes[s->type]);
1576 printf(" (%d%s)", s->precedence,
1577 assoc_names[s->assoc]);
1579 if (g->first && s->type == Nonterminal) {
1582 for (j = 0; j < g->first[n].cnt; j++) {
1585 prtxt(g->symtab[g->first[n].syms[j]]->name);
1592 Then we have the follow sets if they were computed.
1594 static void report_follow(struct grammar *g)
1597 printf("FOLLOW:\n");
1598 for (n = 0; n < g->num_syms; n++)
1599 if (g->follow[n].cnt) {
1603 prtxt(g->symtab[n]->name);
1604 for (j = 0; j < g->follow[n].cnt; j++) {
1607 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1613 And finally the item sets. These include the GO TO tables and, for
1614 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1615 it up a bit. First the items, with production number and associativity.
1617 static void report_item(struct grammar *g, int itm)
1619 int p = item_prod(itm);
1620 int dot = item_index(itm);
1621 struct production *pr = g->productions[p];
1625 prtxt(pr->head->name);
1627 for (i = 0; i < pr->body_size; i++) {
1628 printf(" %s", (dot == i ? ". ": ""));
1629 prtxt(pr->body[i]->name);
1631 if (dot == pr->body_size)
1634 if (pr->precedence && dot == pr->body_size)
1635 printf(" (%d%s)", pr->precedence,
1636 assoc_names[pr->assoc]);
1637 if (dot < pr->body_size &&
1638 pr->body[dot]->precedence) {
1639 struct symbol *s = pr->body[dot];
1640 printf(" [%d%s]", s->precedence,
1641 assoc_names[s->assoc]);
1646 The LA sets which are (possibly) reported with each item:
1648 static void report_la(struct grammar *g, int lanum)
1650 struct symset la = set_find(g, lanum);
1654 printf(" LOOK AHEAD(%d)", lanum);
1655 for (i = 0; i < la.cnt; i++) {
1658 prtxt(g->symtab[la.syms[i]]->name);
1663 Then the go to sets:
1666 static void report_goto(struct grammar *g, struct symset gt)
1671 for (i = 0; i < gt.cnt; i++) {
1673 prtxt(g->symtab[gt.syms[i]]->name);
1674 printf(" -> %d\n", gt.data[i]);
1678 Now we can report all the item sets complete with items, LA sets, and GO TO.
1680 static void report_itemsets(struct grammar *g)
1683 printf("ITEM SETS(%d)\n", g->states);
1684 for (s = 0; s < g->states; s++) {
1686 struct itemset *is = g->statetab[s];
1687 printf(" Itemset %d:%s min prefix=%d",
1688 s, is->starts_line?" (startsline)":"", is->min_prefix);
1690 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1692 for (j = 0; j < is->items.cnt; j++) {
1693 report_item(g, is->items.syms[j]);
1694 if (is->items.data != NO_DATA)
1695 report_la(g, is->items.data[j]);
1697 report_goto(g, is->go_to);
1701 ### Reporting conflicts
1703 Conflict detection varies a lot among different analysis levels. However
1704 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1705 LR1 are also similar as they have FOLLOW or LA sets.
1709 ## conflict functions
1711 static int report_conflicts(struct grammar *g, enum grammar_type type)
1714 printf("Conflicts:\n");
1716 cnt = conflicts_lr0(g, type);
1718 cnt = conflicts_slr(g, type);
1720 printf(" - no conflicts\n");
1724 LR0 conflicts are any state which has both a reducible item and
1725 a shiftable item, or two reducible items.
1727 LR05 conflicts only occur if two possible reductions exist,
1728 as shifts always over-ride reductions.
1730 ###### conflict functions
1731 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1735 for (i = 0; i < g->states; i++) {
1736 struct itemset *is = g->statetab[i];
1737 int last_reduce = -1;
1738 int prev_reduce = -1;
1739 int last_shift = -1;
1743 for (j = 0; j < is->items.cnt; j++) {
1744 int itm = is->items.syms[j];
1745 int p = item_prod(itm);
1746 int bp = item_index(itm);
1747 struct production *pr = g->productions[p];
1749 if (bp == pr->body_size) {
1750 prev_reduce = last_reduce;
1754 if (pr->body[bp]->type == Terminal)
1757 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1758 printf(" State %d has both SHIFT and REDUCE:\n", i);
1759 report_item(g, is->items.syms[last_shift]);
1760 report_item(g, is->items.syms[last_reduce]);
1763 if (prev_reduce >= 0) {
1764 printf(" State %d has 2 (or more) reducible items\n", i);
1765 report_item(g, is->items.syms[prev_reduce]);
1766 report_item(g, is->items.syms[last_reduce]);
1773 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1774 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1775 in the source of the look ahead set.
1777 We build two datasets to reflect the "action" table: one which maps
1778 terminals to items where that terminal could be shifted and another
1779 which maps terminals to items that could be reduced when the terminal
1780 is in look-ahead. We report when we get conflicts between the two.
1782 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1787 for (i = 0; i < g->states; i++) {
1788 struct itemset *is = g->statetab[i];
1789 struct symset shifts = INIT_DATASET;
1790 struct symset reduce = INIT_DATASET;
1794 /* First collect the shifts */
1795 for (j = 0; j < is->items.cnt; j++) {
1796 unsigned short itm = is->items.syms[j];
1797 int p = item_prod(itm);
1798 int bp = item_index(itm);
1799 struct production *pr = g->productions[p];
1801 if (bp < pr->body_size &&
1802 pr->body[bp]->type == Terminal) {
1804 int sym = pr->body[bp]->num;
1805 if (symset_find(&shifts, sym) < 0)
1806 symset_add(&shifts, sym, itm);
1809 /* Now look for reduction and conflicts */
1810 for (j = 0; j < is->items.cnt; j++) {
1811 unsigned short itm = is->items.syms[j];
1812 int p = item_prod(itm);
1813 int bp = item_index(itm);
1814 struct production *pr = g->productions[p];
1816 if (bp < pr->body_size)
1821 la = g->follow[pr->head->num];
1823 la = set_find(g, is->items.data[j]);
1825 for (k = 0; k < la.cnt; k++) {
1826 int pos = symset_find(&shifts, la.syms[k]);
1828 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1829 prtxt(g->symtab[la.syms[k]]->name);
1831 report_item(g, shifts.data[pos]);
1832 report_item(g, itm);
1835 pos = symset_find(&reduce, la.syms[k]);
1837 symset_add(&reduce, la.syms[k], itm);
1840 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1841 prtxt(g->symtab[la.syms[k]]->name);
1843 report_item(g, itm);
1844 report_item(g, reduce.data[pos]);
1848 symset_free(shifts);
1849 symset_free(reduce);
1855 ## Generating the parser
1857 The exported part of the parser is the `parse_XX` function, where the name
1858 `XX` is based on the name of the parser files.
1860 This takes a `code_node`, a partially initialized `token_config`, and an
1861 optional `FILE` to send tracing to. The `token_config` gets the list of
1862 known words added and then is used with the `code_node` to initialize the
1865 `parse_XX` then calls the library function `parser_run` to actually complete
1866 the parse. This needs the `states` table and function to call the various
1867 pieces of code provided in the grammar file, so they are generated first.
1869 ###### parser_generate
1871 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1877 gen_reduce(f, g, file);
1880 fprintf(f, "#line 0 \"gen_parser\"\n");
1881 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1884 fprintf(f, "\tstruct token_state *tokens;\n");
1885 fprintf(f, "\tconfig->words_marks = known;\n");
1886 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1887 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1888 fprintf(f, "\ttokens = token_open(code, config);\n");
1889 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1890 fprintf(f, "\ttoken_close(tokens);\n");
1891 fprintf(f, "\treturn rv;\n");
1892 fprintf(f, "}\n\n");
1895 ### Known words table
1897 The known words table is simply an array of terminal symbols.
1898 The table of nonterminals used for tracing is a similar array.
1902 static void gen_known(FILE *f, struct grammar *g)
1905 fprintf(f, "#line 0 \"gen_known\"\n");
1906 fprintf(f, "static const char *known[] = {\n");
1907 for (i = TK_reserved;
1908 i < g->num_syms && g->symtab[i]->type == Terminal;
1910 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1911 g->symtab[i]->name.txt);
1912 fprintf(f, "};\n\n");
1915 static void gen_non_term(FILE *f, struct grammar *g)
1918 fprintf(f, "#line 0 \"gen_non_term\"\n");
1919 fprintf(f, "static const char *non_term[] = {\n");
1920 for (i = TK_reserved;
1923 if (g->symtab[i]->type == Nonterminal)
1924 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1925 g->symtab[i]->name.txt);
1926 fprintf(f, "};\n\n");
1929 ### States and the goto tables.
1931 For each state we record the goto table, the reducible production if
1932 there is one, or a symbol to shift for error recovery.
1933 Some of the details of the reducible production are stored in the
1934 `do_reduce` function to come later. Here we store the production number,
1935 the body size (useful for stack management) and the resulting symbol (useful
1936 for knowing how to free data later).
1938 The go to table is stored in a simple array of `sym` and corresponding
1941 ###### exported types
1949 const struct lookup * go_to;
1960 static void gen_goto(FILE *f, struct grammar *g)
1963 fprintf(f, "#line 0 \"gen_goto\"\n");
1964 for (i = 0; i < g->states; i++) {
1966 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1968 struct symset gt = g->statetab[i]->go_to;
1969 for (j = 0; j < gt.cnt; j++)
1970 fprintf(f, "\t{ %d, %d },\n",
1971 gt.syms[j], gt.data[j]);
1978 static void gen_states(FILE *f, struct grammar *g)
1981 fprintf(f, "#line 0 \"gen_states\"\n");
1982 fprintf(f, "static const struct state states[] = {\n");
1983 for (i = 0; i < g->states; i++) {
1984 struct itemset *is = g->statetab[i];
1985 int j, prod = -1, prod_len;
1987 for (j = 0; j < is->items.cnt; j++) {
1988 int itm = is->items.syms[j];
1989 int p = item_prod(itm);
1990 int bp = item_index(itm);
1991 struct production *pr = g->productions[p];
1993 if (bp < pr->body_size)
1995 /* This is what we reduce */
1996 if (prod < 0 || prod_len < pr->body_size) {
1998 prod_len = pr->body_size;
2003 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n",
2004 i, is->go_to.cnt, i, prod,
2005 g->productions[prod]->body_size,
2006 g->productions[prod]->head->num,
2007 is->starts_line, is->min_prefix);
2009 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
2010 i, is->go_to.cnt, i,
2011 is->starts_line, is->min_prefix);
2013 fprintf(f, "};\n\n");
2016 ### The `do_reduce` function and the code
2018 When the parser engine decides to reduce a production, it calls `do_reduce`.
2019 This has two functions.
2021 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
2022 production being reduced. Secondly it runs the code that was included with
2023 the production if any.
2025 This code needs to be able to store data somewhere. Rather than requiring
2026 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
2027 `do_reduce` return the size to be saved.
2029 In order for the code to access "global" context, we pass in the
2030 "config" pointer that was passed to the parser function. If the `struct
2031 token_config` is embedded in some larger structure, the reducing code
2032 can access the larger structure using pointer manipulation.
2034 The code fragment requires translation when written out. Any `$N` needs to
2035 be converted to a reference either to that buffer (if `$0`) or to the
2036 structure returned by a previous reduction. These pointers need to be cast
2037 to the appropriate type for each access. All this is handled in
2040 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
2041 This applies only to symbols with references (or pointers), not those with structures.
2042 The `<` implies that the reference is being moved out, so the object will not be
2043 automatically freed. This is equivalent to assigning `NULL` to the pointer.
2047 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2050 char *used = calloc(1, p->body_size);
2053 fprintf(f, "\t\t\t");
2054 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2068 if (*c < '0' || *c > '9') {
2075 while (c[1] >= '0' && c[1] <= '9') {
2077 n = n * 10 + *c - '0';
2080 fprintf(f, "(*(struct %.*s*%s)ret)",
2081 p->head->struct_name.len,
2082 p->head->struct_name.txt,
2083 p->head->isref ? "*":"");
2084 else if (n > p->body_size)
2085 fprintf(f, "$%d", n);
2086 else if (p->body[n-1]->type == Terminal)
2087 fprintf(f, "(*(struct token *)body[%d])",
2089 else if (p->body[n-1]->struct_name.txt == NULL)
2090 fprintf(f, "$%d", n);
2092 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2093 p->body[n-1]->struct_name.len,
2094 p->body[n-1]->struct_name.txt,
2095 p->body[n-1]->isref ? "*":"", n-1);
2100 for (i = 0; i < p->body_size; i++) {
2101 if (p->body[i]->struct_name.txt &&
2102 p->body[i]->isref &&
2104 // assume this has been copied out
2105 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2112 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2115 fprintf(f, "#line 0 \"gen_reduce\"\n");
2116 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2118 fprintf(f, "\tint ret_size = 0;\n");
2120 fprintf(f, "\tswitch(prod) {\n");
2121 for (i = 0; i < g->production_count; i++) {
2122 struct production *p = g->productions[i];
2123 fprintf(f, "\tcase %d:\n", i);
2126 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2130 if (p->head->struct_name.txt)
2131 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2132 p->head->struct_name.len,
2133 p->head->struct_name.txt,
2134 p->head->isref ? "*":"");
2136 fprintf(f, "\t\tbreak;\n");
2138 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2143 As each non-terminal can potentially cause a different type of data
2144 structure to be allocated and filled in, we need to be able to free it when
2147 It is particularly important to have fine control over freeing during error
2148 recovery where individual stack frames might need to be freed.
2150 For this, the grammar author is required to define a `free_XX` function for
2151 each structure that is used by a non-terminal. `do_free` will call whichever
2152 is appropriate given a symbol number, and will call `free` (as is
2153 appropriate for tokens) on any terminal symbol.
2157 static void gen_free(FILE *f, struct grammar *g)
2161 fprintf(f, "#line 0 \"gen_free\"\n");
2162 fprintf(f, "static void do_free(short sym, void *asn)\n");
2164 fprintf(f, "\tif (!asn) return;\n");
2165 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2166 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2167 fprintf(f, "\tswitch(sym) {\n");
2169 for (i = 0; i < g->num_syms; i++) {
2170 struct symbol *s = g->symtab[i];
2172 s->type != Nonterminal ||
2173 s->struct_name.txt == NULL)
2176 fprintf(f, "\tcase %d:\n", s->num);
2178 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2180 s->struct_name.txt);
2181 fprintf(f, "\t\tfree(asn);\n");
2183 fprintf(f, "\t\tfree_%.*s(asn);\n",
2185 s->struct_name.txt);
2186 fprintf(f, "\t\tbreak;\n");
2188 fprintf(f, "\t}\n}\n\n");
2191 ## The main routine.
2193 There are three key parts to the "main" function of parsergen: processing
2194 the arguments, loading the grammar file, and dealing with the grammar.
2196 ### Argument processing.
2198 Command line options allow the selection of analysis mode, name of output
2199 file, and whether or not a report should be generated. By default we create
2200 a report only if no code output was requested.
2202 The `parse_XX` function name uses the basename of the output file which
2203 should not have a suffix (`.c`). `.c` is added to the given name for the
2204 code, and `.h` is added for the header (if header text is specified in the
2211 static const struct option long_options[] = {
2212 { "LR0", 0, NULL, '0' },
2213 { "LR05", 0, NULL, '5' },
2214 { "SLR", 0, NULL, 'S' },
2215 { "LALR", 0, NULL, 'L' },
2216 { "LR1", 0, NULL, '1' },
2217 { "tag", 1, NULL, 't' },
2218 { "report", 0, NULL, 'R' },
2219 { "output", 1, NULL, 'o' },
2220 { NULL, 0, NULL, 0 }
2222 const char *options = "05SL1t:Ro:";
2224 ###### process arguments
2226 char *outfile = NULL;
2231 enum grammar_type type = LR05;
2232 while ((opt = getopt_long(argc, argv, options,
2233 long_options, NULL)) != -1) {
2248 outfile = optarg; break;
2250 tag = optarg; break;
2252 fprintf(stderr, "Usage: parsergen ...\n");
2257 infile = argv[optind++];
2259 fprintf(stderr, "No input file given\n");
2262 if (outfile && report == 1)
2265 if (name && strchr(name, '/'))
2266 name = strrchr(name, '/')+1;
2268 if (optind < argc) {
2269 fprintf(stderr, "Excess command line arguments\n");
2273 ### Loading the grammar file
2275 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2278 Once we have extracted the code (with `mdcode`) we expect to find three
2279 sections: header, code, and grammar. Anything else that is not
2280 excluded by the `--tag` option is an error.
2282 "header" and "code" are optional, though it is hard to build a working
2283 parser with neither. "grammar" must be provided.
2287 #include <sys/mman.h>
2292 static void pr_err(char *msg)
2295 fprintf(stderr, "%s\n", msg);
2299 struct section *table;
2303 fd = open(infile, O_RDONLY);
2305 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2306 infile, strerror(errno));
2309 len = lseek(fd, 0, 2);
2310 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2311 table = code_extract(file, file+len, pr_err);
2313 struct code_node *hdr = NULL;
2314 struct code_node *code = NULL;
2315 struct code_node *gram = NULL;
2316 for (s = table; s; s = s->next) {
2317 struct text sec = s->section;
2318 if (tag && !strip_tag(&sec, tag))
2320 if (text_is(sec, "header"))
2322 else if (text_is(sec, "code"))
2324 else if (text_is(sec, "grammar"))
2327 fprintf(stderr, "Unknown content section: %.*s\n",
2328 s->section.len, s->section.txt);
2333 ### Processing the input
2335 As we need to append an extension to a filename and then open it for
2336 writing, and we need to do this twice, it helps to have a separate function.
2340 static FILE *open_ext(char *base, char *ext)
2342 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2344 strcat(strcpy(fn, base), ext);
2350 If we can read the grammar, then we analyse and optionally report on it. If
2351 the report finds conflicts we will exit with an error status.
2353 ###### process input
2354 struct grammar *g = NULL;
2356 fprintf(stderr, "No grammar section provided\n");
2359 g = grammar_read(gram);
2361 fprintf(stderr, "Failure to parse grammar\n");
2366 grammar_analyse(g, type);
2368 if (grammar_report(g, type))
2372 If a "headers" section is defined, we write it out and include a declaration
2373 for the `parse_XX` function so it can be used from separate code.
2375 if (rv == 0 && hdr && outfile) {
2376 FILE *f = open_ext(outfile, ".h");
2378 code_node_print(f, hdr, infile);
2379 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2383 fprintf(stderr, "Cannot create %s.h\n",
2389 And if all goes well and an output file was provided, we create the `.c`
2390 file with the code section (if any) and the parser tables and function.
2392 if (rv == 0 && outfile) {
2393 FILE *f = open_ext(outfile, ".c");
2396 code_node_print(f, code, infile);
2397 gen_parser(f, g, infile, name);
2400 fprintf(stderr, "Cannot create %s.c\n",
2406 And that about wraps it up. We need to set the locale so that UTF-8 is
2407 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2409 ###### File: parsergen.mk
2410 parsergen : parsergen.o libscanner.o libmdcode.o
2411 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2418 int main(int argc, char *argv[])
2423 setlocale(LC_ALL,"");
2425 ## process arguments
2432 ## The SHIFT/REDUCE parser
2434 Having analysed the grammar and generated all the tables, we only need the
2435 shift/reduce engine to bring it all together.
2437 ### Goto table lookup
2439 The parser generator has nicely provided us with goto tables sorted by
2440 symbol number. We need a binary search function to find a symbol in the
2443 ###### parser functions
2445 static int search(const struct state *l, int sym)
2448 int hi = l->go_to_cnt;
2452 while (lo + 1 < hi) {
2453 int mid = (lo + hi) / 2;
2454 if (l->go_to[mid].sym <= sym)
2459 if (l->go_to[lo].sym == sym)
2460 return l->go_to[lo].state;
2465 ### The state stack.
2467 The core data structure for the parser is the stack. This tracks all the
2468 symbols that have been recognised or partially recognised.
2470 The stack usually won't grow very large - maybe a few tens of entries. So
2471 we dynamically resize an array as required but never bother to shrink it
2474 We keep the stack as two separate allocations. One, `asn_stack` stores the
2475 "abstract syntax nodes" which are created by each reduction. When we call
2476 `do_reduce` we need to pass an array of the `asn`s of the body of the
2477 production, and by keeping a separate `asn` stack, we can just pass a
2478 pointer into this stack.
2480 The other allocation stores all other stack fields of which there are six.
2481 The `state` is the most important one and guides the parsing process. The
2482 `sym` is nearly unnecessary. However when we want to free entries from the
2483 `asn_stack`, it helps to know what type they are so we can call the right
2484 freeing function. The symbol leads us to the right free function through
2487 The `indents` count tracks the line indents within the symbol or
2488 immediately following it. These are used to allow indent information to
2489 guide parsing and error recovery.
2491 `since_newline` tracks how many stack frames since the last
2492 start-of-line (whether indented or not). So if `since_newline` is
2493 zero, then this symbol is at the start of a line. Similarly
2494 `since_indent` counts the number of states since an indent, it is zero
2495 precisely when `indents` is not zero.
2497 `newline_permitted` keeps track of whether newlines should be ignored
2500 The stack is most properly seen as alternating states and symbols -
2501 states, like the 'DOT' in items, are between symbols. Each frame in
2502 our stack holds a state and the symbol that was before it. The
2503 bottom of stack holds the start state but no symbol, as nothing came
2504 before the beginning.
2506 ###### parser functions
2511 short newline_permitted;
2515 short since_newline;
2525 Two operations are needed on the stack - shift (which is like push) and pop.
2527 Shift applies not only to terminals but also to non-terminals. When
2528 we reduce a production we will pop off entries corresponding to the
2529 body symbols, then push on an item for the head of the production.
2530 This last is exactly the same process as shifting in a terminal so we
2531 use the same function for both. In both cases we provide the symbol,
2532 the number of indents the symbol contains (which will be zero for a
2533 terminal symbol) and a flag indicating that the symbol was at (or was
2534 reduced from a symbol which was at) the start of a line. The state is
2535 deduced from the current top-of-stack state and the new symbol.
2537 To simplify other code we arrange for `shift` to fail if there is no `goto`
2538 state for the symbol. This is useful in basic parsing due to our design
2539 that we shift when we can, and reduce when we cannot. So the `shift`
2540 function reports if it could.
2542 `shift` is also used to push state zero onto the stack, so if the
2543 stack is empty, it always chooses zero as the next state.
2545 So `shift` finds the next state. If that succeeds it extends the
2546 allocations if needed and pushes all the information onto the stacks.
2548 Newlines are permitted after a `starts_line` state until an internal
2549 indent. If the new frame has neither a `starts_line` state nor an
2550 indent, newlines are permitted if the previous stack frame permitted
2553 ###### parser functions
2555 static int shift(struct parser *p,
2556 short sym, short indents, short start_of_line,
2558 const struct state states[])
2560 // Push an entry onto the stack
2561 struct frame next = {0};
2562 int newstate = p->tos
2563 ? search(&states[p->stack[p->tos-1].state],
2568 if (p->tos >= p->stack_size) {
2569 p->stack_size += 10;
2570 p->stack = realloc(p->stack, p->stack_size
2571 * sizeof(p->stack[0]));
2572 p->asn_stack = realloc(p->asn_stack, p->stack_size
2573 * sizeof(p->asn_stack[0]));
2576 next.indents = indents;
2577 next.state = newstate;
2578 if (states[newstate].starts_line)
2579 next.newline_permitted = 1;
2581 next.newline_permitted = 0;
2583 next.newline_permitted =
2584 p->stack[p->tos-1].newline_permitted;
2586 next.newline_permitted = 0;
2588 if (!start_of_line) {
2590 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2592 next.since_newline = 1;
2595 next.since_indent = 0;
2597 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2599 next.since_indent = 1;
2601 p->stack[p->tos] = next;
2602 p->asn_stack[p->tos] = asn;
2607 `pop` primarily moves the top of stack (`tos`) back down the required
2608 amount and frees any `asn` entries that need to be freed. It also
2609 collects a summary of the indents and line starts in the symbols that
2610 are being removed. It is called _after_ we reduce a production, just
2611 before we `shift` the nonterminal in.
2613 ###### parser functions
/* pop() - drop `num` frames from the top of the stack, freeing each
 * asn via `do_free`, and summarise what was removed: the indents are
 * accumulated (and, per the prose, returned) and *start_of_line is set
 * if any popped frame began a line.  Called after a reduce, just
 * before the nonterminal is shifted back in.
 * NOTE(review): declarations of `i`, `sol`, `indents`, the tos
 * adjustment and the return statement fall in lines missing from this
 * extract (numbering jumps 2617->2624, 2628->2631).
 */
2615 static int pop(struct parser *p, int num,
2616 short *start_of_line,
2617 void(*do_free)(short sym, void *asn))
2624 for (i = 0; i < num; i++) {
/* since_newline == 0 marks a frame that started a line. */
2625 sol |= !p->stack[p->tos+i].since_newline;
2626 indents += p->stack[p->tos+i].indents;
2627 do_free(p->stack[p->tos+i].sym,
2628 p->asn_stack[p->tos+i]);
2631 *start_of_line = sol;
2635 ### Memory allocation
2637 The `scanner` returns tokens in a local variable - we want them in allocated
2638 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2639 a reduce is in a large buffer. Both of these require some allocation and
2640 copying, hence `memdup` and `tokcopy`.
2642 ###### parser includes
2645 ###### parser functions
/* memdup() - allocate a copy of `len` bytes at `m`; used to move a
 * reduce-buffer result into heap memory for the asn_stack.
 * NOTE(review): the malloc and return lines are missing from this
 * extract (numbering jumps 2647->2653). */
2647 void *memdup(void *m, int len)
2653 memcpy(ret, m, len);
/* tok_copy() - heap-allocate a copy of a by-value token from the
 * scanner so it can live on the asn_stack.
 * NOTE(review): the struct assignment and return are missing from this
 * extract. Caller owns the returned allocation. malloc result is
 * presumably unchecked — confirm against the full source. */
2657 static struct token *tok_copy(struct token tk)
2659 struct token *new = malloc(sizeof(*new));
2664 ### The heart of the parser.
2666 Now we have the parser. If we can shift we do, though newlines and
2667 reducing indenting may block that. If not and we can reduce we do
2668 that. If the production we reduced was production zero, then we have
2669 accepted the input and can finish.
2671 We return whatever `asn` was returned by reducing production zero.
2673 If we can neither shift nor reduce we have an error to handle. We pop
2674 single entries off the stack until we can shift the `TK_error` symbol, then
2675 drop input tokens until we find one we can shift into the new error state.
2677 When we find `TK_in` and `TK_out` tokens which report indents we need
2678 to handle them directly as the grammar cannot express what we want to
2681 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2682 record how many indents there are following the previous token.
2684 `TK_out` tokens must be canceled against an indent count
2685 within the stack. If we can reduce some symbols that are all since
2686 the most recent indent, then we do that first. If the minimum prefix
2687 of the current state then extends back before the most recent indent,
2688 that indent can be cancelled. If the minimum prefix is shorter, then
2689 the indent is premature and we must start error handling, which
2690 currently doesn't work at all.
2692 `TK_newline` tokens are ignored unless the top stack frame records
2693 that they are permitted. In that case they will not be considered for
2694 shifting if it is possible to reduce some symbols that are all since
2695 the most recent start of line. This is how a newline forcibly
2696 terminates any line-like structure - we try to reduce down to at most
2697 one symbol for each line where newlines are allowed.
2699 ###### parser includes
/* parser_run() - the heart of the parser: the shift/reduce loop.
 * Shifts when possible, otherwise reduces; accepting production zero
 * ends the parse and its asn (from do_reduce) is returned.  TK_in,
 * TK_out and TK_newline are handled specially as described in the
 * prose above; anything else that can neither shift nor reduce enters
 * TK_error recovery.
 * NOTE(review): many original lines are missing from this extract
 * (e.g. loop header, `break`s, closing braces — numbering jumps such
 * as 2710->2714, 2858 onward), so control flow is only partially
 * visible here.
 */
2702 void *parser_run(struct token_state *tokens,
2703 const struct state states[],
2704 int (*do_reduce)(int, void**, struct token_config*, void*),
2705 void (*do_free)(short, void*),
2706 FILE *trace, const char *non_term[],
2707 struct token_config *config)
2709 struct parser p = { 0 };
2710 struct token *tk = NULL;
/* Prime the stack with state zero; TK_eof here is a placeholder sym. */
2714 shift(&p, TK_eof, 0, 1, NULL, states);
2716 struct token *err_tk;
2717 struct frame *tos = &p.stack[p.tos-1];
2719 tk = tok_copy(token_next(tokens));
2720 parser_trace(trace, &p,
2721 tk, states, non_term, config->known_count);
/* --- TK_in: record the indent on the top frame directly; the
 * grammar cannot express indents itself. --- */
2723 if (tk->num == TK_in) {
2725 tos->since_newline = 0;
2726 tos->since_indent = 0;
2727 if (!states[tos->state].starts_line)
2728 tos->newline_permitted = 0;
2731 parser_trace_action(trace, "Record");
/* --- TK_out: cancel against a recorded indent in the stack,
 * reducing first if everything since the indent can be reduced. --- */
2734 if (tk->num == TK_out) {
2735 if (states[tos->state].reduce_size >= 0 &&
2736 states[tos->state].reduce_size <= tos->since_indent)
2738 if (states[tos->state].min_prefix >= tos->since_indent) {
/* `in` is the frame holding the indent being cancelled. */
2740 struct frame *in = tos - tos->since_indent;
2742 if (in->indents == 0) {
2743 /* Reassess since_indent and newline_permitted */
2745 in->since_indent = in[-1].since_indent + 1;
2746 in->newline_permitted = in[-1].newline_permitted;
2748 in->since_indent = 0;
2749 in->newline_permitted = 0;
2751 if (states[in->state].starts_line)
2752 in->newline_permitted = 1;
/* Propagate the recomputed counters up the rest of the stack.
 * NOTE(review): the loop wrapping these updates is in missing lines. */
2755 in->since_indent = in[-1].since_indent + 1;
2756 if (states[in->state].starts_line)
2757 in->newline_permitted = 1;
2759 in->newline_permitted = in[-1].newline_permitted;
2764 parser_trace_action(trace, "Cancel");
2767 // fall through and force a REDUCE (as 'shift'
/* --- TK_newline: ignore unless permitted, and prefer reducing the
 * current line-like structure down before shifting it. --- */
2770 if (tk->num == TK_newline) {
2771 if (!tos->newline_permitted) {
2774 parser_trace_action(trace, "Discard");
2777 if (tos->since_newline > 1 &&
2778 states[tos->state].reduce_size >= 0 &&
2779 states[tos->state].reduce_size <= tos->since_newline)
/* --- SHIFT: we shift when we can... --- */
2782 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2784 parser_trace_action(trace, "Shift");
/* --- REDUCE: ...and reduce when we cannot. --- */
2788 if (states[tos->state].reduce_prod >= 0) {
2791 const struct state *nextstate = &states[tos->state];
2792 int prod = nextstate->reduce_prod;
2793 int size = nextstate->reduce_size;
/* Reduction result is built in a shared static buffer, then copied
 * to the heap by memdup below.  NOTE(review): static buf makes this
 * function non-reentrant — confirm intent against full source. */
2795 static char buf[16*1024];
2796 short indents, start_of_line;
2798 body = p.asn_stack + (p.tos - size);
2800 bufsize = do_reduce(prod, body, config, buf);
2802 indents = pop(&p, size, &start_of_line,
2804 res = memdup(buf, bufsize);
2805 memset(buf, 0, bufsize);
/* Shifting the nonterminal back in can only fail for production
 * zero (accept) — anything else is an internal error. */
2806 if (!shift(&p, nextstate->reduce_sym,
2807 indents, start_of_line,
2809 if (prod != 0) abort();
2813 parser_trace_action(trace, "Reduce");
2816 /* Error. We walk up the stack until we
2817 * find a state which will accept TK_error.
2818 * We then shift in TK_error and see what state
2819 * that takes us to.
2820 * Then we discard input tokens until
2821 * we find one that is acceptable.
2823 parser_trace_action(trace, "ERROR");
2824 short indents = 0, start_of_line;
2826 err_tk = tok_copy(*tk);
2827 while (shift(&p, TK_error, 0, 0,
2828 err_tk, states) == 0
2830 // discard this state
2831 indents += pop(&p, 1, &start_of_line, do_free);
2834 // no state accepted TK_error
/* Skip input until a token the error state can shift (or EOF). */
2837 tos = &p.stack[p.tos-1];
2838 while (search(&states[tos->state], tk->num) < 0 &&
2839 tk->num != TK_eof) {
2841 tk = tok_copy(token_next(tokens));
2842 if (tk->num == TK_in)
2844 if (tk->num == TK_out) {
2848 // FIXME update since_indent here
2851 if (p.tos == 0 && tk->num == TK_eof)
2853 tos = &p.stack[p.tos-1];
2854 tos->indents += indents;
/* Final cleanup: free whatever is left on the stack. */
2858 pop(&p, p.tos, NULL, do_free);
2864 ###### exported functions
/* Public prototype for parser_run(), exported for linking the
 * generated tables against libparser (see definition above). */
2865 void *parser_run(struct token_state *tokens,
2866 const struct state states[],
2867 int (*do_reduce)(int, void**, struct token_config*, void*),
2868 void (*do_free)(short, void*),
2869 FILE *trace, const char *non_term[],
2870 struct token_config *config);
2874 Being able to visualize the parser in action can be invaluable when
2875 debugging the parser code, or trying to understand how the parse of a
2876 particular grammar progresses. The stack contains all the important
2877 state, so just printing out the stack every time around the parse loop
2878 can make it possible to see exactly what is happening.
2880 This doesn't explicitly show each SHIFT and REDUCE action. However they
2881 are easily deduced from the change between consecutive lines, and the
2882 details of each state can be found by cross referencing the states list
2883 in the stack with the "report" that parsergen can generate.
2885 For terminal symbols, we just dump the token. For non-terminals we
2886 print the name of the symbol. The look ahead token is reported at the
2887 end inside square brackets.
2889 ###### parser functions
/* Printable names for the scanner's reserved tokens, indexed by token
 * number; used by the trace output below.  NOTE(review): entries for
 * other reserved tokens (e.g. TK_in/TK_out/TK_eof) fall in lines
 * missing from this extract. */
2891 static char *reserved_words[] = {
2892 [TK_error] = "ERROR",
2895 [TK_newline] = "NEWLINE",
/* parser_trace_state() - print one stack frame's state as
 * "(Ns nM) ": the state number, 's' if the state starts a line, and
 * "nM" (M = since_newline) if newlines are currently permitted. */
2898 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2900 fprintf(trace, "(%d", f->state);
2901 if (states[f->state].starts_line)
2902 fprintf(trace, "s");
2903 if (f->newline_permitted)
2904 fprintf(trace, "n%d", f->since_newline);
2905 fprintf(trace, ") ");
/* parser_trace() - dump the whole parse stack, then the look-ahead
 * token in square brackets.  Terminals are dumped as token text,
 * known reserved tokens by name, nonterminals by grammar name.
 * NOTE(review): several lines (e.g. the `sym` assignment, the early
 * return when trace is NULL) are missing from this extract. */
2908 void parser_trace(FILE *trace, struct parser *p,
2909 struct token *tk, const struct state states[],
2910 const char *non_term[], int knowns)
2915 for (i = 0; i < p->tos; i++) {
2916 struct frame *f = &p->stack[i];
/* Reserved tokens with names print by name... */
2919 if (sym < TK_reserved &&
2920 reserved_words[sym] != NULL)
2921 fputs(reserved_words[sym], trace);
/* ...known terminals print (up to 20 chars of) their text... */
2922 else if (sym < TK_reserved + knowns) {
2923 struct token *t = p->asn_stack[i];
2924 text_dump(trace, t->txt, 20);
/* ...and nonterminals print their grammar symbol name. */
2926 fputs(non_term[sym - TK_reserved - knowns],
/* Append per-frame indent count and newline bookkeeping. */
2929 fprintf(trace, ".%d", f->indents);
2930 if (f->since_newline == 0)
2934 parser_trace_state(trace, f, states);
/* The look-ahead token is reported at the end inside brackets. */
2936 fprintf(trace, "[");
2937 if (tk->num < TK_reserved &&
2938 reserved_words[tk->num] != NULL)
2939 fputs(reserved_words[tk->num], trace);
2941 text_dump(trace, tk->txt, 20);
/* parser_trace_action() - finish a trace line by naming the action
 * taken (e.g. "Shift", "Reduce", "Discard", "ERROR").
 * NOTE(review): the NULL-trace guard is in lines missing here. */
2945 void parser_trace_action(FILE *trace, char *action)
2948 fprintf(trace, " - %s\n", action);
2953 The obvious example for a parser is a calculator.
2955 As `scanner` provides a number parsing function using `libgmp`, it is not much
2956 work to perform arbitrary rational number calculations.
2958 This calculator takes one expression, or an equality test, per line. The
2959 results are printed and if any equality test fails, the program exits with
2962 ###### File: parsergen.mk
2963 calc.c calc.h : parsergen parsergen.mdc
2964 ./parsergen --tag calc -o calc parsergen.mdc
2965 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2966 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2971 // what do we use for a demo-grammar? A calculator of course.
2983 #include <sys/mman.h>
2988 #include "scanner.h"
/* Free an mpq number asn produced by the calc grammar's reductions.
 * NOTE(review): the body (mpq_clear + free) is in missing lines. */
2994 static void free_number(struct number *n)
/* Demo driver: mmap the mdc file named in argv[1], extract its
 * literate-code sections, and run the calc parser on the `calc`
 * grammar section.  Passing any second argument enables tracing. */
3000 int main(int argc, char *argv[])
/* NOTE(review): demo-quality code — open/lseek/mmap results and
 * argv[1] are not checked; `2` is SEEK_END. */
3002 int fd = open(argv[1], O_RDONLY);
3003 int len = lseek(fd, 0, 2);
3004 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3005 struct section *s = code_extract(file, file+len, NULL);
/* Scanner config: skip comments; allow ., , _ + - inside numbers. */
3006 struct token_config config = {
3007 .ignored = (1 << TK_line_comment)
3008 | (1 << TK_block_comment)
3011 .number_chars = ".,_+-",
3015 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
/* Walk and release the extracted section list. */
3017 struct section *t = s->next;
3030 Session -> Session Line
3033 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3034 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3035 gmp_printf(" or as a decimal: %Fg\n", fl);
3039 | Expression = Expression NEWLINE ${
3040 if (mpq_equal($1.val, $3.val))
3041 gmp_printf("Both equal %Qd\n", $1.val);
3043 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3048 | NEWLINE ${ printf("Blank line\n"); }$
3049 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3052 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3053 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3054 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3055 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3056 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3057 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$