1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h` files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
66 and `Foo: grammar`. The tag `calc` is used to extract the same calculator
70 [scanner]: scanner.html
76 ###### parser includes
80 The grammar contains production sets, precedence/associativity
81 declarations, and data type declarations. These are all parsed with
82 _ad hoc_ parsing as we don't have a parser generator yet.
84 The precedence and associativity can be set for each production, but
85 can be inherited from symbols. The data type (either structure or a
86 reference to a structure) is potentially defined for each non-terminal
87 and describes what C structure is used to store information about each
91 enum assoc {Left, Right, Non};
92 char *assoc_names[] = {"Left","Right","Non"};
95 struct text struct_name;
98 unsigned short precedence;
102 unsigned short precedence;
110 The strings reported by `mdcode` and `scanner` are `struct text` which have
111 length rather than being null terminated. To help with printing and
112 comparing we define `text_is` and `prtxt`, which should possibly go in
113 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
114 which might contain control characters.
116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
117 because that is what we need to detect tags.
120 static int text_is(struct text t, char *s)
122 return (strlen(s) == t.len &&
123 strncmp(s, t.txt, t.len) == 0);
125 static void prtxt(struct text t)
127 printf("%.*s", t.len, t.txt);
130 static int strip_tag(struct text *t, char *tag)
132 int skip = strlen(tag) + 1;
133 if (skip >= t->len ||
134 strncmp(t->txt, tag, skip-1) != 0 ||
135 t->txt[skip-1] != ':')
137 while (skip < t->len && t->txt[skip] == ' ')
146 Productions are comprised primarily of symbols - terminal and
147 non-terminal. We do not make a syntactic distinction between the two,
148 though non-terminals must be identifiers. Non-terminal symbols are
149 those which appear in the head of a production, terminal symbols are
150 those which don't. There are also "virtual" symbols used for precedence
151 marking discussed later, and sometimes we won't know what type a symbol
154 ###### forward declarations
155 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
156 char *symtypes = "UVTN";
160 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
161 table of known symbols and the resulting parser will report them as
162 `TK_reserved + N`. A small set of identifiers are reserved for the
163 different token types that `scanner` can report.
166 static char *reserved_words[] = {
167 [TK_error] = "ERROR",
168 [TK_number] = "NUMBER",
169 [TK_ident] = "IDENTIFIER",
171 [TK_string] = "STRING",
172 [TK_multi_string] = "MULTI_STRING",
175 [TK_newline] = "NEWLINE",
181 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
182 recognised. The former is automatically expected at the end of the text
183 being parsed. The latter are always ignored by the parser.
185 All of these symbols are stored in a simple symbol table. We use the
186 `struct text` from `mdcode` to store the name and link them together
187 into a sorted list using an insertion sort.
189 We don't have separate `find` and `insert` functions as any symbol we
190 find needs to be remembered. We simply expect `find` to always return a
191 symbol, but its type might be `Unknown`.
200 ###### grammar fields
205 static struct symbol *sym_find(struct grammar *g, struct text s)
207 struct symbol **l = &g->syms;
212 (cmp = text_cmp((*l)->name, s)) < 0)
216 n = calloc(1, sizeof(*n));
225 static void symbols_init(struct grammar *g)
227 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
229 for (i = 0; i < entries; i++) {
232 t.txt = reserved_words[i];
235 t.len = strlen(t.txt);
242 ### Data types and precedence.
244 Data type specification and precedence specification are both
245 introduced by a dollar sign at the start of the line. If the next
246 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
247 precedence, otherwise it specifies a data type.
249 The data type name is simply stored and applied to the head of all
250 subsequent productions. It must be the name of a structure optionally
251 preceded by an asterisk which means a reference or "pointer". So
252 `$expression` maps to `struct expression` and `$*statement` maps to
253 `struct statement *`.
255 Any productions given before the first data type declaration will have
256 no data type associated with them and can carry no information. In
257 order to allow other non-terminals to have no type, the data type
258 `$void` can be given. This does *not* mean that `struct void` will be
259 used, but rather that no type will be associated with future
262 The precedence line must contain a list of symbols - typically
263 terminal symbols, but not necessarily. It can only contain symbols
264 that have not been seen yet, so precedence declaration must precede
265 the productions that they affect.
267 A precedence line may also contain a "virtual" symbol which is an
268 identifier preceded by `$$`. Virtual terminals carry precedence
269 information but are not included in the grammar. A production can
270 declare that it inherits the precedence of a given virtual symbol.
272 This use for `$$` precludes it from being used as a symbol in the
273 described language. Two other symbols: `${` and `}$` are also
276 Each new precedence line introduces a new precedence level and
277 declares how it associates. This level is stored in each symbol
278 listed and may be inherited by any production which uses the symbol. A
279 production inherits from the last symbol which has a precedence.
281 ###### grammar fields
282 struct text current_type;
287 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
288 static const char *known[] = { "$$", "${", "}$" };
291 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
293 struct token t = token_next(ts);
298 if (t.num != TK_ident) {
299 err = "type or assoc expected after '$'";
302 if (text_is(t.txt, "LEFT"))
304 else if (text_is(t.txt, "RIGHT"))
306 else if (text_is(t.txt, "NON"))
309 g->current_type = t.txt;
310 g->type_isref = isref;
311 if (text_is(t.txt, "void"))
312 g->current_type.txt = NULL;
314 if (t.num != TK_newline) {
315 err = "Extra tokens after type name";
322 err = "$* cannot be followed by a precedence";
326 // This is a precedence line, need some symbols.
330 while (t.num != TK_newline) {
331 enum symtype type = Terminal;
333 if (t.num == TK_virtual) {
336 if (t.num != TK_ident) {
337 err = "$$ must be followed by a word";
340 } else if (t.num != TK_ident &&
342 err = "Illegal token in precedence line";
345 s = sym_find(g, t.txt);
346 if (s->type != Unknown) {
347 err = "Symbols in precedence line must not already be known.";
351 s->precedence = g->prec_levels;
357 err = "No symbols given on precedence line";
361 while (t.num != TK_newline && t.num != TK_eof)
368 A production either starts with an identifier which is the head
369 non-terminal, or a vertical bar (`|`) in which case this production
370 uses the same head as the previous one. The identifier must be
371 followed by a `->` mark. All productions for a given non-terminal must
372 be grouped together with the `nonterminal ->` given only once.
374 After this a (possibly empty) sequence of identifiers and marks form
375 the body of the production. A virtual symbol may be given after the
376 body by preceding it with `$$`. If a virtual symbol is given, the
377 precedence of the production is that for the virtual symbol. If none
378 is given, the precedence is inherited from the last symbol in the
379 production which has a precedence specified.
381 After the optional precedence may come the `${` mark. This indicates
382 the start of a code fragment. If present, this must be on the same
383 line as the start of the production.
385 All of the text from the `${` through to the matching `}$` is
386 collected and forms the code-fragment for the production. It must all
387 be in one `code_node` of the literate code. The `}$` must be
388 at the end of a line.
390 Text in the code fragment will undergo substitutions where `$N` or
391 `$<N`, for some numeric `N`, will be replaced with a variable holding
392 the parse information for the particular symbol in the production.
393 `$0` is the head of the production, `$1` is the first symbol of the
394 body, etc. The type of `$N` for a terminal symbol is `struct token`.
395 For a non-terminal, it is whatever has been declared for that symbol.
396 The `<` may be included for symbols declared as storing a reference
397 (not a structure) and means that the reference is being moved out, so
398 it will not automatically be freed.
400 While building productions we will need to add to an array which needs to
404 static void array_add(void *varray, int *cnt, void *new)
406 void ***array = varray;
409 current = ((*cnt-1) | (step-1))+1;
410 if (*cnt == current) {
413 *array = realloc(*array, current * sizeof(void*));
415 (*array)[*cnt] = new;
419 Collecting the code fragment simply involves reading tokens until we
420 hit the end token `}$`, and noting the character position of the start and
424 static struct text collect_code(struct token_state *state,
429 code.txt = start.txt.txt + start.txt.len;
431 t = token_next(state);
432 while (t.node == start.node &&
433 t.num != TK_close && t.num != TK_error &&
435 if (t.num == TK_close && t.node == start.node)
436 code.len = t.txt.txt - code.txt;
442 Now we have all the bits we need to parse a full production.
447 ###### grammar fields
448 struct production **productions;
449 int production_count;
451 ###### production fields
453 struct symbol **body;
459 int first_production;
462 static char *parse_production(struct grammar *g,
464 struct token_state *state)
466 /* Head has already been parsed. */
469 struct production p, *pp;
471 memset(&p, 0, sizeof(p));
473 tk = token_next(state);
474 while (tk.num == TK_ident || tk.num == TK_mark) {
475 struct symbol *bs = sym_find(g, tk.txt);
476 if (bs->type == Unknown)
478 if (bs->type == Virtual) {
479 err = "Virtual symbol not permitted in production";
482 if (bs->precedence) {
483 p.precedence = bs->precedence;
486 array_add(&p.body, &p.body_size, bs);
487 tk = token_next(state);
489 if (tk.num == TK_virtual) {
491 tk = token_next(state);
492 if (tk.num != TK_ident) {
493 err = "word required after $$";
496 vs = sym_find(g, tk.txt);
497 if (vs->type != Virtual) {
498 err = "symbol after $$ must be virtual";
501 p.precedence = vs->precedence;
503 tk = token_next(state);
505 if (tk.num == TK_open) {
506 p.code_line = tk.line;
507 p.code = collect_code(state, tk);
508 if (p.code.txt == NULL) {
509 err = "code fragment not closed properly";
512 tk = token_next(state);
514 if (tk.num != TK_newline && tk.num != TK_eof) {
515 err = "stray tokens at end of line";
518 pp = malloc(sizeof(*pp));
520 array_add(&g->productions, &g->production_count, pp);
523 while (tk.num != TK_newline && tk.num != TK_eof)
524 tk = token_next(state);
528 With the ability to parse production and dollar-lines, we have nearly all
529 that we need to parse a grammar from a `code_node`.
531 The head of the first production will effectively be the `start` symbol of
532 the grammar. However it won't _actually_ be so. Processing the grammar is
533 greatly simplified if the real start symbol only has a single production,
534 and expects `$eof` as the final terminal. So when we find the first
535 explicit production we insert an extra production as production zero which
538 ###### Example: production 0
541 where `START` is the first non-terminal given.
543 ###### create production zero
544 struct production *p = calloc(1,sizeof(*p));
545 struct text start = {"$start",6};
546 struct text eof = {"$eof",4};
547 struct text code = {"$0 = $<1;", 9};
548 p->head = sym_find(g, start);
549 p->head->type = Nonterminal;
550 p->head->struct_name = g->current_type;
551 p->head->isref = g->type_isref;
552 if (g->current_type.txt)
554 array_add(&p->body, &p->body_size, head);
555 array_add(&p->body, &p->body_size, sym_find(g, eof));
556 p->head->first_production = g->production_count;
557 array_add(&g->productions, &g->production_count, p);
559 Now we are ready to read in the grammar. We ignore comments
560 and strings so that the marks which introduce them can be
561 used as terminals in the grammar. We don't ignore numbers
562 even though we don't allow them as that causes the scanner
563 to produce errors that the parser is better positioned to handle.
566 static struct grammar *grammar_read(struct code_node *code)
568 struct token_config conf = {
571 .words_marks = known,
572 .known_count = sizeof(known)/sizeof(known[0]),
574 .ignored = (1 << TK_line_comment)
575 | (1 << TK_block_comment)
578 | (1 << TK_multi_string)
583 struct token_state *state = token_open(code, &conf);
585 struct symbol *head = NULL;
589 g = calloc(1, sizeof(*g));
592 for (tk = token_next(state); tk.num != TK_eof;
593 tk = token_next(state)) {
594 if (tk.num == TK_newline)
596 if (tk.num == TK_ident) {
598 head = sym_find(g, tk.txt);
599 if (head->type == Nonterminal)
600 err = "This non-terminal has already be used.";
601 else if (head->type == Virtual)
602 err = "Virtual symbol not permitted in head of production";
604 head->type = Nonterminal;
605 head->struct_name = g->current_type;
606 head->isref = g->type_isref;
607 if (g->production_count == 0) {
608 ## create production zero
610 head->first_production = g->production_count;
611 tk = token_next(state);
612 if (tk.num == TK_mark &&
613 text_is(tk.txt, "->"))
614 err = parse_production(g, head, state);
616 err = "'->' missing in production";
618 } else if (tk.num == TK_mark
619 && text_is(tk.txt, "|")) {
620 // another production for same non-term
622 err = parse_production(g, head, state);
624 err = "First production must have a head";
625 } else if (tk.num == TK_mark
626 && text_is(tk.txt, "$")) {
627 err = dollar_line(state, g, 0);
628 } else if (tk.num == TK_mark
629 && text_is(tk.txt, "$*")) {
630 err = dollar_line(state, g, 1);
632 err = "Unrecognised token at start of line.";
640 fprintf(stderr, "Error at line %d: %s\n",
647 ## Analysing the grammar
649 The central task in analysing the grammar is to determine a set of
650 states to drive the parsing process. This will require finding
651 various sets of symbols and of "items". Some of these sets will need
652 to attach information to each element in the set, so they are more
655 Each "item" may have a 'look-ahead' or `LA` set associated with
656 it. Many of these will be the same as each other. To avoid memory
657 wastage, and to simplify some comparisons of sets, these sets will be
658 stored in a list of unique sets, each assigned a number.
660 Once we have the data structures in hand to manage these sets and
661 lists, we can start setting the 'nullable' flag, build the 'FIRST'
662 sets, and then create the item sets which define the various states.
666 Though we don't only store symbols in these sets, they are the main
667 things we store, so they are called symbol sets or "symsets".
669 A symset has a size, an array of shorts, and an optional array of data
670 values, which are also shorts. If the array of data is not present,
671 we store an impossible pointer, as `NULL` is used to indicate that no
672 memory has been allocated yet.
677 unsigned short *syms, *data;
679 #define NO_DATA ((unsigned short *)1)
680 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
681 const struct symset INIT_DATASET = { 0, NULL, NULL };
683 The arrays of shorts are allocated in blocks of 8 and are kept sorted
684 by using an insertion sort. We don't explicitly record the amount of
685 allocated space as it can be derived directly from the current `cnt` using
686 `((cnt - 1) | 7) + 1`.
689 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
692 int current = ((s->cnt-1) | 7) + 1;
693 if (current == s->cnt) {
695 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
696 if (s->data != NO_DATA)
697 s->data = realloc(s->data, sizeof(*s->data) * current);
700 while (i > 0 && s->syms[i-1] > key) {
701 s->syms[i] = s->syms[i-1];
702 if (s->data != NO_DATA)
703 s->data[i] = s->data[i-1];
707 if (s->data != NO_DATA)
712 Finding a symbol (or item) in a `symset` uses a simple binary search.
713 We return the index where the value was found (so data can be accessed),
714 or `-1` to indicate failure.
716 static int symset_find(struct symset *ss, unsigned short key)
723 while (lo + 1 < hi) {
724 int mid = (lo + hi) / 2;
725 if (ss->syms[mid] <= key)
730 if (ss->syms[lo] == key)
735 We will often want to form the union of two symsets. When we do, we
736 will often want to know if anything changed (as that might mean there
737 is more work to do). So `symset_union` reports whether anything was
738 added to the first set. We use a slow quadratic approach as these
739 sets don't really get very big. If profiling shows this to be a problem it
740 can be optimised later.
742 static int symset_union(struct symset *a, struct symset *b)
746 for (i = 0; i < b->cnt; i++)
747 if (symset_find(a, b->syms[i]) < 0) {
748 unsigned short data = 0;
749 if (b->data != NO_DATA)
751 symset_add(a, b->syms[i], data);
757 And of course we must be able to free a symset.
759 static void symset_free(struct symset ss)
762 if (ss.data != NO_DATA)
768 Some symsets are simply stored somewhere appropriate in the `grammar`
769 data structure, others needs a bit of indirection. This applies
770 particularly to "LA" sets. These will be paired with "items" in an
771 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
772 stored in the `data` field, so we need a mapping from a small (short)
773 number to an LA `symset`.
775 As mentioned earlier this involves creating a list of unique symsets.
777 For now, we just use a linear list sorted by insertion. A skiplist
778 would be more efficient and may be added later.
785 struct setlist *next;
788 ###### grammar fields
789 struct setlist *sets;
794 static int ss_cmp(struct symset a, struct symset b)
797 int diff = a.cnt - b.cnt;
800 for (i = 0; i < a.cnt; i++) {
801 diff = (int)a.syms[i] - (int)b.syms[i];
808 static int save_set(struct grammar *g, struct symset ss)
810 struct setlist **sl = &g->sets;
814 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
821 s = malloc(sizeof(*s));
830 Finding a set by number is currently performed by a simple linear search.
831 If this turns out to hurt performance, we can store the sets in a dynamic
832 array like the productions.
834 static struct symset set_find(struct grammar *g, int num)
836 struct setlist *sl = g->sets;
837 while (sl && sl->num != num)
843 ### Setting `nullable`
845 We set `nullable` on the head symbol for any production for which all
846 the body symbols (if any) are nullable. As this is a recursive
847 definition, any change in the `nullable` setting means that we need to
848 re-evaluate where it needs to be set.
850 We simply loop around performing the same calculations until no more
857 static void set_nullable(struct grammar *g)
860 while (check_again) {
863 for (p = 0; p < g->production_count; p++) {
864 struct production *pr = g->productions[p];
867 if (pr->head->nullable)
869 for (s = 0; s < pr->body_size; s++)
870 if (! pr->body[s]->nullable)
872 if (s == pr->body_size) {
873 pr->head->nullable = 1;
880 ### Setting `can_eol` and `line_like`
882 In order to be able to ignore newline tokens when not relevant, but
883 still include them in the parse when needed, we will need to know
884 which states can start a "line-like" section of code. We ignore
885 newlines when there is an indent since the most recent start of a
888 To know which symbols are line-like, we first need to know which
889 symbols start with a NEWLINE token. Any symbol which is followed by a
890 NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
891 Certainly when trying to parse one of these we must take note of NEWLINEs.
893 Clearly the `TK_newline` token can start with a NEWLINE. Any symbol
894 which is the head of a production that contains a starts-with-NEWLINE
895 symbol preceded only by nullable symbols is also a
896 starts-with-NEWLINE symbol. We use a new field `can_eol` to record
897 this attribute of symbols, and compute it in a repetitive manner
898 similar to `set_nullable`.
900 Once we have that, we can determine which symbols are `line_like` by
901 seeing which are followed by a `can_eol` symbol in any production.
908 static void set_can_eol(struct grammar *g)
911 g->symtab[TK_newline]->can_eol = 1;
912 while (check_again) {
915 for (p = 0; p < g->production_count; p++) {
916 struct production *pr = g->productions[p];
919 if (pr->head->can_eol)
922 for (s = 0 ; s < pr->body_size; s++) {
923 if (pr->body[s]->can_eol) {
924 pr->head->can_eol = 1;
928 if (!pr->body[s]->nullable)
935 static void set_line_like(struct grammar *g)
938 for (p = 0; p < g->production_count; p++) {
939 struct production *pr = g->productions[p];
942 for (s = 1; s < pr->body_size; s++)
943 if (pr->body[s]->can_eol)
944 pr->body[s-1]->line_like = 1;
948 ### Building the `first` sets
950 When calculating what can follow a particular non-terminal, we will need to
951 know what the "first" terminal in any subsequent non-terminal might be. So
952 we calculate the `first` set for every non-terminal and store them in an
953 array. We don't bother recording the "first" set for terminals as they are
956 As the "first" for one symbol might depend on the "first" of another,
957 we repeat the calculations until no changes happen, just like with
958 `set_nullable`. This makes use of the fact that `symset_union`
959 reports if any change happens.
961 The core of this, which finds the "first" of part of a production body,
962 will be reused for computing the "follow" sets, so we split it out
963 into a separate function.
965 ###### grammar fields
966 struct symset *first;
970 static int add_first(struct production *pr, int start,
971 struct symset *target, struct grammar *g,
976 for (s = start; s < pr->body_size; s++) {
977 struct symbol *bs = pr->body[s];
978 if (bs->type == Terminal) {
979 if (symset_find(target, bs->num) < 0) {
980 symset_add(target, bs->num, 0);
984 } else if (symset_union(target, &g->first[bs->num]))
990 *to_end = (s == pr->body_size);
994 static void build_first(struct grammar *g)
998 g->first = calloc(g->num_syms, sizeof(g->first[0]));
999 for (s = 0; s < g->num_syms; s++)
1000 g->first[s] = INIT_SYMSET;
1002 while (check_again) {
1005 for (p = 0; p < g->production_count; p++) {
1006 struct production *pr = g->productions[p];
1007 struct symset *head = &g->first[pr->head->num];
1009 if (add_first(pr, 0, head, g, NULL))
1015 ### Building the `follow` sets.
1017 There are two different situations when we will want to generate "follow"
1018 sets. If we are doing an SLR analysis, we want to generate a single
1019 "follow" set for each non-terminal in the grammar. That is what is
1020 happening here. If we are doing an LALR or LR analysis we will want
1021 to generate a separate "LA" set for each item. We do that later
1022 in state generation.
1024 There are two parts to generating a "follow" set. Firstly we look at
1025 every place that any non-terminal appears in the body of any
1026 production, and we find the set of possible "first" symbols after
1027 there. This is done using `add_first` above and only needs to be done
1028 once as the "first" sets are now stable and will not change.
1032 for (p = 0; p < g->production_count; p++) {
1033 struct production *pr = g->productions[p];
1036 for (b = 0; b < pr->body_size - 1; b++) {
1037 struct symbol *bs = pr->body[b];
1038 if (bs->type == Terminal)
1040 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1044 The second part is to add the "follow" set of the head of a production
1045 to the "follow" sets of the final symbol in the production, and any
1046 other symbol which is followed only by `nullable` symbols. As this
1047 depends on "follow" itself we need to repeatedly perform the process
1048 until no further changes happen.
1052 for (again = 0, p = 0;
1053 p < g->production_count;
1054 p < g->production_count-1
1055 ? p++ : again ? (again = 0, p = 0)
1057 struct production *pr = g->productions[p];
1060 for (b = pr->body_size - 1; b >= 0; b--) {
1061 struct symbol *bs = pr->body[b];
1062 if (bs->type == Terminal)
1064 if (symset_union(&g->follow[bs->num],
1065 &g->follow[pr->head->num]))
1072 We now just need to create and initialise the `follow` list to get a
1075 ###### grammar fields
1076 struct symset *follow;
1079 static void build_follow(struct grammar *g)
1082 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1083 for (s = 0; s < g->num_syms; s++)
1084 g->follow[s] = INIT_SYMSET;
1088 ### Building itemsets and states
1090 There are three different levels of detail that can be chosen for
1091 building the itemsets and states for the LR grammar. They are:
1093 1. LR(0) or SLR(1), where no look-ahead is considered.
1094 2. LALR(1) where we build look-ahead sets with each item and merge
1095 the LA sets when we find two paths to the same "kernel" set of items.
1096 3. LR(1) where different look-ahead for any item in the set means
1097 a different state must be created.
1099 ###### forward declarations
1100 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1102 We need to be able to look through existing states to see if a newly
1103 generated state already exists. For now we use a simple sorted linked
1106 An item is a pair of numbers: the production number and the position of
1107 "DOT", which is an index into the body of the production.
1108 As the numbers are not enormous we can combine them into a single "short"
1109 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1110 production number (so 4000 productions with maximum of 15 symbols in the
1113 Comparing the itemsets will be a little different to comparing symsets
1114 as we want to do the lookup after generating the "kernel" of an
1115 itemset, so we need to ignore the offset=zero items which are added during
1118 To facilitate this, we modify the "DOT" number so that "0" sorts to
1119 the end of the list in the symset, and then only compare items before
1123 static inline unsigned short item_num(int production, int index)
1125 return production | ((31-index) << 11);
1127 static inline int item_prod(unsigned short item)
1129 return item & 0x7ff;
1131 static inline int item_index(unsigned short item)
1133 return (31-(item >> 11)) & 0x1f;
1136 For LR(1) analysis we need to compare not just the itemset in a state
1137 but also the LA sets. As we assign each unique LA set a number, we
1138 can just compare the symset and the data values together.
1141 static int itemset_cmp(struct symset a, struct symset b,
1142 enum grammar_type type)
1148 i < a.cnt && i < b.cnt &&
1149 item_index(a.syms[i]) > 0 &&
1150 item_index(b.syms[i]) > 0;
1152 int diff = a.syms[i] - b.syms[i];
1156 diff = a.data[i] - b.data[i];
1161 if (i == a.cnt || item_index(a.syms[i]) == 0)
1165 if (i == b.cnt || item_index(b.syms[i]) == 0)
1171 if (type < LR1 || av == -1)
1174 a.data[i] - b.data[i];
1177 It will be helpful to know if an itemset has been "completed" or not,
1178 particularly for LALR where itemsets get merged, at which point they
1179 need to be considered for completion again. So a `completed` flag is needed.
1181 For correct handling of `TK_newline` when parsing, we will need to
1182 know which states (itemsets) can occur at the start of a line, so we
1183 will record a `starts_line` flag too.
1185 Finally, for handling `TK_out` we need to know where production in the
1186 current state started *before* the most recent indent. A state
1187 doesn't usually keep details of individual productions, so we need to
1188 add one extra detail. `min_prefix` is the smallest non-zero number of
1189 symbols *before* DOT in any production in an itemset. This will allow
1190 us to determine if the most recent indent is sufficiently recent
1191 to cancel it against a `TK_out`. If it was seen longer ago than the
1192 `min_prefix`, and if the current state cannot be reduced, then the
1193 indented section must have ended in the middle of a syntactic unit, so
1194 an error must be signaled.
1196 And now we can build the list of itemsets. The lookup routine returns
1197 both a success flag and a pointer to where in the list an insert
1198 should happen, so we don't need to search a second time.
1202 struct itemset *next;
1204 struct symset items;
1205 struct symset go_to;
1207 unsigned short precedence;
1213 ###### grammar fields
1214 struct itemset *items;
1218 static int itemset_find(struct grammar *g, struct itemset ***where,
1219 struct symset kernel, enum grammar_type type)
1221 struct itemset **ip;
1223 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1224 struct itemset *i = *ip;
1226 diff = itemset_cmp(i->items, kernel, type);
1239 Adding an itemset may require merging the LA sets if LALR analysis is
1240 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1241 clear the `completed` flag so that the dependants of this itemset will be
1242 recalculated and their LA sets updated.
1244 `add_itemset` must consume the symsets it is passed, either by adding
1245 them to a data structure, or freeing them.
1247 static int add_itemset(struct grammar *g, struct symset ss,
1248 enum assoc assoc, unsigned short precedence,
1249 enum grammar_type type)
1251 struct itemset **where, *is;
1253 int found = itemset_find(g, &where, ss, type);
1255 is = calloc(1, sizeof(*is));
1256 is->state = g->states;
1260 is->precedence = precedence;
1262 is->go_to = INIT_DATASET;
1271 for (i = 0; i < ss.cnt; i++) {
1272 struct symset temp = INIT_SYMSET, s;
1273 if (ss.data[i] == is->items.data[i])
1275 s = set_find(g, is->items.data[i]);
1276 symset_union(&temp, &s);
1277 s = set_find(g, ss.data[i]);
1278 if (symset_union(&temp, &s)) {
1279 is->items.data[i] = save_set(g, temp);
1290 To build all the itemsets, we first insert the initial itemset made
1291 from production zero, complete each itemset, and then generate new
1292 itemsets from old until no new ones can be made.
1294 Completing an itemset means finding all the items where "DOT" is followed by
1295 a nonterminal and adding "DOT=0" items for every production from that
1296 non-terminal - providing each item hasn't already been added.
1298 If LA sets are needed, the LA set for each new item is found using
1299 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1300 be supplemented by the LA set for the item which produces the new item.
1302 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1303 is used in the next stage.
1304 If any of these symbols are flagged as starting a line, then this
1305 state must be a `starts_line` state so now is a good time to record that.
1307 When itemsets are created we assign a precedence to the itemset from
1308 the complete item, if there is one. We ignore the possibility of
1309 there being two and don't (currently) handle precedence in such
1310 grammars. When completing a grammar we ignore any item where DOT is
1311 followed by a terminal with a precedence lower (numerically higher)
1312 than that for the itemset. Unless the terminal has right
1313 associativity, we also ignore items where the terminal has the same
1314 precedence. The result is that unwanted items are still in the
1315 itemset, but the terminal doesn't get into the go to set, so the item
1318 ###### complete itemset
1319 for (i = 0; i < is->items.cnt; i++) {
1320 int p = item_prod(is->items.syms[i]);
1321 int bs = item_index(is->items.syms[i]);
1322 struct production *pr = g->productions[p];
1325 struct symset LA = INIT_SYMSET;
1326 unsigned short sn = 0;
1328 if (is->min_prefix == 0 ||
1329 (bs > 0 && bs < is->min_prefix))
1330 is->min_prefix = bs;
1331 if (bs == pr->body_size)
1334 if (s->precedence && is->precedence &&
1335 is->precedence < s->precedence)
1336 /* This terminal has a low precedence and
1337 * shouldn't be shifted
1340 if (s->precedence && is->precedence &&
1341 is->precedence == s->precedence && s->assoc != Right)
1342 /* This terminal has a matching precedence and is
1343 * not Right-associative, so we mustn't shift it.
1346 if (symset_find(&done, s->num) < 0) {
1347 symset_add(&done, s->num, 0);
1349 is->starts_line = 1;
1351 if (s->type != Nonterminal)
1357 add_first(pr, bs+1, &LA, g, &to_end);
1359 struct symset ss = set_find(g, is->items.data[i]);
1360 symset_union(&LA, &ss);
1362 sn = save_set(g, LA);
1363 LA = set_find(g, sn);
1366 /* Add productions for this symbol */
1367 for (p2 = s->first_production;
1368 p2 < g->production_count &&
1369 g->productions[p2]->head == s;
1371 int itm = item_num(p2, 0);
1372 int pos = symset_find(&is->items, itm);
1374 symset_add(&is->items, itm, sn);
1375 /* Will have re-ordered, so start
1376 * from beginning again */
1378 } else if (type >= LALR) {
1379 struct symset ss = set_find(g, is->items.data[pos]);
1380 struct symset tmp = INIT_SYMSET;
1382 symset_union(&tmp, &ss);
1383 if (symset_union(&tmp, &LA)) {
1384 is->items.data[pos] = save_set(g, tmp);
1392 For each symbol we found (and placed in `done`) we collect all the items for
1393 which this symbol is next, and create a new itemset with "DOT" advanced over
1394 the symbol. This is then added to the collection of itemsets (or merged
1395 with a pre-existing itemset).
1397 ###### derive itemsets
1398 // Now we have a completed itemset, so we need to
1399 // compute all the 'next' itemsets and create them
1400 // if they don't exist.
1401 for (i = 0; i < done.cnt; i++) {
1403 unsigned short state;
1404 struct symbol *sym = g->symtab[done.syms[i]];
1405 enum assoc assoc = Non;
1406 unsigned short precedence = 0;
1407 struct symset newitemset = INIT_SYMSET;
1409 newitemset = INIT_DATASET;
1411 for (j = 0; j < is->items.cnt; j++) {
1412 int itm = is->items.syms[j];
1413 int p = item_prod(itm);
1414 int bp = item_index(itm);
1415 struct production *pr = g->productions[p];
1416 unsigned short la = 0;
1419 if (bp == pr->body_size)
1421 if (pr->body[bp] != sym)
1424 la = is->items.data[j];
1425 pos = symset_find(&newitemset, pr->head->num);
1426 if (bp + 1 == pr->body_size &&
1427 pr->precedence > 0 &&
1429 pr->precedence < precedence)) {
1430 // new itemset is reducible and has a precedence.
1431 precedence = pr->precedence;
1435 symset_add(&newitemset, item_num(p, bp+1), la);
1436 else if (type >= LALR) {
1437 // Need to merge la set.
1438 int la2 = newitemset.data[pos];
1440 struct symset ss = set_find(g, la2);
1441 struct symset LA = INIT_SYMSET;
1442 symset_union(&LA, &ss);
1443 ss = set_find(g, la);
1444 if (symset_union(&LA, &ss))
1445 newitemset.data[pos] = save_set(g, LA);
1451 state = add_itemset(g, newitemset, assoc, precedence, type);
1452 if (symset_find(&is->go_to, done.syms[i]) < 0)
1453 symset_add(&is->go_to, done.syms[i], state);
1456 All that is left is to create the initial itemset from production zero, and
1457 with `TK_eof` as the LA set.
1460 static void build_itemsets(struct grammar *g, enum grammar_type type)
1462 struct symset first = INIT_SYMSET;
1465 unsigned short la = 0;
1467 // LA set just has eof
1468 struct symset eof = INIT_SYMSET;
1469 symset_add(&eof, TK_eof, 0);
1470 la = save_set(g, eof);
1471 first = INIT_DATASET;
1473 // production 0, offset 0 (with no data)
1474 symset_add(&first, item_num(0, 0), la);
1475 add_itemset(g, first, Non, 0, type);
1476 for (again = 0, is = g->items;
1478 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1480 struct symset done = INIT_SYMSET;
1491 ### Completing the analysis.
1493 The exact process of analysis depends on which level was requested. For
1494 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1495 `LR1` we need first, but not follow. For `SLR` we need both.
1497 We don't build the "action" tables that you might expect as the parser
1498 doesn't use them. They will be calculated to some extent if needed for
1501 Once we have built everything we allocate arrays for the two lists:
1502 symbols and itemsets. This allows more efficient access during reporting.
1503 The symbols are grouped as terminals and non-terminals and we record the
1504 changeover point in `first_nonterm`.
1506 ###### grammar fields
1507 struct symbol **symtab;
1508 struct itemset **statetab;
1511 ###### grammar_analyse
1513 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1517 int snum = TK_reserved;
1518 for (s = g->syms; s; s = s->next)
1519 if (s->num < 0 && s->type == Terminal) {
1523 g->first_nonterm = snum;
1524 for (s = g->syms; s; s = s->next)
1530 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1531 for (s = g->syms; s; s = s->next)
1532 g->symtab[s->num] = s;
1543 build_itemsets(g, type);
1545 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1546 for (is = g->items; is ; is = is->next)
1547 g->statetab[is->state] = is;
1550 ## Reporting on the Grammar
1552 The purpose of the report is to give the grammar developer insight into
1553 how the grammar parser will work. It is basically a structured dump of
1554 all the tables that have been generated, plus a description of any conflicts.
1556 ###### grammar_report
1557 static int grammar_report(struct grammar *g, enum grammar_type type)
1563 return report_conflicts(g, type);
1566 Firstly we have the complete list of symbols, together with the
1567 "FIRST" set if that was generated. We add a mark to each symbol to
1568 show if it can end in a newline (`>`), if it is considered to be
1569 "line-like" (`<`), or if it is nullable (`.`).
1573 static void report_symbols(struct grammar *g)
1577 printf("SYMBOLS + FIRST:\n");
1579 printf("SYMBOLS:\n");
1581 for (n = 0; n < g->num_syms; n++) {
1582 struct symbol *s = g->symtab[n];
1586 printf(" %c%c%c%3d%c: ",
1587 s->nullable ? '.':' ',
1588 s->can_eol ? '>':' ',
1589 s->line_like ? '<':' ',
1590 s->num, symtypes[s->type]);
1593 printf(" (%d%s)", s->precedence,
1594 assoc_names[s->assoc]);
1596 if (g->first && s->type == Nonterminal) {
1599 for (j = 0; j < g->first[n].cnt; j++) {
1602 prtxt(g->symtab[g->first[n].syms[j]]->name);
1609 Then we have the follow sets if they were computed.
1611 static void report_follow(struct grammar *g)
1614 printf("FOLLOW:\n");
1615 for (n = 0; n < g->num_syms; n++)
1616 if (g->follow[n].cnt) {
1620 prtxt(g->symtab[n]->name);
1621 for (j = 0; j < g->follow[n].cnt; j++) {
1624 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1630 And finally the item sets. These include the GO TO tables and, for
1631 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1632 it up a bit. First the items, with production number and associativity.
1634 static void report_item(struct grammar *g, int itm)
1636 int p = item_prod(itm);
1637 int dot = item_index(itm);
1638 struct production *pr = g->productions[p];
1642 prtxt(pr->head->name);
1644 for (i = 0; i < pr->body_size; i++) {
1645 printf(" %s", (dot == i ? ". ": ""));
1646 prtxt(pr->body[i]->name);
1648 if (dot == pr->body_size)
1651 if (pr->precedence && dot == pr->body_size)
1652 printf(" (%d%s)", pr->precedence,
1653 assoc_names[pr->assoc]);
1654 if (dot < pr->body_size &&
1655 pr->body[dot]->precedence) {
1656 struct symbol *s = pr->body[dot];
1657 printf(" [%d%s]", s->precedence,
1658 assoc_names[s->assoc]);
1663 The LA sets which are (possibly) reported with each item:
1665 static void report_la(struct grammar *g, int lanum)
1667 struct symset la = set_find(g, lanum);
1671 printf(" LOOK AHEAD(%d)", lanum);
1672 for (i = 0; i < la.cnt; i++) {
1675 prtxt(g->symtab[la.syms[i]]->name);
1680 Then the go to sets:
1683 static void report_goto(struct grammar *g, struct symset gt)
1688 for (i = 0; i < gt.cnt; i++) {
1690 prtxt(g->symtab[gt.syms[i]]->name);
1691 printf(" -> %d\n", gt.data[i]);
1695 Now we can report all the item sets complete with items, LA sets, and GO TO.
1697 static void report_itemsets(struct grammar *g)
1700 printf("ITEM SETS(%d)\n", g->states);
1701 for (s = 0; s < g->states; s++) {
1703 struct itemset *is = g->statetab[s];
1704 printf(" Itemset %d:%s min prefix=%d",
1705 s, is->starts_line?" (startsline)":"", is->min_prefix);
1707 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1709 for (j = 0; j < is->items.cnt; j++) {
1710 report_item(g, is->items.syms[j]);
1711 if (is->items.data != NO_DATA)
1712 report_la(g, is->items.data[j]);
1714 report_goto(g, is->go_to);
1718 ### Reporting conflicts
1720 Conflict detection varies a lot among different analysis levels. However
1721 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1722 LR1 are also similar as they have FOLLOW or LA sets.
1726 ## conflict functions
1728 static int report_conflicts(struct grammar *g, enum grammar_type type)
1731 printf("Conflicts:\n");
1733 cnt = conflicts_lr0(g, type);
1735 cnt = conflicts_slr(g, type);
1737 printf(" - no conflicts\n");
1741 LR0 conflicts are any state which has both a reducible item and
1742 a shiftable item, or two reducible items.
1744 LR05 conflicts only occur if two possible reductions exist,
1745 as shifts always over-ride reductions.
1747 ###### conflict functions
1748 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1752 for (i = 0; i < g->states; i++) {
1753 struct itemset *is = g->statetab[i];
1754 int last_reduce = -1;
1755 int prev_reduce = -1;
1756 int last_shift = -1;
1760 for (j = 0; j < is->items.cnt; j++) {
1761 int itm = is->items.syms[j];
1762 int p = item_prod(itm);
1763 int bp = item_index(itm);
1764 struct production *pr = g->productions[p];
1766 if (bp == pr->body_size) {
1767 prev_reduce = last_reduce;
1771 if (pr->body[bp]->type == Terminal)
1774 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1775 printf(" State %d has both SHIFT and REDUCE:\n", i);
1776 report_item(g, is->items.syms[last_shift]);
1777 report_item(g, is->items.syms[last_reduce]);
1780 if (prev_reduce >= 0) {
1781 printf(" State %d has 2 (or more) reducible items\n", i);
1782 report_item(g, is->items.syms[prev_reduce]);
1783 report_item(g, is->items.syms[last_reduce]);
1790 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1791 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1792 in the source of the look ahead set.
1794 We build two datasets to reflect the "action" table: one which maps
1795 terminals to items where that terminal could be shifted and another
1796 which maps terminals to items that could be reduced when the terminal
1797 is in look-ahead. We report when we get conflicts between the two.
1799 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1804 for (i = 0; i < g->states; i++) {
1805 struct itemset *is = g->statetab[i];
1806 struct symset shifts = INIT_DATASET;
1807 struct symset reduce = INIT_DATASET;
1811 /* First collect the shifts */
1812 for (j = 0; j < is->items.cnt; j++) {
1813 unsigned short itm = is->items.syms[j];
1814 int p = item_prod(itm);
1815 int bp = item_index(itm);
1816 struct production *pr = g->productions[p];
1818 if (bp < pr->body_size &&
1819 pr->body[bp]->type == Terminal) {
1821 int sym = pr->body[bp]->num;
1822 if (symset_find(&shifts, sym) < 0)
1823 symset_add(&shifts, sym, itm);
1826 /* Now look for reduction and conflicts */
1827 for (j = 0; j < is->items.cnt; j++) {
1828 unsigned short itm = is->items.syms[j];
1829 int p = item_prod(itm);
1830 int bp = item_index(itm);
1831 struct production *pr = g->productions[p];
1833 if (bp < pr->body_size)
1838 la = g->follow[pr->head->num];
1840 la = set_find(g, is->items.data[j]);
1842 for (k = 0; k < la.cnt; k++) {
1843 int pos = symset_find(&shifts, la.syms[k]);
1845 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1846 prtxt(g->symtab[la.syms[k]]->name);
1848 report_item(g, shifts.data[pos]);
1849 report_item(g, itm);
1852 pos = symset_find(&reduce, la.syms[k]);
1854 symset_add(&reduce, la.syms[k], itm);
1857 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1858 prtxt(g->symtab[la.syms[k]]->name);
1860 report_item(g, itm);
1861 report_item(g, reduce.data[pos]);
1865 symset_free(shifts);
1866 symset_free(reduce);
1872 ## Generating the parser
1874 The exported part of the parser is the `parse_XX` function, where the name
1875 `XX` is based on the name of the parser files.
1877 This takes a `code_node`, a partially initialized `token_config`, and an
1878 optional `FILE` to send tracing to. The `token_config` gets the list of
1879 known words added and then is used with the `code_node` to initialize the
1882 `parse_XX` then calls the library function `parser_run` to actually complete
1883 the parse. This needs the `states` table and function to call the various
1884 pieces of code provided in the grammar file, so they are generated first.
1886 ###### parser_generate
1888 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1894 gen_reduce(f, g, file);
1897 fprintf(f, "#line 0 \"gen_parser\"\n");
1898 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1901 fprintf(f, "\tstruct token_state *tokens;\n");
1902 fprintf(f, "\tconfig->words_marks = known;\n");
1903 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1904 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1905 fprintf(f, "\ttokens = token_open(code, config);\n");
1906 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1907 fprintf(f, "\ttoken_close(tokens);\n");
1908 fprintf(f, "\treturn rv;\n");
1909 fprintf(f, "}\n\n");
1912 ### Known words table
1914 The known words table is simply an array of terminal symbols.
1915 The table of nonterminals used for tracing is a similar array.
1919 static void gen_known(FILE *f, struct grammar *g)
1922 fprintf(f, "#line 0 \"gen_known\"\n");
1923 fprintf(f, "static const char *known[] = {\n");
1924 for (i = TK_reserved;
1925 i < g->num_syms && g->symtab[i]->type == Terminal;
1927 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1928 g->symtab[i]->name.txt);
1929 fprintf(f, "};\n\n");
1932 static void gen_non_term(FILE *f, struct grammar *g)
1935 fprintf(f, "#line 0 \"gen_non_term\"\n");
1936 fprintf(f, "static const char *non_term[] = {\n");
1937 for (i = TK_reserved;
1940 if (g->symtab[i]->type == Nonterminal)
1941 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1942 g->symtab[i]->name.txt);
1943 fprintf(f, "};\n\n");
1946 ### States and the goto tables.
1948 For each state we record the goto table, the reducible production if
1949 there is one, or a symbol to shift for error recovery.
1950 Some of the details of the reducible production are stored in the
1951 `do_reduce` function to come later. Here we store the production number,
1952 the body size (useful for stack management) and the resulting symbol (useful
1953 for knowing how to free data later).
1955 The go to table is stored in a simple array of `sym` and corresponding
1958 ###### exported types
1966 const struct lookup * go_to;
1977 static void gen_goto(FILE *f, struct grammar *g)
1980 fprintf(f, "#line 0 \"gen_goto\"\n");
1981 for (i = 0; i < g->states; i++) {
1983 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1985 struct symset gt = g->statetab[i]->go_to;
1986 for (j = 0; j < gt.cnt; j++)
1987 fprintf(f, "\t{ %d, %d },\n",
1988 gt.syms[j], gt.data[j]);
1995 static void gen_states(FILE *f, struct grammar *g)
1998 fprintf(f, "#line 0 \"gen_states\"\n");
1999 fprintf(f, "static const struct state states[] = {\n");
2000 for (i = 0; i < g->states; i++) {
2001 struct itemset *is = g->statetab[i];
2002 int j, prod = -1, prod_len;
2004 for (j = 0; j < is->items.cnt; j++) {
2005 int itm = is->items.syms[j];
2006 int p = item_prod(itm);
2007 int bp = item_index(itm);
2008 struct production *pr = g->productions[p];
2010 if (bp < pr->body_size)
2012 /* This is what we reduce */
2013 if (prod < 0 || prod_len < pr->body_size) {
2015 prod_len = pr->body_size;
2020 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d },\n",
2021 i, is->go_to.cnt, i, prod,
2022 g->productions[prod]->body_size,
2023 g->productions[prod]->head->num,
2024 is->starts_line, is->min_prefix);
2026 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
2027 i, is->go_to.cnt, i,
2028 is->starts_line, is->min_prefix);
2030 fprintf(f, "};\n\n");
2033 ### The `do_reduce` function and the code
2035 When the parser engine decides to reduce a production, it calls `do_reduce`.
2036 This has two functions.
2038 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
2039 production being reduced. Secondly it runs the code that was included with
2040 the production if any.
2042 This code needs to be able to store data somewhere. Rather than requiring
2043 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
2044 `do_reduce` return the size to be saved.
2046 In order for the code to access "global" context, we pass in the
2047 "config" pointer that was passed to parser function. If the `struct
2048 token_config` is embedded in some larger structure, the reducing code
2049 can access the larger structure using pointer manipulation.
2051 The code fragment requires translation when written out. Any `$N` needs to
2052 be converted to a reference either to that buffer (if `$0`) or to the
2053 structure returned by a previous reduction. These pointers need to be cast
2054 to the appropriate type for each access. All this is handled in
2057 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
2058 This applies only to symbols with references (or pointers), not those with structures.
2059 The `<` implies that the reference is being moved out, so the object will not be
2060 automatically freed. This is equivalent to assigning `NULL` to the pointer.
2064 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2067 char *used = calloc(1, p->body_size);
2070 fprintf(f, "\t\t\t");
2071 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2085 if (*c < '0' || *c > '9') {
2092 while (c[1] >= '0' && c[1] <= '9') {
2094 n = n * 10 + *c - '0';
2097 fprintf(f, "(*(struct %.*s*%s)ret)",
2098 p->head->struct_name.len,
2099 p->head->struct_name.txt,
2100 p->head->isref ? "*":"");
2101 else if (n > p->body_size)
2102 fprintf(f, "$%d", n);
2103 else if (p->body[n-1]->type == Terminal)
2104 fprintf(f, "(*(struct token *)body[%d])",
2106 else if (p->body[n-1]->struct_name.txt == NULL)
2107 fprintf(f, "$%d", n);
2109 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2110 p->body[n-1]->struct_name.len,
2111 p->body[n-1]->struct_name.txt,
2112 p->body[n-1]->isref ? "*":"", n-1);
2117 for (i = 0; i < p->body_size; i++) {
2118 if (p->body[i]->struct_name.txt &&
2119 p->body[i]->isref &&
2121 // assume this has been copied out
2122 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2129 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2132 fprintf(f, "#line 0 \"gen_reduce\"\n");
2133 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2135 fprintf(f, "\tint ret_size = 0;\n");
2137 fprintf(f, "\tswitch(prod) {\n");
2138 for (i = 0; i < g->production_count; i++) {
2139 struct production *p = g->productions[i];
2140 fprintf(f, "\tcase %d:\n", i);
2143 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2147 if (p->head->struct_name.txt)
2148 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2149 p->head->struct_name.len,
2150 p->head->struct_name.txt,
2151 p->head->isref ? "*":"");
2153 fprintf(f, "\t\tbreak;\n");
2155 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2160 As each non-terminal can potentially cause a different type of data
2161 structure to be allocated and filled in, we need to be able to free it when
2164 It is particularly important to have fine control over freeing during error
2165 recovery where individual stack frames might need to be freed.
2167 For this, the grammar author is required to define a `free_XX` function for
2168 each structure that is used by a non-terminal. `do_free` will call whichever
2169 is appropriate given a symbol number, and will call `free` (as is
2170 appropriate for tokens) on any terminal symbol.
2174 static void gen_free(FILE *f, struct grammar *g)
2178 fprintf(f, "#line 0 \"gen_free\"\n");
2179 fprintf(f, "static void do_free(short sym, void *asn)\n");
2181 fprintf(f, "\tif (!asn) return;\n");
2182 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2183 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2184 fprintf(f, "\tswitch(sym) {\n");
2186 for (i = 0; i < g->num_syms; i++) {
2187 struct symbol *s = g->symtab[i];
2189 s->type != Nonterminal ||
2190 s->struct_name.txt == NULL)
2193 fprintf(f, "\tcase %d:\n", s->num);
2195 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2197 s->struct_name.txt);
2198 fprintf(f, "\t\tfree(asn);\n");
2200 fprintf(f, "\t\tfree_%.*s(asn);\n",
2202 s->struct_name.txt);
2203 fprintf(f, "\t\tbreak;\n");
2205 fprintf(f, "\t}\n}\n\n");
2208 ## The main routine.
2210 There are three key parts to the "main" function of parsergen: processing
2211 the arguments, loading the grammar file, and dealing with the grammar.
2213 ### Argument processing.
2215 Command line options allow the selection of analysis mode, name of output
2216 file, and whether or not a report should be generated. By default we create
2217 a report only if no code output was requested.
2219 The `parse_XX` function name uses the basename of the output file which
2220 should not have a suffix (`.c`). `.c` is added to the given name for the
2221 code, and `.h` is added for the header (if header text is specified in the
2228 static const struct option long_options[] = {
2229 { "LR0", 0, NULL, '0' },
2230 { "LR05", 0, NULL, '5' },
2231 { "SLR", 0, NULL, 'S' },
2232 { "LALR", 0, NULL, 'L' },
2233 { "LR1", 0, NULL, '1' },
2234 { "tag", 1, NULL, 't' },
2235 { "report", 0, NULL, 'R' },
2236 { "output", 1, NULL, 'o' },
2237 { NULL, 0, NULL, 0 }
2239 const char *options = "05SL1t:Ro:";
2241 ###### process arguments
2243 char *outfile = NULL;
2248 enum grammar_type type = LR05;
2249 while ((opt = getopt_long(argc, argv, options,
2250 long_options, NULL)) != -1) {
2265 outfile = optarg; break;
2267 tag = optarg; break;
2269 fprintf(stderr, "Usage: parsergen ...\n");
2274 infile = argv[optind++];
2276 fprintf(stderr, "No input file given\n");
2279 if (outfile && report == 1)
2282 if (name && strchr(name, '/'))
2283 name = strrchr(name, '/')+1;
2285 if (optind < argc) {
2286 fprintf(stderr, "Excess command line arguments\n");
2290 ### Loading the grammar file
2292 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2295 Once we have extracted the code (with `mdcode`) we expect to find three
2296 sections: header, code, and grammar. Anything else that is not
2297 excluded by the `--tag` option is an error.
2299 "header" and "code" are optional, though it is hard to build a working
2300 parser with neither. "grammar" must be provided.
2304 #include <sys/mman.h>
2309 static void pr_err(char *msg)
2312 fprintf(stderr, "%s\n", msg);
2316 struct section *table;
2320 fd = open(infile, O_RDONLY);
2322 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2323 infile, strerror(errno));
2326 len = lseek(fd, 0, 2);
2327 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2328 table = code_extract(file, file+len, pr_err);
2330 struct code_node *hdr = NULL;
2331 struct code_node *code = NULL;
2332 struct code_node *gram = NULL;
2333 for (s = table; s; s = s->next) {
2334 struct text sec = s->section;
2335 if (tag && !strip_tag(&sec, tag))
2337 if (text_is(sec, "header"))
2339 else if (text_is(sec, "code"))
2341 else if (text_is(sec, "grammar"))
2344 fprintf(stderr, "Unknown content section: %.*s\n",
2345 s->section.len, s->section.txt);
2350 ### Processing the input
2352 As we need to append an extension to a filename and then open it for
2353 writing, and we need to do this twice, it helps to have a separate function.
2357 static FILE *open_ext(char *base, char *ext)
2359 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2361 strcat(strcpy(fn, base), ext);
2367 If we can read the grammar, then we analyse and optionally report on it. If
2368 the report finds conflicts we will exit with an error status.
2370 ###### process input
2371 struct grammar *g = NULL;
2373 fprintf(stderr, "No grammar section provided\n");
2376 g = grammar_read(gram);
2378 fprintf(stderr, "Failure to parse grammar\n");
2383 grammar_analyse(g, type);
2385 if (grammar_report(g, type))
2389 If a "headers" section is defined, we write it out and include a declaration
2390 for the `parse_XX` function so it can be used from separate code.
2392 if (rv == 0 && hdr && outfile) {
2393 FILE *f = open_ext(outfile, ".h");
2395 code_node_print(f, hdr, infile);
2396 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2400 fprintf(stderr, "Cannot create %s.h\n",
2406 And if all goes well and an output file was provided, we create the `.c`
2407 file with the code section (if any) and the parser tables and function.
2409 if (rv == 0 && outfile) {
2410 FILE *f = open_ext(outfile, ".c");
2413 code_node_print(f, code, infile);
2414 gen_parser(f, g, infile, name);
2417 fprintf(stderr, "Cannot create %s.c\n",
2423 And that about wraps it up. We need to set the locale so that UTF-8 is
2424 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2426 ###### File: parsergen.mk
2427 parsergen : parsergen.o libscanner.o libmdcode.o
2428 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2435 int main(int argc, char *argv[])
2440 setlocale(LC_ALL,"");
2442 ## process arguments
2449 ## The SHIFT/REDUCE parser
2451 Having analysed the grammar and generated all the tables, we only need the
2452 shift/reduce engine to bring it all together.
2454 ### Goto table lookup
2456 The parser generator has nicely provided us with goto tables sorted by
2457 symbol number. We need a binary search function to find a symbol in the
2460 ###### parser functions
2462 static int search(const struct state *l, int sym)
2465 int hi = l->go_to_cnt;
2469 while (lo + 1 < hi) {
2470 int mid = (lo + hi) / 2;
2471 if (l->go_to[mid].sym <= sym)
2476 if (l->go_to[lo].sym == sym)
2477 return l->go_to[lo].state;
2482 ### The state stack.
2484 The core data structure for the parser is the stack. This tracks all the
2485 symbols that have been recognised or partially recognised.
2487 The stack usually won't grow very large - maybe a few tens of entries. So
2488 we dynamically resize an array as required but never bother to shrink it
2491 We keep the stack as two separate allocations. One, `asn_stack` stores the
2492 "abstract syntax nodes" which are created by each reduction. When we call
2493 `do_reduce` we need to pass an array of the `asn`s of the body of the
2494 production, and by keeping a separate `asn` stack, we can just pass a
2495 pointer into this stack.
2497 The other allocation stores all other stack fields of which there are six.
2498 The `state` is the most important one and guides the parsing process. The
2499 `sym` is nearly unnecessary. However when we want to free entries from the
2500 `asn_stack`, it helps to know what type they are so we can call the right
2501 freeing function. The symbol leads us to the right free function through
2504 The `indents` count tracks the line indents within the symbol or
2505 immediately following it. These are used to allow indent information to
2506 guide parsing and error recovery.
2508 `since_newline` tracks how many stack frames since the last
2509 start-of-line (whether indented or not). So if `since_newline` is
2510 zero, then this symbol is at the start of a line. Similarly
2511 `since_indent` counts the number of states since an indent, it is zero
2512 precisely when `indents` is not zero.
2514 `newline_permitted` keeps track of whether newlines should be ignored
2517 The stack is most properly seen as alternating states and symbols -
2518 states, like the 'DOT' in items, are between symbols. Each frame in
2519 our stack holds a state and the symbol that was before it. The
2520 bottom of stack holds the start state but no symbol, as nothing came
2521 before the beginning.
2523 ###### parser functions
2528 short newline_permitted;
2532 short since_newline;
2542 Two operations are needed on the stack - shift (which is like push) and pop.
2544 Shift applies not only to terminals but also to non-terminals. When
2545 we reduce a production we will pop off entries corresponding to the
2546 body symbols, then push on an item for the head of the production.
2547 This last is exactly the same process as shifting in a terminal so we
2548 use the same function for both. In both cases we provide the symbol,
2549 the number of indents the symbol contains (which will be zero for a
2550 terminal symbol) and a flag indicating that the symbol was at (or was
2551 reduced from a symbol which was at) the start of a line. The state is
2552 deduced from the current top-of-stack state and the new symbol.
2554 To simplify other code we arrange for `shift` to fail if there is no `goto`
2555 state for the symbol. This is useful in basic parsing due to our design
2556 that we shift when we can, and reduce when we cannot. So the `shift`
2557 function reports if it could.
2559 `shift` is also used to push state zero onto the stack, so if the
2560 stack is empty, it always chooses zero as the next state.
2562 So `shift` finds the next state. If that succeeds it extends the
2563 allocations if needed and pushes all the information onto the stacks.
2565 Newlines are permitted after a `starts_line` state until an internal
2566 indent. If the new frame has neither a `starts_line` state nor an
2567 indent, newlines are permitted if the previous stack frame permitted
2570 ###### parser functions
2572 static int shift(struct parser *p,
2573 short sym, short indents, short start_of_line,
2575 const struct state states[])
2577 // Push an entry onto the stack
2578 struct frame next = {0};
2579 int newstate = p->tos
2580 ? search(&states[p->stack[p->tos-1].state],
2585 if (p->tos >= p->stack_size) {
2586 p->stack_size += 10;
2587 p->stack = realloc(p->stack, p->stack_size
2588 * sizeof(p->stack[0]));
2589 p->asn_stack = realloc(p->asn_stack, p->stack_size
2590 * sizeof(p->asn_stack[0]));
2593 next.indents = indents;
2594 next.state = newstate;
2595 if (states[newstate].starts_line)
2596 next.newline_permitted = 1;
2598 next.newline_permitted = 0;
2600 next.newline_permitted =
2601 p->stack[p->tos-1].newline_permitted;
2603 next.newline_permitted = 0;
2605 if (!start_of_line) {
2607 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2609 next.since_newline = 1;
2612 next.since_indent = 0;
2614 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2616 next.since_indent = 1;
2618 p->stack[p->tos] = next;
2619 p->asn_stack[p->tos] = asn;
2624 `pop` primarily moves the top of stack (`tos`) back down the required
2625 amount and frees any `asn` entries that need to be freed. It also
2626 collects a summary of the indents and line starts in the symbols that
2627 are being removed. It is called _after_ we reduce a production, just
2628 before we `shift` the nonterminal in.
2630 ###### parser functions
2632 static int pop(struct parser *p, int num,
2633 short *start_of_line,
2634 void(*do_free)(short sym, void *asn))
2641 for (i = 0; i < num; i++) {
2642 sol |= !p->stack[p->tos+i].since_newline;
2643 indents += p->stack[p->tos+i].indents;
2644 do_free(p->stack[p->tos+i].sym,
2645 p->asn_stack[p->tos+i]);
2648 *start_of_line = sol;
2652 ### Memory allocation
2654 The `scanner` returns tokens in a local variable - we want them in allocated
2655 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2656 a reduce is in a large buffer. Both of these require some allocation and
2657 copying, hence `memdup` and `tokcopy`.
2659 ###### parser includes
2662 ###### parser functions
2664 void *memdup(void *m, int len)
2670 memcpy(ret, m, len);
2674 static struct token *tok_copy(struct token tk)
2676 struct token *new = malloc(sizeof(*new));
2681 ### The heart of the parser.
2683 Now we have the parser. If we can shift we do, though newlines and
2684 reducing indenting may block that. If not and we can reduce we do
2685 that. If the production we reduced was production zero, then we have
2686 accepted the input and can finish.
2688 We return whatever `asn` was returned by reducing production zero.
2690 If we can neither shift nor reduce we have an error to handle. We pop
2691 single entries off the stack until we can shift the `TK_error` symbol, then
2692 drop input tokens until we find one we can shift into the new error state.
2694 When we find `TK_in` and `TK_out` tokens which report indents we need
2695 to handle them directly as the grammar cannot express what we want to
2698 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2699 record how many indents there are following the previous token.
2701 `TK_out` tokens must be canceled against an indent count
2702 within the stack. If we can reduce some symbols that are all since
2703 the most recent indent, then we do that first. If the minimum prefix
2704 of the current state then extends back before the most recent indent,
2705 that indent can be cancelled. If the minimum prefix is shorter then
2706 the indent had ended prematurely and we must start error handling, which
2707 is still a work-in-progress.
2709 `TK_newline` tokens are ignored unless the top stack frame records
2710 that they are permitted. In that case they will not be considered for
2711 shifting if it is possible to reduce some symbols that are all since
2712 the most recent start of line. This is how a newline forcibly
2713 terminates any line-like structure - we try to reduce down to at most
2714 one symbol for each line where newlines are allowed.
2716 When, during error handling, we discard tokens read in, we want to keep
2717 discarding until we see one that is recognised. If we had a full set
2718 of LR(1) grammar states, this will mean looking in the look-ahead set,
2719 but we don't keep a full look-ahead set. We only record the subset
2720 that leads to SHIFT. We can, however, deduce the look-ahead set by
2721 looking at the SHIFT subsets for all states that we can get to by
2722 reducing zero or more times. So we need a little function which
2723 checks if a given token is in any of these look-ahead sets.
2725 ###### parser includes
2730 static int in_lookahead(struct token *tk, const struct state *states, int state)
2732 while (state >= 0) {
2733 if (search(&states[state], tk->num) >= 0)
2735 if (states[state].reduce_prod < 0)
2737 state = search(&states[state], states[state].reduce_sym);
2742 void *parser_run(struct token_state *tokens,
2743 const struct state states[],
2744 int (*do_reduce)(int, void**, struct token_config*, void*),
2745 void (*do_free)(short, void*),
2746 FILE *trace, const char *non_term[],
2747 struct token_config *config)
2749 struct parser p = { 0 };
2750 struct token *tk = NULL;
2754 shift(&p, TK_eof, 0, 1, NULL, states);
2756 struct token *err_tk;
2757 struct frame *tos = &p.stack[p.tos-1];
2759 tk = tok_copy(token_next(tokens));
2760 parser_trace(trace, &p,
2761 tk, states, non_term, config->known_count);
2763 if (tk->num == TK_in) {
2765 tos->since_newline = 0;
2766 tos->since_indent = 0;
2767 if (!states[tos->state].starts_line)
2768 tos->newline_permitted = 0;
2771 parser_trace_action(trace, "Record");
2774 if (tk->num == TK_out) {
2775 if (states[tos->state].reduce_size >= 0 &&
2776 states[tos->state].reduce_size <= tos->since_indent)
2778 if (states[tos->state].min_prefix >= tos->since_indent) {
2780 struct frame *in = tos - tos->since_indent;
2782 if (in->indents == 0) {
2783 /* Reassess since_indent and newline_permitted */
2785 in->since_indent = in[-1].since_indent + 1;
2786 in->newline_permitted = in[-1].newline_permitted;
2788 in->since_indent = 0;
2789 in->newline_permitted = 0;
2791 if (states[in->state].starts_line)
2792 in->newline_permitted = 1;
2795 in->since_indent = in[-1].since_indent + 1;
2796 if (states[in->state].starts_line)
2797 in->newline_permitted = 1;
2799 in->newline_permitted = in[-1].newline_permitted;
2804 parser_trace_action(trace, "Cancel");
2807 // fall through to error handling as both SHIFT and REDUCE
2810 if (tk->num == TK_newline) {
2811 if (!tos->newline_permitted) {
2814 parser_trace_action(trace, "Discard");
2817 if (tos->since_newline > 1 &&
2818 states[tos->state].reduce_size >= 0 &&
2819 states[tos->state].reduce_size <= tos->since_newline)
2822 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2824 parser_trace_action(trace, "Shift");
2828 if (states[tos->state].reduce_prod >= 0) {
2831 const struct state *nextstate = &states[tos->state];
2832 int prod = nextstate->reduce_prod;
2833 int size = nextstate->reduce_size;
2835 static char buf[16*1024];
2836 short indents, start_of_line;
2838 body = p.asn_stack + (p.tos - size);
2840 bufsize = do_reduce(prod, body, config, buf);
2842 indents = pop(&p, size, &start_of_line,
2844 res = memdup(buf, bufsize);
2845 memset(buf, 0, bufsize);
2846 if (!shift(&p, nextstate->reduce_sym,
2847 indents, start_of_line,
2849 if (prod != 0) abort();
2853 parser_trace_action(trace, "Reduce");
2856 /* Error. We walk up the stack until we
2857 * find a state which will accept TK_error.
2858 * We then shift in TK_error and see what state
2859 * that takes us too.
2860 * Then we discard input tokens until
2861 * we find one that is acceptable.
2863 parser_trace_action(trace, "ERROR");
2864 short indents = 0, start_of_line;
2866 err_tk = tok_copy(*tk);
2868 shift(&p, TK_error, 0, 0,
2869 err_tk, states) == 0)
2870 // discard this state
2871 indents += pop(&p, 1, &start_of_line, do_free);
2874 // no state accepted TK_error
2877 tos = &p.stack[p.tos-1];
2878 while (!in_lookahead(tk, states, tos->state) &&
2879 tk->num != TK_eof) {
2881 tk = tok_copy(token_next(tokens));
2882 if (tk->num == TK_in)
2884 if (tk->num == TK_out) {
2888 // FIXME update since_indent here
2891 if (p.tos == 0 && tk->num == TK_eof)
2893 tos = &p.stack[p.tos-1];
2894 tos->indents += indents;
2898 pop(&p, p.tos, NULL, do_free);
2904 ###### exported functions
2905 void *parser_run(struct token_state *tokens,
2906 const struct state states[],
2907 int (*do_reduce)(int, void**, struct token_config*, void*),
2908 void (*do_free)(short, void*),
2909 FILE *trace, const char *non_term[],
2910 struct token_config *config);
2914 Being able to visualize the parser in action can be invaluable when
2915 debugging the parser code, or trying to understand how the parse of a
2916 particular grammar progresses. The stack contains all the important
2917 state, so just printing out the stack every time around the parse loop
2918 can make it possible to see exactly what is happening.
2920 This doesn't explicitly show each SHIFT and REDUCE action. However they
2921 are easily deduced from the change between consecutive lines, and the
2922 details of each state can be found by cross referencing the states list
2923 in the stack with the "report" that parsergen can generate.
2925 For terminal symbols, we just dump the token. For non-terminals we
2926 print the name of the symbol. The look ahead token is reported at the
2927 end inside square brackets.
2929 ###### parser functions
2931 static char *reserved_words[] = {
2932 [TK_error] = "ERROR",
2935 [TK_newline] = "NEWLINE",
2938 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2940 fprintf(trace, "(%d", f->state);
2941 if (states[f->state].starts_line)
2942 fprintf(trace, "s");
2943 if (f->newline_permitted)
2944 fprintf(trace, "n%d", f->since_newline);
2945 fprintf(trace, ") ");
2948 void parser_trace(FILE *trace, struct parser *p,
2949 struct token *tk, const struct state states[],
2950 const char *non_term[], int knowns)
2955 for (i = 0; i < p->tos; i++) {
2956 struct frame *f = &p->stack[i];
2959 if (sym < TK_reserved &&
2960 reserved_words[sym] != NULL)
2961 fputs(reserved_words[sym], trace);
2962 else if (sym < TK_reserved + knowns) {
2963 struct token *t = p->asn_stack[i];
2964 text_dump(trace, t->txt, 20);
2966 fputs(non_term[sym - TK_reserved - knowns],
2969 fprintf(trace, ".%d", f->indents);
2970 if (f->since_newline == 0)
2974 parser_trace_state(trace, f, states);
2976 fprintf(trace, "[");
2977 if (tk->num < TK_reserved &&
2978 reserved_words[tk->num] != NULL)
2979 fputs(reserved_words[tk->num], trace);
2981 text_dump(trace, tk->txt, 20);
2985 void parser_trace_action(FILE *trace, char *action)
2988 fprintf(trace, " - %s\n", action);
2993 The obvious example for a parser is a calculator.
2995 As `scanner` provides a number parsing function using `libgmp` it is not much
2996 work to perform arbitrary rational number calculations.
2998 This calculator takes one expression, or an equality test, per line. The
2999 results are printed and if any equality test fails, the program exits with
3002 ###### File: parsergen.mk
3003 calc.c calc.h : parsergen parsergen.mdc
3004 ./parsergen --tag calc -o calc parsergen.mdc
3005 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3006 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3011 // what do we use for a demo-grammar? A calculator of course.
3023 #include <sys/mman.h>
3028 #include "scanner.h"
3034 static void free_number(struct number *n)
3040 int main(int argc, char *argv[])
3042 int fd = open(argv[1], O_RDONLY);
3043 int len = lseek(fd, 0, 2);
3044 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3045 struct section *s = code_extract(file, file+len, NULL);
3046 struct token_config config = {
3047 .ignored = (1 << TK_line_comment)
3048 | (1 << TK_block_comment)
3051 .number_chars = ".,_+-",
3055 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3057 struct section *t = s->next;
3070 Session -> Session Line
3073 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3074 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3075 gmp_printf(" or as a decimal: %Fg\n", fl);
3079 | Expression = Expression NEWLINE ${
3080 if (mpq_equal($1.val, $3.val))
3081 gmp_printf("Both equal %Qd\n", $1.val);
3083 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3088 | NEWLINE ${ printf("Blank line\n"); }$
3089 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3092 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3093 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3094 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3095 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3096 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3097 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$