1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h` files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
61 If the `--tag` option is given, then any top level header that doesn't
62 start with the tag is ignored, and the tag is stripped from the rest. So
64 means that the three needed sections must be `Foo: header`, `Foo: code`,
68 [scanner]: scanner.html
74 ###### parser includes
78 The grammar contains production sets, precedence/associativity
79 declarations, and data type declarations. These are all parsed with
80 _ad hoc_ parsing as we don't have a parser generator yet.
82 The precedence and associativity can be set for each production, but
83 can be inherited from symbols. The data type is potentially defined
84 for each non-terminal and describes what C structure is used to store
85 information about each symbol.
88 enum assoc {Left, Right, Non};
89 char *assoc_names[] = {"Left","Right","Non"};
92 struct text struct_name;
94 unsigned short precedence;
98 unsigned short precedence;
106 The strings reported by `mdcode` and `scanner` are `struct text` which have
107 length rather than being null terminated. To help with printing and
108 comparing we define `text_is` and `prtxt`, which should possibly go in
109 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
110 which might contain control characters.
112 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
113 because that is what we need to detect tags.
116 static int text_is(struct text t, char *s)
118 return (strlen(s) == t.len &&
119 strncmp(s, t.txt, t.len) == 0);
121 static void prtxt(struct text t)
123 printf("%.*s", t.len, t.txt);
126 static int strip_tag(struct text *t, char *tag)
128 int skip = strlen(tag) + 1;
129 if (skip >= t->len ||
130 strncmp(t->txt, tag, skip-1) != 0 ||
131 t->txt[skip-1] != ':')
133 while (skip < t->len && t->txt[skip] == ' ')
142 Productions are comprised primarily of symbols - terminal and
143 non-terminal. We do not make a syntactic distinction between the two,
144 though non-terminals must be identifiers. Non-terminal symbols are
145 those which appear in the head of a production, terminal symbols are
146 those which don't. There are also "virtual" symbols used for precedence
147 marking discussed later, and sometimes we won't know what type a symbol
150 ###### forward declarations
151 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
152 char *symtypes = "UVTN";
156 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
157 table of known symbols and the resulting parser will report them as
158 `TK_reserved + N`. A small set of identifiers are reserved for the
159 different token types that `scanner` can report.
162 static char *reserved_words[] = {
163 [TK_error] = "ERROR",
164 [TK_number] = "NUMBER",
165 [TK_ident] = "IDENTIFIER",
167 [TK_string] = "STRING",
168 [TK_multi_string] = "MULTI_STRING",
171 [TK_newline] = "NEWLINE",
177 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
178 recognised. The former is automatically expected at the end of the text
179 being parsed. The latter are always ignored by the parser.
181 All of these symbols are stored in a simple symbol table. We use the
182 `struct text` from `mdcode` to store the name and link them together
183 into a sorted list using an insertion sort.
185 We don't have separate `find` and `insert` functions as any symbol we
186 find needs to be remembered. We simply expect `find` to always return a
187 symbol, but its type might be `Unknown`.
196 ###### grammar fields
201 static int text_cmp(struct text a, struct text b)
206 int cmp = strncmp(a.txt, b.txt, len);
210 return a.len - b.len;
213 static struct symbol *sym_find(struct grammar *g, struct text s)
215 struct symbol **l = &g->syms;
220 (cmp = text_cmp((*l)->name, s)) < 0)
224 n = calloc(1, sizeof(*n));
233 static void symbols_init(struct grammar *g)
235 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
237 for (i = 0; i < entries; i++) {
240 t.txt = reserved_words[i];
243 t.len = strlen(t.txt);
250 ### Data types and precedence.
252 Data type specification and precedence specification are both
253 introduced by a dollar sign at the start of the line. If the next
254 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
255 precedence, otherwise it specifies a data type.
257 The data type name is simply stored and applied to the head of all
258 subsequent productions. It must be the name of a structure, so `$expr`
259 maps to `struct expr`.
261 Any productions given before the first data type will have no data type
262 and can carry no information. In order to allow other non-terminals to
263 have no type, the data type `$void` can be given. This does *not* mean
264 that `struct void` will be used, but rather that no type will be
265 associated with future non-terminals.
267 The precedence line must contain a list of symbols - typically
268 terminal symbols, but not necessarily. It can only contain symbols
269 that have not been seen yet, so precedence declaration must precede
270 the productions that they affect.
272 A precedence line may also contain a "virtual" symbol which is an
273 identifier preceded by `$$`. Virtual terminals carry precedence
274 information but are not included in the grammar. A production can
275 declare that it inherits the precedence of a given virtual symbol.
277 This use for `$$` precludes it from being used as a symbol in the
278 described language. Two other symbols: `${` and `}$` are also
281 Each new precedence line introduces a new precedence level and
282 declares how it associates. This level is stored in each symbol
283 listed and may be inherited by any production which uses the symbol. A
284 production inherits from the last symbol which has a precedence.
286 ###### grammar fields
287 struct text current_type;
291 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
292 static const char *known[] = { "$$", "${", "}$" };
295 static char *dollar_line(struct token_state *ts, struct grammar *g)
297 struct token t = token_next(ts);
302 if (t.num != TK_ident) {
303 err = "type or assoc expected after '$'";
306 if (text_is(t.txt, "LEFT"))
308 else if (text_is(t.txt, "RIGHT"))
310 else if (text_is(t.txt, "NON"))
313 g->current_type = t.txt;
314 if (text_is(t.txt, "void"))
315 g->current_type.txt = NULL;
317 if (t.num != TK_newline) {
318 err = "Extra tokens after type name";
324 // This is a precedence line, need some symbols.
328 while (t.num != TK_newline) {
329 enum symtype type = Terminal;
331 if (t.num == TK_virtual) {
334 if (t.num != TK_ident) {
335 err = "$$ must be followed by a word";
338 } else if (t.num != TK_ident &&
340 err = "Illegal token in precedence line";
343 s = sym_find(g, t.txt);
344 if (s->type != Unknown) {
345 err = "Symbols in precedence line must not already be known.";
349 s->precedence = g->prec_levels;
354 err = "No symbols given on precedence line";
358 while (t.num != TK_newline && t.num != TK_eof)
365 A production either starts with an identifier which is the head
366 non-terminal, or a vertical bar (`|`) in which case this production
367 uses the same head as the previous one. The identifier must be
368 followed by a `->` mark. All productions for a given non-terminal must
369 be grouped together with the `nonterminal ->` given only once.
371 After this a (possibly empty) sequence of identifiers and marks form
372 the body of the production. A virtual symbol may be given after the
373 body by preceding it with `$$`. If a virtual symbol is given, the
374 precedence of the production is that for the virtual symbol. If none
375 is given, the precedence is inherited from the last symbol in the
376 production which has a precedence specified.
378 After the optional precedence may come the `${` mark. This indicates
379 the start of a code fragment. If present, this must be on the same
380 line as the start of the production.
382 All of the text from the `${` through to the matching `}$` is
383 collected and forms the code-fragment for the production. It must all
384 be in one `code_node` of the literate code. The `}$` must be
385 at the end of a line.
387 Text in the code fragment will undergo substitutions where `$N` for
388 some numeric `N` will be replaced with a variable holding the parse
389 information for the particular symbol in the production. `$0` is the
390 head of the production, `$1` is the first symbol of the body, etc.
391 The type of `$N` for a terminal symbol is `struct token`. For
392 a non-terminal, it is whatever has been declared for that symbol.
394 While building productions we will need to add to an array which needs to
398 static void array_add(void *varray, int *cnt, void *new)
400 void ***array = varray;
403 current = ((*cnt-1) | (step-1))+1;
404 if (*cnt == current) {
407 *array = realloc(*array, current * sizeof(void*));
409 (*array)[*cnt] = new;
413 Collecting the code fragment simply involves reading tokens until we
414 hit the end token `}$`, and noting the character position of the start and
418 static struct text collect_code(struct token_state *state,
423 code.txt = start.txt.txt + start.txt.len;
425 t = token_next(state);
426 while (t.node == start.node &&
427 t.num != TK_close && t.num != TK_error &&
429 if (t.num == TK_close && t.node == start.node)
430 code.len = t.txt.txt - code.txt;
436 Now we have all the bits we need to parse a full production.
441 ###### grammar fields
442 struct production **productions;
443 int production_count;
445 ###### production fields
447 struct symbol **body;
452 int first_production;
455 static char *parse_production(struct grammar *g,
457 struct token_state *state)
459 /* Head has already been parsed. */
462 struct production p, *pp;
464 memset(&p, 0, sizeof(p));
466 tk = token_next(state);
467 while (tk.num == TK_ident || tk.num == TK_mark) {
468 struct symbol *bs = sym_find(g, tk.txt);
469 if (bs->type == Unknown)
471 if (bs->type == Virtual) {
472 err = "Virtual symbol not permitted in production";
475 if (bs->precedence) {
476 p.precedence = bs->precedence;
479 array_add(&p.body, &p.body_size, bs);
480 tk = token_next(state);
482 if (tk.num == TK_virtual) {
484 tk = token_next(state);
485 if (tk.num != TK_ident) {
486 err = "word required after $$";
489 vs = sym_find(g, tk.txt);
490 if (vs->type != Virtual) {
491 err = "symbol after $$ must be virtual";
494 p.precedence = vs->precedence;
496 tk = token_next(state);
498 if (tk.num == TK_open) {
499 p.code = collect_code(state, tk);
500 if (p.code.txt == NULL) {
501 err = "code fragment not closed properly";
504 tk = token_next(state);
506 if (tk.num != TK_newline && tk.num != TK_eof) {
507 err = "stray tokens at end of line";
510 pp = malloc(sizeof(*pp));
512 array_add(&g->productions, &g->production_count, pp);
515 while (tk.num != TK_newline && tk.num != TK_eof)
516 tk = token_next(state);
520 With the ability to parse production and dollar-lines, we have nearly all
521 that we need to parse a grammar from a `code_node`.
523 The head of the first production will effectively be the `start` symbol of
524 the grammar. However it won't _actually_ be so. Processing the grammar is
525 greatly simplified if the real start symbol only has a single production,
526 and expects `$eof` as the final terminal. So when we find the first
527 explicit production we insert an extra production as production zero which
530 ###### Example: production 0
533 where `START` is the first non-terminal given.
535 ###### create production zero
536 struct production *p = calloc(1,sizeof(*p));
537 struct text start = {"$start",6};
538 struct text eof = {"$eof",4};
539 p->head = sym_find(g, start);
540 p->head->type = Nonterminal;
541 array_add(&p->body, &p->body_size, head);
542 array_add(&p->body, &p->body_size, sym_find(g, eof));
543 g->start = p->head->num;
544 p->head->first_production = g->production_count;
545 array_add(&g->productions, &g->production_count, p);
547 Now we are ready to read in the grammar.
549 ###### grammar fields
550 int start; // the 'start' symbol.
553 static struct grammar *grammar_read(struct code_node *code)
555 struct token_config conf = {
558 .words_marks = known,
559 .known_count = sizeof(known)/sizeof(known[0]),
561 .ignored = (1 << TK_line_comment)
562 | (1 << TK_block_comment)
565 | (1 << TK_multi_string)
570 struct token_state *state = token_open(code, &conf);
572 struct symbol *head = NULL;
576 g = calloc(1, sizeof(*g));
579 for (tk = token_next(state); tk.num != TK_eof;
580 tk = token_next(state)) {
581 if (tk.num == TK_newline)
583 if (tk.num == TK_ident) {
585 head = sym_find(g, tk.txt);
586 if (head->type == Nonterminal)
587 err = "This non-terminal has already be used.";
588 else if (head->type == Virtual)
589 err = "Virtual symbol not permitted in head of production";
591 head->type = Nonterminal;
592 head->struct_name = g->current_type;
594 ## create production zero
596 head->first_production = g->production_count;
597 tk = token_next(state);
598 if (tk.num == TK_mark &&
599 text_is(tk.txt, "->"))
600 err = parse_production(g, head, state);
602 err = "'->' missing in production";
604 } else if (tk.num == TK_mark
605 && text_is(tk.txt, "|")) {
606 // another production for same non-term
608 err = parse_production(g, head, state);
610 err = "First production must have a head";
611 } else if (tk.num == TK_mark
612 && text_is(tk.txt, "$")) {
613 err = dollar_line(state, g);
615 err = "Unrecognised token at start of line.";
623 fprintf(stderr, "Error at line %d: %s\n",
630 ## Analysing the grammar
632 The central task in analysing the grammar is to determine a set of
633 states to drive the parsing process. This will require finding
634 various sets of symbols and of "items". Some of these sets will need
635 to attach information to each element in the set, so they are more
638 Each "item" may have a 'look-ahead' or `LA` set associated with
639 it. Many of these will be the same as each other. To avoid memory
640 wastage, and to simplify some comparisons of sets, these sets will be
641 stored in a list of unique sets, each assigned a number.
643 Once we have the data structures in hand to manage these sets and
644 lists, we can start setting the 'nullable' flag, build the 'FIRST'
645 sets, and then create the item sets which define the various states.
649 Though we don't only store symbols in these sets, they are the main
650 things we store, so they are called symbol sets or "symsets".
652 A symset has a size, an array of shorts, and an optional array of data
653 values, which are also shorts. If the array of data is not present,
654 we store an impossible pointer, as `NULL` is used to indicate that no
655 memory has been allocated yet.
660 unsigned short *syms, *data;
662 #define NO_DATA ((unsigned short *)1)
663 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
664 const struct symset INIT_DATASET = { 0, NULL, NULL };
666 The arrays of shorts are allocated in blocks of 8 and are kept sorted
667 by using an insertion sort. We don't explicitly record the amount of
668 allocated space as it can be derived directly from the current `cnt` using
669 `((cnt - 1) | 7) + 1`.
672 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
675 int current = ((s->cnt-1) | 7) + 1;
676 if (current == s->cnt) {
678 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
679 if (s->data != NO_DATA)
680 s->data = realloc(s->data, sizeof(*s->data) * current);
683 while (i > 0 && s->syms[i-1] > key) {
684 s->syms[i] = s->syms[i-1];
685 if (s->data != NO_DATA)
686 s->data[i] = s->data[i-1];
690 if (s->data != NO_DATA)
695 Finding a symbol (or item) in a `symset` uses a simple binary search.
696 We return the index where the value was found (so data can be accessed),
697 or `-1` to indicate failure.
699 static int symset_find(struct symset *ss, unsigned short key)
706 while (lo + 1 < hi) {
707 int mid = (lo + hi) / 2;
708 if (ss->syms[mid] <= key)
713 if (ss->syms[lo] == key)
718 We will often want to form the union of two symsets. When we do, we
719 will often want to know if anything changed (as that might mean there
720 is more work to do). So `symset_union` reports whether anything was
721 added to the first set. We use a slow quadratic approach as these
722 sets don't really get very big. If profiling shows this to be a problem it
723 can be optimised later.
725 static int symset_union(struct symset *a, struct symset *b)
729 for (i = 0; i < b->cnt; i++)
730 if (symset_find(a, b->syms[i]) < 0) {
731 unsigned short data = 0;
732 if (b->data != NO_DATA)
734 symset_add(a, b->syms[i], data);
740 And of course we must be able to free a symset.
742 static void symset_free(struct symset ss)
745 if (ss.data != NO_DATA)
751 Some symsets are simply stored somewhere appropriate in the `grammar`
752 data structure, others need a bit of indirection. This applies
753 particularly to "LA" sets. These will be paired with "items" in an
754 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
755 stored in the `data` field, so we need a mapping from a small (short)
756 number to an LA `symset`.
758 As mentioned earlier this involves creating a list of unique symsets.
760 For now, we just use a linear list sorted by insertion. A skiplist
761 would be more efficient and may be added later.
768 struct setlist *next;
771 ###### grammar fields
772 struct setlist *sets;
777 static int ss_cmp(struct symset a, struct symset b)
780 int diff = a.cnt - b.cnt;
783 for (i = 0; i < a.cnt; i++) {
784 diff = (int)a.syms[i] - (int)b.syms[i];
791 static int save_set(struct grammar *g, struct symset ss)
793 struct setlist **sl = &g->sets;
797 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
804 s = malloc(sizeof(*s));
813 Finding a set by number is currently performed by a simple linear search.
814 If this turns out to hurt performance, we can store the sets in a dynamic
815 array like the productions.
817 static struct symset set_find(struct grammar *g, int num)
819 struct setlist *sl = g->sets;
820 while (sl && sl->num != num)
826 ### Setting `nullable`
828 We set `nullable` on the head symbol for any production for which all
829 the body symbols (if any) are nullable. As this is a recursive
830 definition, any change in the `nullable` setting means that we need to
831 re-evaluate where it needs to be set.
833 We simply loop around performing the same calculations until no more
840 static void set_nullable(struct grammar *g)
843 while (check_again) {
846 for (p = 0; p < g->production_count; p++) {
847 struct production *pr = g->productions[p];
850 if (pr->head->nullable)
852 for (s = 0; s < pr->body_size; s++)
853 if (! pr->body[s]->nullable)
855 if (s == pr->body_size) {
856 pr->head->nullable = 1;
863 ### Setting `can_eol`
865 In order to be able to ignore newline tokens when not relevant, but
866 still include them in the parse when needed, we will need to know
867 which states can start a "line-like" section of code. We ignore
868 newlines when there is an indent since the most recent start of a
871 To know what is line-like, we first need to know which symbols can end
872 a line-like section, which is precisely those which can end with a
873 newline token. These symbols don't necessarily always end with a
874 newline, but they can. Hence they are not described as "lines" but
877 Clearly the `TK_newline` token can end with a newline. Any symbol
878 which is the head of a production that contains a line-ending symbol
879 followed only by nullable symbols is also a line-ending symbol. We
880 use a new field `can_eol` to record this attribute of symbols, and
881 compute it in a repetitive manner similar to `set_nullable`.
887 static void set_can_eol(struct grammar *g)
890 g->symtab[TK_newline]->can_eol = 1;
891 while (check_again) {
894 for (p = 0; p < g->production_count; p++) {
895 struct production *pr = g->productions[p];
898 if (pr->head->can_eol)
901 for (s = pr->body_size - 1; s >= 0; s--) {
902 if (pr->body[s]->can_eol) {
903 pr->head->can_eol = 1;
907 if (!pr->body[s]->nullable)
914 ### Building the `first` sets
916 When calculating what can follow a particular non-terminal, we will need to
917 know what the "first" terminal in any subsequent non-terminal might be. So
918 we calculate the `first` set for every non-terminal and store them in an
919 array. We don't bother recording the "first" set for terminals as they are
922 As the "first" for one symbol might depend on the "first" of another,
923 we repeat the calculations until no changes happen, just like with
924 `set_nullable`. This makes use of the fact that `symset_union`
925 reports if any change happens.
927 The core of this which finds the "first" of part of a production body
928 will be reused for computing the "follow" sets, so we split it out
929 into a separate function.
931 ###### grammar fields
932 struct symset *first;
936 static int add_first(struct production *pr, int start,
937 struct symset *target, struct grammar *g,
942 for (s = start; s < pr->body_size; s++) {
943 struct symbol *bs = pr->body[s];
944 if (bs->type == Terminal) {
945 if (symset_find(target, bs->num) < 0) {
946 symset_add(target, bs->num, 0);
950 } else if (symset_union(target, &g->first[bs->num]))
956 *to_end = (s == pr->body_size);
960 static void build_first(struct grammar *g)
964 g->first = calloc(g->num_syms, sizeof(g->first[0]));
965 for (s = 0; s < g->num_syms; s++)
966 g->first[s] = INIT_SYMSET;
968 while (check_again) {
971 for (p = 0; p < g->production_count; p++) {
972 struct production *pr = g->productions[p];
973 struct symset *head = &g->first[pr->head->num];
975 if (add_first(pr, 0, head, g, NULL))
981 ### Building the `follow` sets.
983 There are two different situations when we will want to generate "follow"
984 sets. If we are doing an SLR analysis, we want to generate a single
985 "follow" set for each non-terminal in the grammar. That is what is
986 happening here. If we are doing an LALR or LR analysis we will want
987 to generate a separate "LA" set for each item. We do that later
990 There are two parts to generating a "follow" set. Firstly we look at
991 every place that any non-terminal appears in the body of any
992 production, and we find the set of possible "first" symbols after
993 there. This is done using `add_first` above and only needs to be done
994 once as the "first" sets are now stable and will not change.
998 for (p = 0; p < g->production_count; p++) {
999 struct production *pr = g->productions[p];
1002 for (b = 0; b < pr->body_size - 1; b++) {
1003 struct symbol *bs = pr->body[b];
1004 if (bs->type == Terminal)
1006 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1010 The second part is to add the "follow" set of the head of a production
1011 to the "follow" sets of the final symbol in the production, and any
1012 other symbol which is followed only by `nullable` symbols. As this
1013 depends on "follow" itself we need to repeatedly perform the process
1014 until no further changes happen.
1018 for (again = 0, p = 0;
1019 p < g->production_count;
1020 p < g->production_count-1
1021 ? p++ : again ? (again = 0, p = 0)
1023 struct production *pr = g->productions[p];
1026 for (b = pr->body_size - 1; b >= 0; b--) {
1027 struct symbol *bs = pr->body[b];
1028 if (bs->type == Terminal)
1030 if (symset_union(&g->follow[bs->num],
1031 &g->follow[pr->head->num]))
1038 We now just need to create and initialise the `follow` list to get a
1041 ###### grammar fields
1042 struct symset *follow;
1045 static void build_follow(struct grammar *g)
1048 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1049 for (s = 0; s < g->num_syms; s++)
1050 g->follow[s] = INIT_SYMSET;
1054 ### Building itemsets and states
1056 There are three different levels of detail that can be chosen for
1057 building the itemsets and states for the LR grammar. They are:
1059 1. LR(0) or SLR(1), where no look-ahead is considered.
1060 2. LALR(1) where we build look-ahead sets with each item and merge
1061 the LA sets when we find two paths to the same "kernel" set of items.
1062 3. LR(1) where different look-ahead for any item in the set means
1063 a different state must be created.
1065 ###### forward declarations
1066 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1068 We need to be able to look through existing states to see if a newly
1069 generated state already exists. For now we use a simple sorted linked
1072 An item is a pair of numbers: the production number and the position of
1073 "DOT", which is an index into the body of the production.
1074 As the numbers are not enormous we can combine them into a single "short"
1075 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1076 production number (so 4000 productions with maximum of 15 symbols in the
1079 Comparing the itemsets will be a little different to comparing symsets
1080 as we want to do the lookup after generating the "kernel" of an
1081 itemset, so we need to ignore the offset=zero items which are added during
1084 To facilitate this, we modify the "DOT" number so that "0" sorts to
1085 the end of the list in the symset, and then only compare items before
1089 static inline unsigned short item_num(int production, int index)
1091 return production | ((31-index) << 11);
1093 static inline int item_prod(unsigned short item)
1095 return item & 0x7ff;
1097 static inline int item_index(unsigned short item)
1099 return (31-(item >> 11)) & 0x1f;
1102 For LR(1) analysis we need to compare not just the itemset in a state
1103 but also the LA sets. As we assign each unique LA set a number, we
1104 can just compare the symset and the data values together.
1107 static int itemset_cmp(struct symset a, struct symset b,
1108 enum grammar_type type)
1114 i < a.cnt && i < b.cnt &&
1115 item_index(a.syms[i]) > 0 &&
1116 item_index(b.syms[i]) > 0;
1118 int diff = a.syms[i] - b.syms[i];
1122 diff = a.data[i] - b.data[i];
1127 if (i == a.cnt || item_index(a.syms[i]) == 0)
1131 if (i == b.cnt || item_index(b.syms[i]) == 0)
1137 if (type < LR1 || av == -1)
1140 a.data[i] - b.data[i];
1143 And now we can build the list of itemsets. The lookup routine returns
1144 both a success flag and a pointer to where in the list an insert
1145 should happen, so we don't need to search a second time.
1149 struct itemset *next;
1151 struct symset items;
1152 struct symset go_to;
1157 ###### grammar fields
1158 struct itemset *items;
1162 static int itemset_find(struct grammar *g, struct itemset ***where,
1163 struct symset kernel, enum grammar_type type)
1165 struct itemset **ip;
1167 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1168 struct itemset *i = *ip;
1170 diff = itemset_cmp(i->items, kernel, type);
1183 Adding an itemset may require merging the LA sets if LALR analysis is
1184 happening. If any new LA set adds symbols that weren't in the old LA set, we
1185 clear the `completed` flag so that the dependants of this itemset will be
1186 recalculated and their LA sets updated.
1188 `add_itemset` must consume the symsets it is passed, either by adding
1189 them to a data structure, or freeing them.
1191 static int add_itemset(struct grammar *g, struct symset ss,
1192 enum grammar_type type, int starts_line)
1194 struct itemset **where, *is;
1196 int found = itemset_find(g, &where, ss, type);
1198 is = calloc(1, sizeof(*is));
1199 is->state = g->states;
1203 is->go_to = INIT_DATASET;
1204 is->starts_line = starts_line;
1213 for (i = 0; i < ss.cnt; i++) {
1214 struct symset temp = INIT_SYMSET, s;
1215 if (ss.data[i] == is->items.data[i])
1217 s = set_find(g, is->items.data[i]);
1218 symset_union(&temp, &s);
1219 s = set_find(g, ss.data[i]);
1220 if (symset_union(&temp, &s)) {
1221 is->items.data[i] = save_set(g, temp);
1232 To build all the itemsets, we first insert the initial itemset made from the
1233 start symbol, complete each itemset, and then generate new itemsets from old
1234 until no new ones can be made.
1236 Completing an itemset means finding all the items where "DOT" is followed by
1237 a nonterminal and adding "DOT=0" items for every production from that
1238 non-terminal - providing each item hasn't already been added.
1240 If LA sets are needed, the LA set for each new item is found using
1241 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1242 be supplemented by the LA set for the item which produces the new item.
1244 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1245 is used in the next stage.
1247 NOTE: precedence handling should happen here - I haven't written this yet
1250 ###### complete itemset
1251 for (i = 0; i < is->items.cnt; i++) {
1252 int p = item_prod(is->items.syms[i]);
1253 int bs = item_index(is->items.syms[i]);
1254 struct production *pr = g->productions[p];
1257 struct symset LA = INIT_SYMSET;
1258 unsigned short sn = 0;
1260 if (bs == pr->body_size)
1263 if (symset_find(&done, s->num) < 0)
1264 symset_add(&done, s->num, 0);
1265 if (s->type != Nonterminal)
1271 add_first(pr, bs+1, &LA, g, &to_end);
1273 struct symset ss = set_find(g, is->items.data[i]);
1274 symset_union(&LA, &ss);
1276 sn = save_set(g, LA);
1277 LA = set_find(g, sn);
1280 /* Add productions for this symbol */
1281 for (p2 = s->first_production;
1282 p2 < g->production_count &&
1283 g->productions[p2]->head == s;
1285 int itm = item_num(p2, 0);
1286 int pos = symset_find(&is->items, itm);
1288 symset_add(&is->items, itm, sn);
1289 /* Will have re-ordered, so start
1290 * from beginning again */
1292 } else if (type >= LALR) {
1293 struct symset ss = set_find(g, is->items.data[pos]);
1294 struct symset tmp = INIT_SYMSET;
1296 symset_union(&tmp, &ss);
1297 if (symset_union(&tmp, &LA)) {
1298 is->items.data[pos] = save_set(g, tmp);
1306 For each symbol we found (and placed in `done`) we collect all the items for
1307 which this symbol is next, and create a new itemset with "DOT" advanced over
1308 the symbol. This is then added to the collection of itemsets (or merged
1309 with a pre-existing itemset).
1311 ###### derive itemsets
1312 // Now we have a completed itemset, so we need to
1313 // compute all the 'next' itemsets and create them
1314 // if they don't exist.
1315 for (i = 0; i < done.cnt; i++) {
1317 unsigned short state;
1318 int starts_line = 0;
1319 struct symbol *sym = g->symtab[done.syms[i]];
1320 struct symset newitemset = INIT_SYMSET;
1322 newitemset = INIT_DATASET;
1325 (sym->nullable && is->starts_line))
1327 for (j = 0; j < is->items.cnt; j++) {
1328 int itm = is->items.syms[j];
1329 int p = item_prod(itm);
1330 int bp = item_index(itm);
1331 struct production *pr = g->productions[p];
1332 unsigned short la = 0;
1335 if (bp == pr->body_size)
1337 if (pr->body[bp] != sym)
1340 la = is->items.data[j];
1341 pos = symset_find(&newitemset, pr->head->num);
1343 symset_add(&newitemset, item_num(p, bp+1), la);
1344 else if (type >= LALR) {
1345 // Need to merge la set.
1346 int la2 = newitemset.data[pos];
1348 struct symset ss = set_find(g, la2);
1349 struct symset LA = INIT_SYMSET;
1350 symset_union(&LA, &ss);
1351 ss = set_find(g, la);
1352 if (symset_union(&LA, &ss))
1353 newitemset.data[pos] = save_set(g, LA);
1359 state = add_itemset(g, newitemset, type, starts_line);
1360 if (symset_find(&is->go_to, done.syms[i]) < 0)
1361 symset_add(&is->go_to, done.syms[i], state);
1364 All that is left is to create the initial itemset from production zero, and
1365 with `TK_eof` as the LA set.
1368 static void build_itemsets(struct grammar *g, enum grammar_type type)
1370 struct symset first = INIT_SYMSET;
1373 unsigned short la = 0;
1375 // LA set just has eof
1376 struct symset eof = INIT_SYMSET;
1377 symset_add(&eof, TK_eof, 0);
1378 la = save_set(g, eof);
1379 first = INIT_DATASET;
1381 // production 0, offset 0 (with no data)
1382 symset_add(&first, item_num(0, 0), la);
1383 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1384 for (again = 0, is = g->items;
1386 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1388 struct symset done = INIT_SYMSET;
1399 ### Completing the analysis.
1401 The exact process of analysis depends on which level was requested. For
1402 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1403 `LR1` we need first, but not follow. For `SLR` we need both.
1405 We don't build the "action" tables that you might expect as the parser
1406 doesn't use them. They will be calculated to some extent if needed for
1409 Once we have built everything we allocate arrays for the two lists:
1410 symbols and itemsets. This allows more efficient access during reporting.
1411 The symbols are grouped as terminals and non-terminals and we record the
1412 changeover point in `first_nonterm`.
1414 ###### grammar fields
1415 struct symbol **symtab;
1416 struct itemset **statetab;
1419 ###### grammar_analyse
1421 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1425 int snum = TK_reserved;
1426 for (s = g->syms; s; s = s->next)
1427 if (s->num < 0 && s->type == Terminal) {
1431 g->first_nonterm = snum;
1432 for (s = g->syms; s; s = s->next)
1438 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1439 for (s = g->syms; s; s = s->next)
1440 g->symtab[s->num] = s;
1450 build_itemsets(g, type);
1452 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1453 for (is = g->items; is ; is = is->next)
1454 g->statetab[is->state] = is;
1457 ## Reporting on the Grammar
1459 The purpose of the report is to give the grammar developer insight into
1460 how the grammar parser will work. It is basically a structured dump of
1461 all the tables that have been generated, plus a description of any conflicts.
1463 ###### grammar_report
1464 static int grammar_report(struct grammar *g, enum grammar_type type)
1470 return report_conflicts(g, type);
1473 Firstly we have the complete list of symbols, together with the "FIRST"
1474 set if that was generated.
1478 static void report_symbols(struct grammar *g)
1482 printf("SYMBOLS + FIRST:\n");
1484 printf("SYMBOLS:\n");
1486 for (n = 0; n < g->num_syms; n++) {
1487 struct symbol *s = g->symtab[n];
1491 printf(" %c%c%3d%c: ",
1492 s->nullable ? '.':' ',
1493 s->can_eol ? '>':' ',
1494 s->num, symtypes[s->type]);
1497 printf(" (%d%s)", s->precedence,
1498 assoc_names[s->assoc]);
1500 if (g->first && s->type == Nonterminal) {
1503 for (j = 0; j < g->first[n].cnt; j++) {
1506 prtxt(g->symtab[g->first[n].syms[j]]->name);
1513 Then we have the follow sets, if they were computed.
1515 static void report_follow(struct grammar *g)
1518 printf("FOLLOW:\n");
1519 for (n = 0; n < g->num_syms; n++)
1520 if (g->follow[n].cnt) {
1524 prtxt(g->symtab[n]->name);
1525 for (j = 0; j < g->follow[n].cnt; j++) {
1528 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1534 And finally the item sets. These include the GO TO tables and, for
1535 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1536 it up a bit. First the items, with production number and associativity.
1538 static void report_item(struct grammar *g, int itm)
1540 int p = item_prod(itm);
1541 int dot = item_index(itm);
1542 struct production *pr = g->productions[p];
1546 prtxt(pr->head->name);
1548 for (i = 0; i < pr->body_size; i++) {
1549 printf(" %s", (dot == i ? ". ": ""));
1550 prtxt(pr->body[i]->name);
1552 if (dot == pr->body_size)
1556 printf(" (%d%s)", pr->precedence,
1557 assoc_names[pr->assoc]);
1561 The LA sets which are (possibly) reported with each item:
1563 static void report_la(struct grammar *g, int lanum)
1565 struct symset la = set_find(g, lanum);
1569 printf(" LOOK AHEAD(%d)", lanum);
1570 for (i = 0; i < la.cnt; i++) {
1573 prtxt(g->symtab[la.syms[i]]->name);
1578 Then the go to sets:
1581 static void report_goto(struct grammar *g, struct symset gt)
1586 for (i = 0; i < gt.cnt; i++) {
1588 prtxt(g->symtab[gt.syms[i]]->name);
1589 printf(" -> %d\n", gt.data[i]);
1593 Now we can report all the item sets complete with items, LA sets, and GO TO.
1595 static void report_itemsets(struct grammar *g)
1598 printf("ITEM SETS(%d)\n", g->states);
1599 for (s = 0; s < g->states; s++) {
1601 struct itemset *is = g->statetab[s];
1602 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1603 for (j = 0; j < is->items.cnt; j++) {
1604 report_item(g, is->items.syms[j]);
1605 if (is->items.data != NO_DATA)
1606 report_la(g, is->items.data[j]);
1608 report_goto(g, is->go_to);
1612 ### Reporting conflicts
1614 Conflict detection varies a lot among different analysis levels. However
1615 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1616 LR1 are also similar as they have FOLLOW or LA sets.
1620 ## conflict functions
1622 static int report_conflicts(struct grammar *g, enum grammar_type type)
1625 printf("Conflicts:\n");
1627 cnt = conflicts_lr0(g, type);
1629 cnt = conflicts_slr(g, type);
1631 printf(" - no conflicts\n");
1635 LR0 conflicts occur in any state which has both a reducible item and
1638 LR05 conflicts only occur if two possible reductions exist,
1639 as shifts always over-ride reductions.
1641 ###### conflict functions
1642 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1646 for (i = 0; i < g->states; i++) {
1647 struct itemset *is = g->statetab[i];
1648 int last_reduce = -1;
1649 int prev_reduce = -1;
1650 int last_shift = -1;
1654 for (j = 0; j < is->items.cnt; j++) {
1655 int itm = is->items.syms[j];
1656 int p = item_prod(itm);
1657 int bp = item_index(itm);
1658 struct production *pr = g->productions[p];
1660 if (bp == pr->body_size) {
1661 prev_reduce = last_reduce;
1665 if (pr->body[bp]->type == Terminal)
1668 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1669 printf(" State %d has both SHIFT and REDUCE:\n", i);
1670 report_item(g, is->items.syms[last_shift]);
1671 report_item(g, is->items.syms[last_reduce]);
1674 if (prev_reduce >= 0) {
1675 printf(" State %d has 2 (or more) reducible items\n", i);
1676 report_item(g, is->items.syms[prev_reduce]);
1677 report_item(g, is->items.syms[last_reduce]);
1684 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1685 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1686 in the source of the look ahead set.
1688 We build a dataset mapping terminal to item for possible SHIFTs and then
1689 another for possible REDUCE operations. We report when we get conflicts
1692 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1697 for (i = 0; i < g->states; i++) {
1698 struct itemset *is = g->statetab[i];
1699 struct symset shifts = INIT_DATASET;
1700 struct symset reduce = INIT_DATASET;
1704 /* First collect the shifts */
1705 for (j = 0; j < is->items.cnt; j++) {
1706 unsigned short itm = is->items.syms[j];
1707 int p = item_prod(itm);
1708 int bp = item_index(itm);
1709 struct production *pr = g->productions[p];
1711 if (bp < pr->body_size &&
1712 pr->body[bp]->type == Terminal) {
1714 int sym = pr->body[bp]->num;
1715 if (symset_find(&shifts, sym) < 0)
1716 symset_add(&shifts, sym, itm);
1719 /* Now look for reduction and conflicts */
1720 for (j = 0; j < is->items.cnt; j++) {
1721 unsigned short itm = is->items.syms[j];
1722 int p = item_prod(itm);
1723 int bp = item_index(itm);
1724 struct production *pr = g->productions[p];
1726 if (bp < pr->body_size)
1731 la = g->follow[pr->head->num];
1733 la = set_find(g, is->items.data[j]);
1735 for (k = 0; k < la.cnt; k++) {
1736 int pos = symset_find(&shifts, la.syms[k]);
1738 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1739 prtxt(g->symtab[la.syms[k]]->name);
1741 report_item(g, shifts.data[pos]);
1742 report_item(g, itm);
1745 pos = symset_find(&reduce, la.syms[k]);
1747 symset_add(&reduce, la.syms[k], itm);
1750 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1751 prtxt(g->symtab[la.syms[k]]->name);
1753 report_item(g, itm);
1754 report_item(g, reduce.data[pos]);
1758 symset_free(shifts);
1759 symset_free(reduce);
1765 ## Generating the parser
1767 The exported part of the parser is the `parse_XX` function, where the name
1768 `XX` is based on the name of the parser files.
1770 This takes a `code_node`, a partially initialized `token_config`, and an
1771 optional `FILE` to send tracing to. The `token_config` gets the list of
1772 known words added and then is used with the `code_node` to initialize the
1775 `parse_XX` then calls the library function `parser_run` to actually complete
1776 the parse. This needs the `states` table and function to call the various
1777 pieces of code provided in the grammar file, so they are generated first.
1779 ###### parser_generate
1781 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1787 gen_reduce(f, g, file);
1790 fprintf(f, "#line 0 \"gen_parser\"\n");
1791 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1794 fprintf(f, "\tstruct token_state *tokens;\n");
1795 fprintf(f, "\tconfig->words_marks = known;\n");
1796 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1797 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1798 fprintf(f, "\ttokens = token_open(code, config);\n");
1799 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1800 fprintf(f, "\ttoken_close(tokens);\n");
1801 fprintf(f, "\treturn rv;\n");
1802 fprintf(f, "}\n\n");
1805 ### Table words table
1807 The table of known words is simply an array of terminal symbols.
1808 The table of nonterminals used for tracing is a similar array.
1812 static void gen_known(FILE *f, struct grammar *g)
1815 fprintf(f, "#line 0 \"gen_known\"\n");
1816 fprintf(f, "static const char *known[] = {\n");
1817 for (i = TK_reserved;
1818 i < g->num_syms && g->symtab[i]->type == Terminal;
1820 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1821 g->symtab[i]->name.txt);
1822 fprintf(f, "};\n\n");
1825 static void gen_non_term(FILE *f, struct grammar *g)
1828 fprintf(f, "#line 0 \"gen_non_term\"\n");
1829 fprintf(f, "static const char *non_term[] = {\n");
1830 for (i = TK_reserved;
1833 if (g->symtab[i]->type == Nonterminal)
1834 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1835 g->symtab[i]->name.txt);
1836 fprintf(f, "};\n\n");
1839 ### States and the goto tables.
1841 For each state we record the goto table, the reducible production if
1842 there is one, or a symbol to shift for error recovery.
1843 Some of the details of the reducible production are stored in the
1844 `do_reduce` function to come later. Here we store the production number,
1845 the body size (useful for stack management) and the resulting symbol (useful
1846 for knowing how to free data later).
1848 The go to table is stored in a simple array of `sym` and corresponding
1851 ###### exported types
1859 const struct lookup * go_to;
1870 static void gen_goto(FILE *f, struct grammar *g)
1873 fprintf(f, "#line 0 \"gen_goto\"\n");
1874 for (i = 0; i < g->states; i++) {
1876 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1878 struct symset gt = g->statetab[i]->go_to;
1879 for (j = 0; j < gt.cnt; j++)
1880 fprintf(f, "\t{ %d, %d },\n",
1881 gt.syms[j], gt.data[j]);
1888 static void gen_states(FILE *f, struct grammar *g)
1891 fprintf(f, "#line 0 \"gen_states\"\n");
1892 fprintf(f, "static const struct state states[] = {\n");
1893 for (i = 0; i < g->states; i++) {
1894 struct itemset *is = g->statetab[i];
1895 int j, prod = -1, prod_len;
1897 int shift_len = 0, shift_remain = 0;
1898 for (j = 0; j < is->items.cnt; j++) {
1899 int itm = is->items.syms[j];
1900 int p = item_prod(itm);
1901 int bp = item_index(itm);
1902 struct production *pr = g->productions[p];
1904 if (bp < pr->body_size) {
1905 if (shift_sym < 0 ||
1906 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1907 shift_sym = pr->body[bp]->num;
1909 shift_remain = pr->body_size - bp;
1913 /* This is what we reduce */
1914 if (prod < 0 || prod_len < pr->body_size) {
1916 prod_len = pr->body_size;
1921 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1922 i, is->go_to.cnt, i, prod,
1923 g->productions[prod]->body_size,
1924 g->productions[prod]->head->num,
1927 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1928 i, is->go_to.cnt, i, shift_sym,
1931 fprintf(f, "};\n\n");
1934 ### The `do_reduce` function and the code
1936 When the parser engine decides to reduce a production, it calls `do_reduce`.
1937 This has two functions.
1939 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1940 production being reduced. Secondly it runs the code that was included with
1941 the production if any.
1943 This code needs to be able to store data somewhere. Rather than requiring
1944 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1945 `do_reduce` return the size to be saved.
1947 The code fragment requires translation when written out. Any `$N` needs to
1948 be converted to a reference either to that buffer (if `$0`) or to the
1949 structure returned by a previous reduction. These pointer need to be cast
1950 to the appropriate type for each access. All this is handled in
1956 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1959 fprintf(f, "\t\t\t");
1960 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1969 if (*c < '0' || *c > '9') {
1974 while (c[1] >= '0' && c[1] <= '9') {
1976 n = n * 10 + *c - '0';
1979 fprintf(f, "(*(struct %.*s*)ret)",
1980 p->head->struct_name.len,
1981 p->head->struct_name.txt);
1982 else if (n > p->body_size)
1983 fprintf(f, "$%d", n);
1984 else if (p->body[n-1]->type == Terminal)
1985 fprintf(f, "(*(struct token *)body[%d])",
1987 else if (p->body[n-1]->struct_name.txt == NULL)
1988 fprintf(f, "$%d", n);
1990 fprintf(f, "(*(struct %.*s*)body[%d])",
1991 p->body[n-1]->struct_name.len,
1992 p->body[n-1]->struct_name.txt, n-1);
1999 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2002 fprintf(f, "#line 0 \"gen_reduce\"\n");
2003 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2005 fprintf(f, "\tint ret_size = 0;\n");
2007 fprintf(f, "\tswitch(prod) {\n");
2008 for (i = 0; i < g->production_count; i++) {
2009 struct production *p = g->productions[i];
2010 fprintf(f, "\tcase %d:\n", i);
2015 if (p->head->struct_name.txt)
2016 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
2017 p->head->struct_name.len,
2018 p->head->struct_name.txt);
2020 fprintf(f, "\t\tbreak;\n");
2022 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2027 As each non-terminal can potentially cause a different type of data
2028 structure to be allocated and filled in, we need to be able to free it when
2031 It is particularly important to have fine control over freeing during error
2032 recovery where individual stack frames might need to be freed.
2034 For this, the grammar author is required to define a `free_XX` function for
2035 each structure that is used by a non-terminal. `do_free` will call whichever
2036 is appropriate given a symbol number, and will call `free` (as is
2037 appropriate for tokens) on any terminal symbol.
2041 static void gen_free(FILE *f, struct grammar *g)
2045 fprintf(f, "#line 0 \"gen_free\"\n");
2046 fprintf(f, "static void do_free(short sym, void *asn)\n");
2048 fprintf(f, "\tif (!asn) return;\n");
2049 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2050 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2051 fprintf(f, "\tswitch(sym) {\n");
2053 for (i = 0; i < g->num_syms; i++) {
2054 struct symbol *s = g->symtab[i];
2056 s->type != Nonterminal ||
2057 s->struct_name.txt == NULL)
2060 fprintf(f, "\tcase %d:\n", s->num);
2061 fprintf(f, "\t\tfree_%.*s(asn);\n",
2063 s->struct_name.txt);
2064 fprintf(f, "\t\tbreak;\n");
2066 fprintf(f, "\t}\n}\n\n");
2069 ## The main routine.
2071 There are three key parts to the "main" function of parsergen: processing
2072 the arguments, loading the grammar file, and dealing with the grammar.
2074 ### Argument processing.
2076 Command line options allow the selection of analysis mode, name of output
2077 file, and whether or not a report should be generated. By default we create
2078 a report only if no code output was requested.
2080 The `parse_XX` function name uses the basename of the output file which
2081 should not have a suffix (`.c`). `.c` is added to the given name for the
2082 code, and `.h` is added for the header (if header text is specified in the
2089 static const struct option long_options[] = {
2090 { "LR0", 0, NULL, '0' },
2091 { "LR05", 0, NULL, '5' },
2092 { "SLR", 0, NULL, 'S' },
2093 { "LALR", 0, NULL, 'L' },
2094 { "LR1", 0, NULL, '1' },
2095 { "tag", 1, NULL, 't' },
2096 { "report", 0, NULL, 'R' },
2097 { "output", 1, NULL, 'o' },
2098 { NULL, 0, NULL, 0 }
2100 const char *options = "05SL1t:Ro:";
2102 ###### process arguments
2104 char *outfile = NULL;
2109 enum grammar_type type = LR05;
2110 while ((opt = getopt_long(argc, argv, options,
2111 long_options, NULL)) != -1) {
2126 outfile = optarg; break;
2128 tag = optarg; break;
2130 fprintf(stderr, "Usage: parsergen ...\n");
2135 infile = argv[optind++];
2137 fprintf(stderr, "No input file given\n");
2140 if (outfile && report == 1)
2143 if (name && strchr(name, '/'))
2144 name = strrchr(name, '/')+1;
2146 if (optind < argc) {
2147 fprintf(stderr, "Excess command line arguments\n");
2151 ### Loading the grammar file
2153 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2156 Once we have extracted the code (with `mdcode`) we expect to find three
2157 sections: header, code, and grammar. Anything else is an error.
2159 "header" and "code" are optional, though it is hard to build a working
2160 parser with neither. "grammar" must be provided.
2164 #include <sys/mman.h>
2169 static void pr_err(char *msg)
2172 fprintf(stderr, "%s\n", msg);
2176 struct section *table;
2180 fd = open(infile, O_RDONLY);
2182 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2183 infile, strerror(errno));
2186 len = lseek(fd, 0, 2);
2187 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2188 table = code_extract(file, file+len, pr_err);
2190 struct code_node *hdr = NULL;
2191 struct code_node *code = NULL;
2192 struct code_node *gram = NULL;
2193 for (s = table; s; s = s->next) {
2194 struct text sec = s->section;
2195 if (tag && !strip_tag(&sec, tag))
2197 if (text_is(sec, "header"))
2199 else if (text_is(sec, "code"))
2201 else if (text_is(sec, "grammar"))
2204 fprintf(stderr, "Unknown content section: %.*s\n",
2205 s->section.len, s->section.txt);
2210 ### Processing the input
2212 As we need to append an extension to a filename and then open it for
2213 writing, and we need to do this twice, it helps to have a separate function.
2217 static FILE *open_ext(char *base, char *ext)
2219 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2221 strcat(strcpy(fn, base), ext);
2227 If we can read the grammar, then we analyse and optionally report on it. If
2228 the report finds conflicts we will exit with an error status.
2230 ###### process input
2231 struct grammar *g = NULL;
2233 fprintf(stderr, "No grammar section provided\n");
2236 g = grammar_read(gram);
2238 fprintf(stderr, "Failure to parse grammar\n");
2243 grammar_analyse(g, type);
2245 if (grammar_report(g, type))
2249 If a headers section is defined, we write it out and include a declaration
2250 for the `parse_XX` function so it can be used from separate code.
2252 if (rv == 0 && hdr && outfile) {
2253 FILE *f = open_ext(outfile, ".h");
2255 code_node_print(f, hdr, infile);
2256 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2260 fprintf(stderr, "Cannot create %s.h\n",
2266 And if all goes well and an output file was provided, we create the `.c`
2267 file with the code section (if any) and the parser tables and function.
2269 if (rv == 0 && outfile) {
2270 FILE *f = open_ext(outfile, ".c");
2273 code_node_print(f, code, infile);
2274 gen_parser(f, g, infile, name);
2277 fprintf(stderr, "Cannot create %s.c\n",
2283 And that about wraps it up. We need to set the locale so that UTF-8 is
2284 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2286 ###### File: parsergen.mk
2287 parsergen : parsergen.o libscanner.o libmdcode.o
2288 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2295 int main(int argc, char *argv[])
2300 setlocale(LC_ALL,"");
2302 ## process arguments
2309 ## The SHIFT/REDUCE parser
2311 Having analysed the grammar and generated all the table, we only need the
2312 shift/reduce engine to bring it all together.
2314 ### Goto table lookup
2316 The parser generator has nicely provided us with goto tables sorted by
2317 symbol number. We need a binary search function to find a symbol in the
2320 ###### parser functions
2322 static int search(const struct state *l, int sym)
2325 int hi = l->go_to_cnt;
2329 while (lo + 1 < hi) {
2330 int mid = (lo + hi) / 2;
2331 if (l->go_to[mid].sym <= sym)
2336 if (l->go_to[lo].sym == sym)
2337 return l->go_to[lo].state;
2342 ### The state stack.
2344 The core data structure for the parser is the stack. This tracks all the
2345 symbols that have been recognised or partially recognised.
2347 The stack usually won't grow very large - maybe a few tens of entries. So
2348 we dynamically resize an array as required but never bother to shrink it
2351 We keep the stack as two separate allocations. One, `asn_stack` stores the
2352 "abstract syntax nodes" which are created by each reduction. When we call
2353 `do_reduce` we need to pass an array of the `asn`s of the body of the
2354 production, and by keeping a separate `asn` stack, we can just pass a
2355 pointer into this stack.
2357 The other allocation stores all other stack fields of which there are four.
2358 The `state` is the most important one and guides the parsing process. The
2359 `sym` is nearly unnecessary. However when we want to free entries from the
2360 `asn_stack`, it helps to know what type they are so we can call the right
2361 freeing function. The symbol leads us to the right free function through
2364 The `indents` count and the `starts_indented` flag track the line
2365 indents in the symbol. These are used to allow indent information to
2366 guide parsing and error recovery.
2368 As well as the stack of frames we have a `next` frame which is
2369 assembled from the incoming token and other information prior to
2370 pushing it onto the stack.
2372 ###### parser functions
2378 short starts_indented;
2380 short newline_permitted;
2389 Two operations are needed on the stack - shift (which is like push) and pop.
2391 Shift applies not only to terminals but also to non-terminals. When we
2392 reduce a production we will pop off entries corresponding to the body
2393 symbols, then push on an item for the head of the production. This last is
2394 exactly the same process as shifting in a terminal so we use the same
2397 To simplify other code we arrange for `shift` to fail if there is no `goto`
2398 state for the symbol. This is useful in basic parsing due to our design
2399 that we shift when we can, and reduce when we cannot. So the `shift`
2400 function reports if it could.
2402 So `shift` finds the next state. If that succeeds it extends the allocations
2403 if needed and pushes all the information onto the stacks.
2405 ###### parser functions
2407 static int shift(struct parser *p,
2409 const struct state states[])
2411 // Push an entry onto the stack
2412 int newstate = search(&states[p->next.state], p->next.sym);
2415 if (p->tos >= p->stack_size) {
2416 p->stack_size += 10;
2417 p->stack = realloc(p->stack, p->stack_size
2418 * sizeof(p->stack[0]));
2419 p->asn_stack = realloc(p->asn_stack, p->stack_size
2420 * sizeof(p->asn_stack[0]));
2422 p->stack[p->tos] = p->next;
2423 p->asn_stack[p->tos] = asn;
2425 p->next.state = newstate;
2426 p->next.indents = 0;
2427 p->next.starts_indented = 0;
2428 // if new state doesn't start a line, we inherit newline_permitted status
2429 if (states[newstate].starts_line)
2430 p->next.newline_permitted = 1;
2434 `pop` simply moves the top of stack (`tos`) back down the required amount
2435 and frees any `asn` entries that need to be freed. It is called _after_ we
2436 reduce a production, just before we `shift` the nonterminal in.
2438 ###### parser functions
2440 static void pop(struct parser *p, int num,
2441 void(*do_free)(short sym, void *asn))
2445 for (i = 0; i < num; i++) {
2446 p->next.indents += p->stack[p->tos+i].indents;
2447 do_free(p->stack[p->tos+i].sym,
2448 p->asn_stack[p->tos+i]);
2452 p->next.state = p->stack[p->tos].state;
2453 p->next.starts_indented = p->stack[p->tos].starts_indented;
2454 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2455 if (p->next.indents > p->next.starts_indented)
2456 p->next.newline_permitted = 0;
2460 ### Memory allocation
2462 The `scanner` returns tokens in a local variable - we want them in allocated
2463 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2464 a reduce is in a large buffer. Both of these require some allocation and
2465 copying, hence `memdup` and `tokcopy`.
2467 ###### parser includes
2470 ###### parser functions
2472 void *memdup(void *m, int len)
2478 memcpy(ret, m, len);
2482 static struct token *tok_copy(struct token tk)
2484 struct token *new = malloc(sizeof(*new));
2489 ### The heart of the parser.
2491 Now we have the parser. If we can shift, we do. If not and we can reduce
2492 we do. If the production we reduced was production zero, then we have
2493 accepted the input and can finish.
2495 We return whatever `asn` was returned by reducing production zero.
2497 If we can neither shift nor reduce we have an error to handle. We pop
2498 single entries off the stack until we can shift the `TK_error` symbol, then
2499 drop input tokens until we find one we can shift into the new error state.
2501 When we find `TK_in` and `TK_out` tokens which report indents we need
2502 to handle them directly as the grammar cannot express what we want to
2505 `TK_in` tokens are easy: we simply update the `next` stack frame to
2506 record how many indents there are and that the next token started with
2509 `TK_out` tokens must either be counted off against any pending indent,
2510 or must force reductions until there is a pending indent which isn't
2511 at the start of a production.
2513 ###### parser includes
2516 void *parser_run(struct token_state *tokens,
2517 const struct state states[],
2518 int (*do_reduce)(int, void**, void*),
2519 void (*do_free)(short, void*),
2520 FILE *trace, const char *non_term[], int knowns)
2522 struct parser p = { 0 };
2523 struct token *tk = NULL;
2527 p.next.newline_permitted = states[0].starts_line;
2529 struct token *err_tk;
2531 tk = tok_copy(token_next(tokens));
2532 p.next.sym = tk->num;
2534 parser_trace(trace, &p, tk, states, non_term, knowns);
2536 if (p.next.sym == TK_in) {
2537 p.next.starts_indented = 1;
2543 if (p.next.sym == TK_out) {
2544 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2545 (p.stack[p.tos-1].indents == 1 &&
2546 states[p.next.state].reduce_size > 1)) {
2547 p.stack[p.tos-1].indents -= 1;
2548 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2549 // no internal indent any more, reassess 'newline_permitted'
2550 if (states[p.stack[p.tos-1].state].starts_line)
2551 p.stack[p.tos-1].newline_permitted = 1;
2553 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2559 // fall through and force a REDUCE (as 'shift'
2562 if (p.next.sym == TK_newline) {
2563 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2569 if (shift(&p, tk, states)) {
2573 if (states[p.next.state].reduce_prod >= 0) {
2575 int prod = states[p.next.state].reduce_prod;
2576 int size = states[p.next.state].reduce_size;
2578 static char buf[16*1024];
2579 p.next.sym = states[p.next.state].reduce_sym;
2581 body = p.asn_stack +
2582 (p.tos - states[p.next.state].reduce_size);
2584 bufsize = do_reduce(prod, body, buf);
2586 pop(&p, size, do_free);
2587 shift(&p, memdup(buf, bufsize), states);
2592 if (tk->num == TK_out) {
2593 // Indent problem - synthesise tokens to get us
2595 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2596 p.next.sym = states[p.next.state].shift_sym;
2597 shift(&p, tok_copy(*tk), states);
2598 // FIXME need to report this error somehow
2601 /* Error. We walk up the stack until we
2602 * find a state which will accept TK_error.
2603 * We then shift in TK_error and see what state
2604 * that takes us too.
2605 * Then we discard input tokens until
2606 * we find one that is acceptable.
2609 err_tk = tok_copy(*tk);
2610 p.next.sym = TK_error;
2611 while (shift(&p, err_tk, states) == 0
2613 // discard this state
2614 pop(&p, 1, do_free);
2617 // no state accepted TK_error
2620 while (search(&states[p.next.state], tk->num) < 0 &&
2621 tk->num != TK_eof) {
2623 tk = tok_copy(token_next(tokens));
2624 if (tk->num == TK_in)
2625 p.next.indents += 1;
2626 if (tk->num == TK_out) {
2627 if (p.next.indents == 0)
2629 p.next.indents -= 1;
2632 if (p.tos == 0 && tk->num == TK_eof)
2637 ret = p.asn_stack[0];
2639 pop(&p, p.tos, do_free);
2645 ###### exported functions
2646 void *parser_run(struct token_state *tokens,
2647 const struct state states[],
2648 int (*do_reduce)(int, void**, void*),
2649 void (*do_free)(short, void*),
2650 FILE *trace, const char *non_term[], int knowns);
2654 Being able to visualize the parser in action can be invaluable when
2655 debugging the parser code, or trying to understand how the parse of a
2656 particular grammar progresses. The stack contains all the important
2657 state, so just printing out the stack every time around the parse loop
2658 can make it possible to see exactly what is happening.
2660 This doesn't explicitly show each SHIFT and REDUCE action. However they
2661 are easily deduced from the change between consecutive lines, and the
2662 details of each state can be found by cross referencing the states list
2663 in the stack with the "report" that parsergen can generate.
2665 For terminal symbols, we just dump the token. For non-terminals we
2666 print the name of the symbol. The look ahead token is reported at the
2667 end inside square brackets.
2669 ###### parser functions
2671 static char *reserved_words[] = {
2672 [TK_error] = "ERROR",
2675 [TK_newline] = "NEWLINE",
2678 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2680 fprintf(trace, "(%d", f->state);
2682 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2684 if (states[f->state].starts_line)
2685 fprintf(trace, "s");
2686 if (f->newline_permitted)
2687 fprintf(trace, "n");
2688 fprintf(trace, ") ");
2691 void parser_trace(FILE *trace, struct parser *p,
2692 struct token *tk, const struct state states[],
2693 const char *non_term[], int knowns)
2696 for (i = 0; i < p->tos; i++) {
2697 int sym = p->stack[i].sym;
2698 parser_trace_state(trace, &p->stack[i], states);
2699 if (sym < TK_reserved &&
2700 reserved_words[sym] != NULL)
2701 fputs(reserved_words[sym], trace);
2702 else if (sym < TK_reserved + knowns) {
2703 struct token *t = p->asn_stack[i];
2704 text_dump(trace, t->txt, 20);
2706 fputs(non_term[sym - TK_reserved - knowns],
2710 parser_trace_state(trace, &p->next, states);
2711 fprintf(trace, " [");
2712 if (tk->num < TK_reserved &&
2713 reserved_words[tk->num] != NULL)
2714 fputs(reserved_words[tk->num], trace);
2716 text_dump(trace, tk->txt, 20);
2717 fputs("]\n", trace);
2722 The obvious example for a parser is a calculator.
2724 As `scanner` provides a number parsing function using `libgmp`, it is not much
2725 work to perform arbitrary rational number calculations.
2727 This calculator takes one expression, or an equality test per line. The
2728 results are printed and, if any equality test fails, the program exits with
2731 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2732 better approach, but as the grammar file must have 3 components I need
2733 something like this.
2735 ###### File: parsergen.mk
# Build the demo calculator: generate calc.c from the grammar with the
# freshly built parsergen, then link it against the parser, scanner,
# mdcode and number libraries (ICU and GMP are needed by the scanner's
# unicode handling and the rational-number arithmetic respectively).
2736 calc.c : parsergen calc.cgm
2737 ./parsergen -o calc calc.cgm
2738 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2739 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2747 // what do we use for a demo-grammar? A calculator of course.
2761 #include <sys/mman.h>
2766 #include "scanner.h"
2772 static void free_number(struct number *n)
2778 int main(int argc, char *argv[])
2780 int fd = open(argv[1], O_RDONLY);
2781 int len = lseek(fd, 0, 2);
2782 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2783 struct section *s = code_extract(file, file+len, NULL);
2784 struct token_config config = {
2785 .ignored = (1 << TK_line_comment)
2786 | (1 << TK_block_comment)
2789 .number_chars = ".,_+-",
2793 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2801 Session -> Session Line
2804 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2805 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2806 gmp_printf(" or as a decimal: %Fg\n", fl);
2810 | Expression = Expression NEWLINE ${
2811 if (mpq_equal($1.val, $3.val))
2812 gmp_printf("Both equal %Qd\n", $1.val);
2814 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2819 | NEWLINE ${ printf("Blank line\n"); }$
2820 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2823 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2824 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2825 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2827 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2828 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2829 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2831 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2832 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$