1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h`. files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
62 [scanner]: scanner.html
68 ###### parser includes
72 The grammar contains production sets, precedence/associativity
73 declarations, and data type declarations. These are all parsed with
74 _ad hoc_ parsing as we don't have a parser generator yet.
76 The precedence and associativity can be set for each production, but
77 can be inherited from symbols. The data type is potentially defined
78 for each non-terminal and describes what C structure is used to store
79 information about each symbol.
82 enum assoc {Left, Right, Non};
83 char *assoc_names[] = {"Left","Right","Non"};
86 struct text struct_name;
88 unsigned short precedence;
92 unsigned short precedence;
100 The strings reported by `mdcode` and `scanner` are `struct text` which have
101 length rather than being null terminated. To help with printing and
102 comparing we define `text_is` and `prtxt`, which should possibly go in
103 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
104 which might contain control characters.
107 static int text_is(struct text t, char *s)
109 return (strlen(s) == t.len &&
110 strncmp(s, t.txt, t.len) == 0);
112 static void prtxt(struct text t)
114 printf("%.*s", t.len, t.txt);
119 Productions are comprised primarily of symbols - terminal and
120 non-terminal. We do not make a syntactic distinction between the two,
121 though non-terminals must be identifiers. Non-terminal symbols are
122 those which appear in the head of a production, terminal symbols are
123 those which don't. There are also "virtual" symbols used for precedence
124 marking discussed later, and sometimes we won't know what type a symbol
127 ###### forward declarations
128 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
129 char *symtypes = "UVTN";
133 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
134 table of known symbols and the resulting parser will report them as
135 `TK_reserved + N`. A small set of identifiers are reserved for the
136 different token types that `scanner` can report.
139 static char *reserved_words[] = {
140 [TK_error] = "ERROR",
141 [TK_number] = "NUMBER",
142 [TK_ident] = "IDENTIFIER",
144 [TK_string] = "STRING",
145 [TK_multi_string] = "MULTI_STRING",
148 [TK_newline] = "NEWLINE",
154 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
155 recognised. The former is automatically expected at the end of the text
156 being parsed. The latter are always ignored by the parser.
158 All of these symbols are stored in a simple symbol table. We use the
159 `struct text` from `mdcode` to store the name and link them together
160 into a sorted list using an insertion sort.
162 We don't have separate `find` and `insert` functions as any symbol we
163 find needs to be remembered. We simply expect `find` to always return a
164 symbol, but its type might be `Unknown`.
173 ###### grammar fields
178 static int text_cmp(struct text a, struct text b)
183 int cmp = strncmp(a.txt, b.txt, len);
187 return a.len - b.len;
190 static struct symbol *sym_find(struct grammar *g, struct text s)
192 struct symbol **l = &g->syms;
197 (cmp = text_cmp((*l)->name, s)) < 0)
201 n = calloc(1, sizeof(*n));
210 static void symbols_init(struct grammar *g)
212 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
214 for (i = 0; i < entries; i++) {
217 t.txt = reserved_words[i];
220 t.len = strlen(t.txt);
227 ### Data types and precedence.
229 Data type specification and precedence specification are both
230 introduced by a dollar sign at the start of the line. If the next
231 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
232 precedence, otherwise it specifies a data type.
234 The data type name is simply stored and applied to the head of all
235 subsequent productions. It must be the name of a structure, so `$expr`
236 maps to `struct expr`.
238 Any productions given before the first data type will have no data type
239 and can carry no information. In order to allow other non-terminals to
240 have no type, the data type `$void` can be given. This does *not* mean
241 that `struct void` will be used, but rather that no type will be
242 associated with future non-terminals.
244 The precedence line must contain a list of symbols - typically
245 terminal symbols, but not necessarily. It can only contain symbols
246 that have not been seen yet, so precedence declaration must precede
247 the productions that they affect.
249 A precedence line may also contain a "virtual" symbol which is an
250 identifier preceded by `$$`. Virtual terminals carry precedence
251 information but are not included in the grammar. A production can
252 declare that it inherits the precedence of a given virtual symbol.
254 This use for `$$` precludes it from being used as a symbol in the
255 described language. Two other symbols: `${` and `}$` are also
258 Each new precedence line introduces a new precedence level and
259 declares how it associates. This level is stored in each symbol
260 listed and may be inherited by any production which uses the symbol. A
261 production inherits from the last symbol which has a precedence.
263 ###### grammar fields
264 struct text current_type;
268 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
269 static const char *known[] = { "$$", "${", "}$" };
272 static char *dollar_line(struct token_state *ts, struct grammar *g)
274 struct token t = token_next(ts);
279 if (t.num != TK_ident) {
280 err = "type or assoc expected after '$'";
283 if (text_is(t.txt, "LEFT"))
285 else if (text_is(t.txt, "RIGHT"))
287 else if (text_is(t.txt, "NON"))
290 g->current_type = t.txt;
291 if (text_is(t.txt, "void"))
292 g->current_type.txt = NULL;
294 if (t.num != TK_newline) {
295 err = "Extra tokens after type name";
301 // This is a precedence line, need some symbols.
305 while (t.num != TK_newline) {
306 enum symtype type = Terminal;
308 if (t.num == TK_virtual) {
311 if (t.num != TK_ident) {
312 err = "$$ must be followed by a word";
315 } else if (t.num != TK_ident &&
317 err = "Illegal token in precedence line";
320 s = sym_find(g, t.txt);
321 if (s->type != Unknown) {
322 err = "Symbols in precedence line must not already be known.";
326 s->precedence = g->prec_levels;
331 err = "No symbols given on precedence line";
335 while (t.num != TK_newline && t.num != TK_eof)
342 A production either starts with an identifier which is the head
343 non-terminal, or a vertical bar (`|`) in which case this production
344 uses the same head as the previous one. The identifier must be
345 followed by a `->` mark. All productions for a given non-terminal must
346 be grouped together with the `nonterminal ->` given only once.
348 After this a (possibly empty) sequence of identifiers and marks form
349 the body of the production. A virtual symbol may be given after the
350 body by preceding it with `$$`. If a virtual symbol is given, the
351 precedence of the production is that for the virtual symbol. If none
352 is given, the precedence is inherited from the last symbol in the
353 production which has a precedence specified.
355 After the optional precedence may come the `${` mark. This indicates
356 the start of a code fragment. If present, this must be on the same
357 line as the start of the production.
359 All of the text from the `${` through to the matching `}$` is
360 collected and forms the code-fragment for the production. It must all
361 be in one `code_node` of the literate code. The `}$` must be
362 at the end of a line.
364 Text in the code fragment will undergo substitutions where `$N` for
365 some numeric `N` will be replaced with a variable holding the parse
366 information for the particular symbol in the production. `$0` is the
367 head of the production, `$1` is the first symbol of the body, etc.
368 The type of `$N` for a terminal symbol is `struct token`. For
369 a non-terminal, it is whatever has been declared for that symbol.
371 While building productions we will need to add to an array which needs to
375 static void array_add(void *varray, int *cnt, void *new)
377 void ***array = varray;
380 current = ((*cnt-1) | (step-1))+1;
381 if (*cnt == current) {
384 *array = realloc(*array, current * sizeof(void*));
386 (*array)[*cnt] = new;
390 Collecting the code fragment simply involves reading tokens until we
391 hit the end token `}$`, and noting the character position of the start and
395 static struct text collect_code(struct token_state *state,
400 code.txt = start.txt.txt + start.txt.len;
402 t = token_next(state);
403 while (t.node == start.node &&
404 t.num != TK_close && t.num != TK_error &&
406 if (t.num == TK_close && t.node == start.node)
407 code.len = t.txt.txt - code.txt;
413 Now we have all the bits we need to parse a full production.
418 ###### grammar fields
419 struct production **productions;
420 int production_count;
422 ###### production fields
424 struct symbol **body;
429 int first_production;
432 static char *parse_production(struct grammar *g,
434 struct token_state *state)
436 /* Head has already been parsed. */
439 struct production p, *pp;
441 memset(&p, 0, sizeof(p));
443 tk = token_next(state);
444 while (tk.num == TK_ident || tk.num == TK_mark) {
445 struct symbol *bs = sym_find(g, tk.txt);
446 if (bs->type == Unknown)
448 if (bs->type == Virtual) {
449 err = "Virtual symbol not permitted in production";
452 if (bs->precedence) {
453 p.precedence = bs->precedence;
456 array_add(&p.body, &p.body_size, bs);
457 tk = token_next(state);
459 if (tk.num == TK_virtual) {
461 tk = token_next(state);
462 if (tk.num != TK_ident) {
463 err = "word required after $$";
466 vs = sym_find(g, tk.txt);
467 if (vs->type != Virtual) {
468 err = "symbol after $$ must be virtual";
471 p.precedence = vs->precedence;
473 tk = token_next(state);
475 if (tk.num == TK_open) {
476 p.code = collect_code(state, tk);
477 if (p.code.txt == NULL) {
478 err = "code fragment not closed properly";
481 tk = token_next(state);
483 if (tk.num != TK_newline && tk.num != TK_eof) {
484 err = "stray tokens at end of line";
487 pp = malloc(sizeof(*pp));
489 array_add(&g->productions, &g->production_count, pp);
492 while (tk.num != TK_newline && tk.num != TK_eof)
493 tk = token_next(state);
497 With the ability to parse productions and dollar-lines, we have nearly all
498 that we need to parse a grammar from a `code_node`.
500 The head of the first production will effectively be the `start` symbol of
501 the grammar. However it won't _actually_ be so. Processing the grammar is
502 greatly simplified if the real start symbol only has a single production,
503 and expects `$eof` as the final terminal. So when we find the first
504 explicit production we insert an extra production as production zero which
507 ###### Example: production 0
510 where `START` is the first non-terminal given.
512 ###### create production zero
513 struct production *p = calloc(1,sizeof(*p));
514 struct text start = {"$start",6};
515 struct text eof = {"$eof",4};
516 p->head = sym_find(g, start);
517 p->head->type = Nonterminal;
518 array_add(&p->body, &p->body_size, head);
519 array_add(&p->body, &p->body_size, sym_find(g, eof));
520 g->start = p->head->num;
521 p->head->first_production = g->production_count;
522 array_add(&g->productions, &g->production_count, p);
524 Now we are ready to read in the grammar.
526 ###### grammar fields
527 int start; // the 'start' symbol.
530 static struct grammar *grammar_read(struct code_node *code)
532 struct token_config conf = {
535 .words_marks = known,
536 .known_count = sizeof(known)/sizeof(known[0]),
538 .ignored = (1 << TK_line_comment)
539 | (1 << TK_block_comment)
542 | (1 << TK_multi_string)
547 struct token_state *state = token_open(code, &conf);
549 struct symbol *head = NULL;
553 g = calloc(1, sizeof(*g));
556 for (tk = token_next(state); tk.num != TK_eof;
557 tk = token_next(state)) {
558 if (tk.num == TK_newline)
560 if (tk.num == TK_ident) {
562 head = sym_find(g, tk.txt);
563 if (head->type == Nonterminal)
564 err = "This non-terminal has already be used.";
565 else if (head->type == Virtual)
566 err = "Virtual symbol not permitted in head of production";
568 head->type = Nonterminal;
569 head->struct_name = g->current_type;
571 ## create production zero
573 head->first_production = g->production_count;
574 tk = token_next(state);
575 if (tk.num == TK_mark &&
576 text_is(tk.txt, "->"))
577 err = parse_production(g, head, state);
579 err = "'->' missing in production";
581 } else if (tk.num == TK_mark
582 && text_is(tk.txt, "|")) {
583 // another production for same non-term
585 err = parse_production(g, head, state);
587 err = "First production must have a head";
588 } else if (tk.num == TK_mark
589 && text_is(tk.txt, "$")) {
590 err = dollar_line(state, g);
592 err = "Unrecognised token at start of line.";
600 fprintf(stderr, "Error at line %d: %s\n",
607 ## Analysing the grammar
609 The central task in analysing the grammar is to determine a set of
610 states to drive the parsing process. This will require finding
611 various sets of symbols and of "items". Some of these sets will need
612 to attach information to each element in the set, so they are more
615 Each "item" may have a 'look-ahead' or `LA` set associated with
616 it. Many of these will be the same as each other. To avoid memory
617 wastage, and to simplify some comparisons of sets, these sets will be
618 stored in a list of unique sets, each assigned a number.
620 Once we have the data structures in hand to manage these sets and
621 lists, we can start setting the 'nullable' flag, build the 'FIRST'
622 sets, and then create the item sets which define the various states.
626 Though we don't only store symbols in these sets, they are the main
627 things we store, so they are called symbol sets or "symsets".
629 A symset has a size, an array of shorts, and an optional array of data
630 values, which are also shorts. If the array of data is not present,
631 we store an impossible pointer, as `NULL` is used to indicate that no
632 memory has been allocated yet.
637 unsigned short *syms, *data;
639 #define NO_DATA ((unsigned short *)1)
640 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
641 const struct symset INIT_DATASET = { 0, NULL, NULL };
643 The arrays of shorts are allocated in blocks of 8 and are kept sorted
644 by using an insertion sort. We don't explicitly record the amount of
645 allocated space as it can be derived directly from the current `cnt` using
646 `((cnt - 1) | 7) + 1`.
649 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
652 int current = ((s->cnt-1) | 7) + 1;
653 if (current == s->cnt) {
655 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
656 if (s->data != NO_DATA)
657 s->data = realloc(s->data, sizeof(*s->data) * current);
660 while (i > 0 && s->syms[i-1] > key) {
661 s->syms[i] = s->syms[i-1];
662 if (s->data != NO_DATA)
663 s->data[i] = s->data[i-1];
667 if (s->data != NO_DATA)
672 Finding a symbol (or item) in a `symset` uses a simple binary search.
673 We return the index where the value was found (so data can be accessed),
674 or `-1` to indicate failure.
676 static int symset_find(struct symset *ss, unsigned short key)
683 while (lo + 1 < hi) {
684 int mid = (lo + hi) / 2;
685 if (ss->syms[mid] <= key)
690 if (ss->syms[lo] == key)
695 We will often want to form the union of two symsets. When we do, we
696 will often want to know if anything changed (as they might mean there
697 is more work to do). So `symset_union` reports whether anything was
698 added to the first set. We use a slow quadratic approach as these
699 sets don't really get very big. If profiling shows this to be a problem it
700 can be optimised later.
702 static int symset_union(struct symset *a, struct symset *b)
706 for (i = 0; i < b->cnt; i++)
707 if (symset_find(a, b->syms[i]) < 0) {
708 unsigned short data = 0;
709 if (b->data != NO_DATA)
711 symset_add(a, b->syms[i], data);
717 And of course we must be able to free a symset.
719 static void symset_free(struct symset ss)
722 if (ss.data != NO_DATA)
728 Some symsets are simply stored somewhere appropriate in the `grammar`
729 data structure, others need a bit of indirection. This applies
730 particularly to "LA" sets. These will be paired with "items" in an
731 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
732 stored in the `data` field, so we need a mapping from a small (short)
733 number to an LA `symset`.
735 As mentioned earlier this involves creating a list of unique symsets.
737 For now, we just use a linear list sorted by insertion. A skiplist
738 would be more efficient and may be added later.
745 struct setlist *next;
748 ###### grammar fields
749 struct setlist *sets;
754 static int ss_cmp(struct symset a, struct symset b)
757 int diff = a.cnt - b.cnt;
760 for (i = 0; i < a.cnt; i++) {
761 diff = (int)a.syms[i] - (int)b.syms[i];
768 static int save_set(struct grammar *g, struct symset ss)
770 struct setlist **sl = &g->sets;
774 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
781 s = malloc(sizeof(*s));
790 Finding a set by number is currently performed by a simple linear search.
791 If this turns out to hurt performance, we can store the sets in a dynamic
792 array like the productions.
794 static struct symset set_find(struct grammar *g, int num)
796 struct setlist *sl = g->sets;
797 while (sl && sl->num != num)
803 ### Setting `nullable`
805 We set `nullable` on the head symbol for any production for which all
806 the body symbols (if any) are nullable. As this is a recursive
807 definition, any change in the `nullable` setting means that we need to
808 re-evaluate where it needs to be set.
810 We simply loop around performing the same calculations until no more
817 static void set_nullable(struct grammar *g)
820 while (check_again) {
823 for (p = 0; p < g->production_count; p++) {
824 struct production *pr = g->productions[p];
827 if (pr->head->nullable)
829 for (s = 0; s < pr->body_size; s++)
830 if (! pr->body[s]->nullable)
832 if (s == pr->body_size) {
833 pr->head->nullable = 1;
840 ### Setting `can_eol`
842 In order to be able to ignore newline tokens when not relevant, but
843 still include them in the parse when needed, we will need to know
844 which states can start a "line-like" section of code. We ignore
845 newlines when there is an indent since the most recent start of a
848 To know what is line-like, we first need to know which symbols can end
849 a line-like section, which is precisely those which can end with a
850 newline token. These symbols don't necessarily always end with a
851 newline, but they can. Hence they are not described as "lines" but
854 Clearly the `TK_newline` token can end with a newline. Any symbol
855 which is the head of a production that contains a line-ending symbol
856 followed only by nullable symbols is also a line-ending symbol. We
857 use a new field `can_eol` to record this attribute of symbols, and
858 compute it in a repetitive manner similar to `set_nullable`.
864 static void set_can_eol(struct grammar *g)
867 g->symtab[TK_newline]->can_eol = 1;
868 while (check_again) {
871 for (p = 0; p < g->production_count; p++) {
872 struct production *pr = g->productions[p];
875 if (pr->head->can_eol)
878 for (s = pr->body_size - 1; s >= 0; s--) {
879 if (pr->body[s]->can_eol) {
880 pr->head->can_eol = 1;
884 if (!pr->body[s]->nullable)
891 ### Building the `first` sets
893 When calculating what can follow a particular non-terminal, we will need to
894 know what the "first" terminal in any subsequent non-terminal might be. So
895 we calculate the `first` set for every non-terminal and store them in an
896 array. We don't bother recording the "first" set for terminals as they are
899 As the "first" for one symbol might depend on the "first" of another,
900 we repeat the calculations until no changes happen, just like with
901 `set_nullable`. This makes use of the fact that `symset_union`
902 reports if any change happens.
904 The core of this which finds the "first" of part of a production body
905 will be reused for computing the "follow" sets, so we split it out
906 into a separate function.
908 ###### grammar fields
909 struct symset *first;
913 static int add_first(struct production *pr, int start,
914 struct symset *target, struct grammar *g,
919 for (s = start; s < pr->body_size; s++) {
920 struct symbol *bs = pr->body[s];
921 if (bs->type == Terminal) {
922 if (symset_find(target, bs->num) < 0) {
923 symset_add(target, bs->num, 0);
927 } else if (symset_union(target, &g->first[bs->num]))
933 *to_end = (s == pr->body_size);
937 static void build_first(struct grammar *g)
941 g->first = calloc(g->num_syms, sizeof(g->first[0]));
942 for (s = 0; s < g->num_syms; s++)
943 g->first[s] = INIT_SYMSET;
945 while (check_again) {
948 for (p = 0; p < g->production_count; p++) {
949 struct production *pr = g->productions[p];
950 struct symset *head = &g->first[pr->head->num];
952 if (add_first(pr, 0, head, g, NULL))
958 ### Building the `follow` sets.
960 There are two different situations when we will want to generate "follow"
961 sets. If we are doing an SLR analysis, we want to generate a single
962 "follow" set for each non-terminal in the grammar. That is what is
963 happening here. If we are doing an LALR or LR analysis we will want
964 to generate a separate "LA" set for each item. We do that later
967 There are two parts to generating a "follow" set. Firstly we look at
968 every place that any non-terminal appears in the body of any
969 production, and we find the set of possible "first" symbols after
970 there. This is done using `add_first` above and only needs to be done
971 once as the "first" sets are now stable and will not change.
975 for (p = 0; p < g->production_count; p++) {
976 struct production *pr = g->productions[p];
979 for (b = 0; b < pr->body_size - 1; b++) {
980 struct symbol *bs = pr->body[b];
981 if (bs->type == Terminal)
983 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
987 The second part is to add the "follow" set of the head of a production
988 to the "follow" sets of the final symbol in the production, and any
989 other symbol which is followed only by `nullable` symbols. As this
990 depends on "follow" itself we need to repeatedly perform the process
991 until no further changes happen.
995 for (again = 0, p = 0;
996 p < g->production_count;
997 p < g->production_count-1
998 ? p++ : again ? (again = 0, p = 0)
1000 struct production *pr = g->productions[p];
1003 for (b = pr->body_size - 1; b >= 0; b--) {
1004 struct symbol *bs = pr->body[b];
1005 if (bs->type == Terminal)
1007 if (symset_union(&g->follow[bs->num],
1008 &g->follow[pr->head->num]))
1015 We now just need to create and initialise the `follow` list to get a
1018 ###### grammar fields
1019 struct symset *follow;
1022 static void build_follow(struct grammar *g)
1025 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1026 for (s = 0; s < g->num_syms; s++)
1027 g->follow[s] = INIT_SYMSET;
1031 ### Building itemsets and states
1033 There are three different levels of detail that can be chosen for
1034 building the itemsets and states for the LR grammar. They are:
1036 1. LR(0) or SLR(1), where no look-ahead is considered.
1037 2. LALR(1) where we build look-ahead sets with each item and merge
1038 the LA sets when we find two paths to the same "kernel" set of items.
1039 3. LR(1) where different look-ahead for any item in the set means
1040 a different state must be created.
1042 ###### forward declarations
1043 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1045 We need to be able to look through existing states to see if a newly
1046 generated state already exists. For now we use a simple sorted linked
1049 An item is a pair of numbers: the production number and the position of
1050 "DOT", which is an index into the body of the production.
1051 As the numbers are not enormous we can combine them into a single "short"
1052 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1053 production number (so 4000 productions with maximum of 15 symbols in the
1056 Comparing the itemsets will be a little different to comparing symsets
1057 as we want to do the lookup after generating the "kernel" of an
1058 itemset, so we need to ignore the offset=zero items which are added during
1061 To facilitate this, we modify the "DOT" number so that "0" sorts to
1062 the end of the list in the symset, and then only compare items before
1066 static inline unsigned short item_num(int production, int index)
1068 return production | ((31-index) << 11);
1070 static inline int item_prod(unsigned short item)
1072 return item & 0x7ff;
1074 static inline int item_index(unsigned short item)
1076 return (31-(item >> 11)) & 0x1f;
1079 For LR(1) analysis we need to compare not just the itemset in a state
1080 but also the LA sets. As we assign each unique LA set a number, we
1081 can just compare the symset and the data values together.
1084 static int itemset_cmp(struct symset a, struct symset b,
1085 enum grammar_type type)
1091 i < a.cnt && i < b.cnt &&
1092 item_index(a.syms[i]) > 0 &&
1093 item_index(b.syms[i]) > 0;
1095 int diff = a.syms[i] - b.syms[i];
1099 diff = a.data[i] - b.data[i];
1104 if (i == a.cnt || item_index(a.syms[i]) == 0)
1108 if (i == b.cnt || item_index(b.syms[i]) == 0)
1114 if (type < LR1 || av == -1)
1117 a.data[i] - b.data[i];
1120 And now we can build the list of itemsets. The lookup routine returns
1121 both a success flag and a pointer to where in the list an insert
1122 should happen, so we don't need to search a second time.
1126 struct itemset *next;
1128 struct symset items;
1129 struct symset go_to;
1134 ###### grammar fields
1135 struct itemset *items;
1139 static int itemset_find(struct grammar *g, struct itemset ***where,
1140 struct symset kernel, enum grammar_type type)
1142 struct itemset **ip;
1144 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1145 struct itemset *i = *ip;
1147 diff = itemset_cmp(i->items, kernel, type);
1160 Adding an itemset may require merging the LA sets if LALR analysis is
1161 happening. If any new LA set adds symbols that weren't in the old LA set, we
1162 clear the `completed` flag so that the dependants of this itemset will be
1163 recalculated and their LA sets updated.
1165 `add_itemset` must consume the symsets it is passed, either by adding
1166 them to a data structure, or freeing them.
1168 static int add_itemset(struct grammar *g, struct symset ss,
1169 enum grammar_type type, int starts_line)
1171 struct itemset **where, *is;
1173 int found = itemset_find(g, &where, ss, type);
1175 is = calloc(1, sizeof(*is));
1176 is->state = g->states;
1180 is->go_to = INIT_DATASET;
1181 is->starts_line = starts_line;
1190 for (i = 0; i < ss.cnt; i++) {
1191 struct symset temp = INIT_SYMSET, s;
1192 if (ss.data[i] == is->items.data[i])
1194 s = set_find(g, is->items.data[i]);
1195 symset_union(&temp, &s);
1196 s = set_find(g, ss.data[i]);
1197 if (symset_union(&temp, &s)) {
1198 is->items.data[i] = save_set(g, temp);
1209 To build all the itemsets, we first insert the initial itemset made from the
1210 start symbol, complete each itemset, and then generate new itemsets from old
1211 until no new ones can be made.
1213 Completing an itemset means finding all the items where "DOT" is followed by
1214 a nonterminal and adding "DOT=0" items for every production from that
1215 non-terminal - providing each item hasn't already been added.
1217 If LA sets are needed, the LA set for each new item is found using
1218 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1219 be supplemented by the LA set for the item which produces the new item.
1221 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1222 is used in the next stage.
1224 NOTE: precedence handling should happen here - I haven't written this yet
1227 ###### complete itemset
1228 for (i = 0; i < is->items.cnt; i++) {
1229 int p = item_prod(is->items.syms[i]);
1230 int bs = item_index(is->items.syms[i]);
1231 struct production *pr = g->productions[p];
1234 struct symset LA = INIT_SYMSET;
1235 unsigned short sn = 0;
1237 if (bs == pr->body_size)
1240 if (symset_find(&done, s->num) < 0)
1241 symset_add(&done, s->num, 0);
1242 if (s->type != Nonterminal)
1248 add_first(pr, bs+1, &LA, g, &to_end);
1250 struct symset ss = set_find(g, is->items.data[i]);
1251 symset_union(&LA, &ss);
1253 sn = save_set(g, LA);
1254 LA = set_find(g, sn);
1257 /* Add productions for this symbol */
1258 for (p2 = s->first_production;
1259 p2 < g->production_count &&
1260 g->productions[p2]->head == s;
1262 int itm = item_num(p2, 0);
1263 int pos = symset_find(&is->items, itm);
1265 symset_add(&is->items, itm, sn);
1266 /* Will have re-ordered, so start
1267 * from beginning again */
1269 } else if (type >= LALR) {
1270 struct symset ss = set_find(g, is->items.data[pos]);
1271 struct symset tmp = INIT_SYMSET;
1273 symset_union(&tmp, &ss);
1274 if (symset_union(&tmp, &LA)) {
1275 is->items.data[pos] = save_set(g, tmp);
1283 For each symbol we found (and placed in `done`) we collect all the items for
1284 which this symbol is next, and create a new itemset with "DOT" advanced over
1285 the symbol. This is then added to the collection of itemsets (or merged
1286 with a pre-existing itemset).
1288 ###### derive itemsets
// For every symbol collected in 'done', build its goto itemset:
// take each item in the current set whose next symbol is that
// symbol, advance DOT by one, and carry the LA data across.
1289 // Now we have a completed itemset, so we need to
1290 // compute all the 'next' itemsets and create them
1291 // if they don't exist.
1292 for (i = 0; i < done.cnt; i++) {
1294 unsigned short state;
1295 int starts_line = 0;
1296 struct symbol *sym = g->symtab[done.syms[i]];
1297 struct symset newitemset = INIT_SYMSET;
1299 newitemset = INIT_DATASET;
1302 (sym->nullable && is->starts_line))
1304 for (j = 0; j < is->items.cnt; j++) {
1305 int itm = is->items.syms[j];
1306 int p = item_prod(itm);
1307 int bp = item_index(itm);
1308 struct production *pr = g->productions[p];
1309 unsigned short la = 0;
// Items with DOT at the end, or whose next symbol is not
// 'sym', contribute nothing to this goto set.
1312 if (bp == pr->body_size)
1314 if (pr->body[bp] != sym)
1317 la = is->items.data[j];
// NOTE(review): this searches for pr->head->num, but the item
// being added below is item_num(p, bp+1) — confirm this lookup
// key is intended and not a latent bug.
1318 pos = symset_find(&newitemset, pr->head->num);
1320 symset_add(&newitemset, item_num(p, bp+1), la);
1321 else if (type >= LALR) {
// Item already present: for LALR/LR1 merge the two LA
// sets and re-save only if anything was actually added.
1322 // Need to merge la set.
1323 int la2 = newitemset.data[pos];
1325 struct symset ss = set_find(g, la2);
1326 struct symset LA = INIT_SYMSET;
1327 symset_union(&LA, &ss);
1328 ss = set_find(g, la);
1329 if (symset_union(&LA, &ss))
1330 newitemset.data[pos] = save_set(g, LA);
// Record (or find) the resulting state and add a goto entry
// for this symbol if one isn't there already.
1336 state = add_itemset(g, newitemset, type, starts_line);
1337 if (symset_find(&is->go_to, done.syms[i]) < 0)
1338 symset_add(&is->go_to, done.syms[i], state);
1341 All that is left is to create the initial itemset from production zero, and
1342 with `TK_eof` as the LA set.
// Build the LR itemset collection: seed with production 0 and an
// EOF-only LA set, then walk the itemset list deriving goto itemsets
// until a full pass adds nothing.
1345 static void build_itemsets(struct grammar *g, enum grammar_type type)
1347 struct symset first = INIT_SYMSET;
1350 unsigned short la = 0;
1352 // LA set just has eof
1353 struct symset eof = INIT_SYMSET;
1354 symset_add(&eof, TK_eof, 0);
1355 la = save_set(g, eof);
1356 first = INIT_DATASET;
1358 // production 0, offset 0 (with no data)
1359 symset_add(&first, item_num(0, 0), la);
1360 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
// The list may gain entries while we iterate; the '?:' fallback
// restarts the walk from g->items whenever a pass set 'again'.
1361 for (again = 0, is = g->items;
1363 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1365 struct symset done = INIT_SYMSET;
1376 ### Completing the analysis.
1378 The exact process of analysis depends on which level was requested. For
1379 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1380 `LR1` we need first, but not follow. For `SLR` we need both.
1382 We don't build the "action" tables that you might expect as the parser
1383 doesn't use them. They will be calculated to some extent if needed for
1386 Once we have built everything we allocate arrays for the two lists:
1387 symbols and itemsets. This allows more efficient access during reporting.
1388 The symbols are grouped as terminals and non-terminals and we record the
1389 changeover point in `first_nonterm`.
1391 ###### grammar fields
1392 struct symbol **symtab;
1393 struct itemset **statetab;
1396 ###### grammar_analyse
// Run the requested analysis: number terminals first (recording the
// changeover point in first_nonterm), then non-terminals; build
// symtab indexed by symbol number; compute the itemsets; and build
// statetab indexed by state number for fast lookup while reporting.
1398 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1402 int snum = TK_reserved;
1403 for (s = g->syms; s; s = s->next)
1404 if (s->num < 0 && s->type == Terminal) {
1408 g->first_nonterm = snum;
1409 for (s = g->syms; s; s = s->next)
1415 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1416 for (s = g->syms; s; s = s->next)
1417 g->symtab[s->num] = s;
1427 build_itemsets(g, type);
1429 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1430 for (is = g->items; is ; is = is->next)
1431 g->statetab[is->state] = is;
1434 ## Reporting on the Grammar
1436 The purpose of the report is to give the grammar developer insight into
1437 how the grammar parser will work. It is basically a structured dump of
1438 all the tables that have been generated, plus a description of any conflicts.
1440 ###### grammar_report
// Dump the analysis results, then return the number of conflicts
// found so the caller can exit with an error status if non-zero.
1441 static int grammar_report(struct grammar *g, enum grammar_type type)
1447 return report_conflicts(g, type);
1450 Firstly we have the complete list of symbols, together with the "FIRST"
1451 set if that was generated.
// Print every symbol: '.' marks nullable, '>' marks can_eol, then
// the number, type letter, precedence/associativity if set, and -
// when FIRST sets were computed - each non-terminal's FIRST set.
1455 static void report_symbols(struct grammar *g)
1459 printf("SYMBOLS + FIRST:\n");
1461 printf("SYMBOLS:\n");
1463 for (n = 0; n < g->num_syms; n++) {
1464 struct symbol *s = g->symtab[n];
1468 printf(" %c%c%3d%c: ",
1469 s->nullable ? '.':' ',
1470 s->can_eol ? '>':' ',
1471 s->num, symtypes[s->type]);
1474 printf(" (%d%s)", s->precedence,
1475 assoc_names[s->assoc]);
1477 if (g->first && s->type == Nonterminal) {
1480 for (j = 0; j < g->first[n].cnt; j++) {
1483 prtxt(g->symtab[g->first[n].syms[j]]->name);
1490 Then we have the follow sets if they were computed.
// Print the FOLLOW set of every symbol that has a non-empty one
// (FOLLOW sets are only computed for SLR analysis).
1492 static void report_follow(struct grammar *g)
1495 printf("FOLLOW:\n");
1496 for (n = 0; n < g->num_syms; n++)
1497 if (g->follow[n].cnt) {
1501 prtxt(g->symtab[n]->name);
1502 for (j = 0; j < g->follow[n].cnt; j++) {
1505 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1511 And finally the item sets. These include the GO TO tables and, for
1512 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1513 it up a bit. First the items, with production number and associativity.
// Print one item as "head -> body" with a dot marking the current
// position, plus the production's precedence/associativity if any.
1515 static void report_item(struct grammar *g, int itm)
1517 int p = item_prod(itm);
1518 int dot = item_index(itm);
1519 struct production *pr = g->productions[p];
1523 prtxt(pr->head->name);
1525 for (i = 0; i < pr->body_size; i++) {
1526 printf(" %s", (dot == i ? ". ": ""));
1527 prtxt(pr->body[i]->name);
// dot == body_size means the item is reducible (DOT at the end).
1529 if (dot == pr->body_size)
1533 printf(" (%d%s)", pr->precedence,
1534 assoc_names[pr->assoc]);
1538 The LA sets which are (possibly) reported with each item:
// Print the look-ahead set stored under 'lanum' as a list of
// symbol names.
1540 static void report_la(struct grammar *g, int lanum)
1542 struct symset la = set_find(g, lanum);
1546 printf(" LOOK AHEAD(%d)", lanum);
1547 for (i = 0; i < la.cnt; i++) {
1550 prtxt(g->symtab[la.syms[i]]->name);
1555 Then the go to sets:
// Print a GO TO table: one "symbol -> state" line per entry.
1558 static void report_goto(struct grammar *g, struct symset gt)
1563 for (i = 0; i < gt.cnt; i++) {
1565 prtxt(g->symtab[gt.syms[i]]->name);
1566 printf(" -> %d\n", gt.data[i]);
1570 Now we can report all the item sets complete with items, LA sets, and GO TO.
// Walk every state, printing its items (with LA sets when the
// itemset carries data) followed by its GO TO table.
1572 static void report_itemsets(struct grammar *g)
1575 printf("ITEM SETS(%d)\n", g->states);
1576 for (s = 0; s < g->states; s++) {
1578 struct itemset *is = g->statetab[s];
1579 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1580 for (j = 0; j < is->items.cnt; j++) {
1581 report_item(g, is->items.syms[j]);
1582 if (is->items.data != NO_DATA)
1583 report_la(g, is->items.data[j]);
1585 report_goto(g, is->go_to);
1589 ### Reporting conflicts
1591 Conflict detection varies a lot among different analysis levels. However
1592 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1593 LR1 are also similar as they have FOLLOW or LA sets.
1597 ## conflict functions
// Dispatch to the conflict detector matching the analysis level and
// return the number of conflicts found.
1599 static int report_conflicts(struct grammar *g, enum grammar_type type)
1602 printf("Conflicts:\n");
1604 cnt = conflicts_lr0(g, type);
1606 cnt = conflicts_slr(g, type);
1608 printf(" - no conflicts\n");
1612 LR0 conflicts are any state which has both a reducible item and
1615 LR05 conflicts only occur if two possible reductions exist,
1616 as shifts always over-ride reductions.
1618 ###### conflict functions
// Detect conflicts without look-ahead. For LR0, any state with both
// a shiftable terminal and a reducible item conflicts; for LR0.5
// only two (or more) reducible items in one state conflict, since
// shift always over-rides reduce.
1619 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1623 for (i = 0; i < g->states; i++) {
1624 struct itemset *is = g->statetab[i];
1625 int last_reduce = -1;
1626 int prev_reduce = -1;
1627 int last_shift = -1;
1631 for (j = 0; j < is->items.cnt; j++) {
1632 int itm = is->items.syms[j];
1633 int p = item_prod(itm);
1634 int bp = item_index(itm);
1635 struct production *pr = g->productions[p];
// DOT at end-of-body: this item is reducible; remember the
// previous reducible item so doubles can be reported.
1637 if (bp == pr->body_size) {
1638 prev_reduce = last_reduce;
1642 if (pr->body[bp]->type == Terminal)
1645 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1646 printf(" State %d has both SHIFT and REDUCE:\n", i);
1647 report_item(g, is->items.syms[last_shift]);
1648 report_item(g, is->items.syms[last_reduce]);
1651 if (prev_reduce >= 0) {
1652 printf(" State %d has 2 (or more) reducible items\n", i);
1653 report_item(g, is->items.syms[prev_reduce]);
1654 report_item(g, is->items.syms[last_reduce]);
1661 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1662 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1663 in the source of the look ahead set.
1665 We build a dataset mapping terminal to item for possible SHIFTs and then
1666 another for possible REDUCE operations. We report when we get conflicts
// Detect conflicts using look-ahead. For each state, first build a
// map from terminal to shiftable item, then for each reducible item
// compare its look-ahead (FOLLOW for SLR, the stored LA set for
// LALR/LR1) against the shifts (SHIFT/REDUCE) and against earlier
// reductions' look-aheads (REDUCE/REDUCE).
1669 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1674 for (i = 0; i < g->states; i++) {
1675 struct itemset *is = g->statetab[i];
1676 struct symset shifts = INIT_DATASET;
1677 struct symset reduce = INIT_DATASET;
1681 /* First collect the shifts */
1682 for (j = 0; j < is->items.cnt; j++) {
1683 unsigned short itm = is->items.syms[j];
1684 int p = item_prod(itm);
1685 int bp = item_index(itm);
1686 struct production *pr = g->productions[p];
1688 if (bp < pr->body_size &&
1689 pr->body[bp]->type == Terminal) {
1691 int sym = pr->body[bp]->num;
1692 if (symset_find(&shifts, sym) < 0)
1693 symset_add(&shifts, sym, itm);
1696 /* Now look for reduction and conflicts */
1697 for (j = 0; j < is->items.cnt; j++) {
1698 unsigned short itm = is->items.syms[j];
1699 int p = item_prod(itm);
1700 int bp = item_index(itm);
1701 struct production *pr = g->productions[p];
// Only items with DOT at the end are reductions.
1703 if (bp < pr->body_size)
// SLR takes the look-ahead from FOLLOW(head); LALR/LR1 use
// the LA set saved alongside the item.
1708 la = g->follow[pr->head->num];
1710 la = set_find(g, is->items.data[j]);
1712 for (k = 0; k < la.cnt; k++) {
1713 int pos = symset_find(&shifts, la.syms[k]);
1715 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1716 prtxt(g->symtab[la.syms[k]]->name);
1718 report_item(g, shifts.data[pos]);
1719 report_item(g, itm);
1722 pos = symset_find(&reduce, la.syms[k]);
1724 symset_add(&reduce, la.syms[k], itm);
1727 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1728 prtxt(g->symtab[la.syms[k]]->name);
1730 report_item(g, itm);
1731 report_item(g, reduce.data[pos]);
1735 symset_free(shifts);
1736 symset_free(reduce);
1742 ## Generating the parser
1744 The exported part of the parser is the `parse_XX` function, where the name
1745 `XX` is based on the name of the parser files.
1747 This takes a `code_node`, a partially initialized `token_config`, and an
1748 optional `FILE` to send tracing to. The `token_config` gets the list of
1749 known words added and then is used with the `code_node` to initialize the
1752 `parse_XX` then calls the library function `parser_run` to actually complete
1753 the parse. This needs the `states` table and function to call the various
1754 pieces of code provided in the grammar file, so they are generated first.
1756 ###### parser_generate
// Emit the public parse_XX() entry point after the tables and
// helpers it needs: it registers the known words with the
// token_config, opens a token stream over the code, and hands
// everything to parser_run().
1758 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1764 gen_reduce(f, g, file);
1767 fprintf(f, "#line 0 \"gen_parser\"\n");
1768 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1771 fprintf(f, "\tstruct token_state *tokens;\n");
1772 fprintf(f, "\tconfig->words_marks = known;\n");
1773 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1774 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1775 fprintf(f, "\ttokens = token_open(code, config);\n");
1776 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1777 fprintf(f, "\ttoken_close(tokens);\n");
1778 fprintf(f, "\treturn rv;\n");
1779 fprintf(f, "}\n\n");
1782 ### Table words table
1784 The known words table is simply an array of terminal symbols.
1785 The table of nonterminals used for tracing is a similar array.
// Emit the 'known' array: the name of every terminal symbol, in
// symbol-number order starting at TK_reserved.
1789 static void gen_known(FILE *f, struct grammar *g)
1792 fprintf(f, "#line 0 \"gen_known\"\n");
1793 fprintf(f, "static const char *known[] = {\n");
1794 for (i = TK_reserved;
1795 i < g->num_syms && g->symtab[i]->type == Terminal;
1797 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1798 g->symtab[i]->name.txt);
1799 fprintf(f, "};\n\n");
// Emit the 'non_term' array of non-terminal names, used for
// tracing.
1802 static void gen_non_term(FILE *f, struct grammar *g)
1805 fprintf(f, "#line 0 \"gen_non_term\"\n");
1806 fprintf(f, "static const char *non_term[] = {\n");
1807 for (i = TK_reserved;
1810 if (g->symtab[i]->type == Nonterminal)
1811 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1812 g->symtab[i]->name.txt);
1813 fprintf(f, "};\n\n");
1816 ### States and the goto tables.
1818 For each state we record the goto table, the reducible production if
1819 there is one, or a symbol to shift for error recovery.
1820 Some of the details of the reducible production are stored in the
1821 `do_reduce` function to come later. Here we store the production number,
1822 the body size (useful for stack management) and the resulting symbol (useful
1823 for knowing how to free data later).
1825 The go to table is stored in a simple array of `sym` and corresponding
1828 ###### exported types
1836 const struct lookup * go_to;
// Emit one goto_<state> lookup array per state, each a list of
// { symbol, next-state } pairs from the state's go_to set.
1847 static void gen_goto(FILE *f, struct grammar *g)
1850 fprintf(f, "#line 0 \"gen_goto\"\n");
1851 for (i = 0; i < g->states; i++) {
1853 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1855 struct symset gt = g->statetab[i]->go_to;
1856 for (j = 0; j < gt.cnt; j++)
1857 fprintf(f, "\t{ %d, %d },\n",
1858 gt.syms[j], gt.data[j]);
// Emit the states[] array. For each state we choose at most one
// reducible production (keeping the one with the largest body seen)
// and a shift symbol usable for error recovery, then write the goto
// count/table plus either the reduce details (production, body
// size, head symbol) or -1s with the shift symbol.
1865 static void gen_states(FILE *f, struct grammar *g)
1868 fprintf(f, "#line 0 \"gen_states\"\n");
1869 fprintf(f, "static const struct state states[] = {\n");
1870 for (i = 0; i < g->states; i++) {
1871 struct itemset *is = g->statetab[i];
1872 int j, prod = -1, prod_len;
1874 int shift_len = 0, shift_remain = 0;
1875 for (j = 0; j < is->items.cnt; j++) {
1876 int itm = is->items.syms[j];
1877 int p = item_prod(itm);
1878 int bp = item_index(itm);
1879 struct production *pr = g->productions[p];
// DOT not at the end: candidate shift symbol. At equal DOT
// depth, prefer the item needing the fewest further symbols.
1881 if (bp < pr->body_size) {
1882 if (shift_sym < 0 ||
1883 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1884 shift_sym = pr->body[bp]->num;
1886 shift_remain = pr->body_size - bp;
1890 /* This is what we reduce */
1891 if (prod < 0 || prod_len < pr->body_size) {
1893 prod_len = pr->body_size;
1898 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1899 i, is->go_to.cnt, i, prod,
1900 g->productions[prod]->body_size,
1901 g->productions[prod]->head->num,
1904 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1905 i, is->go_to.cnt, i, shift_sym,
1908 fprintf(f, "};\n\n");
1911 ### The `do_reduce` function and the code
1913 When the parser engine decides to reduce a production, it calls `do_reduce`.
1914 This has two functions.
1916 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1917 production being reduced. Secondly it runs the code that was included with
1918 the production if any.
1920 This code needs to be able to store data somewhere. Rather than requiring
1921 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1922 `do_reduce` return the size to be saved.
1924 The code fragment requires translation when written out. Any `$N` needs to
1925 be converted to a reference either to that buffer (if `$0`) or to the
1926 structure returned by a previous reduction. These pointer need to be cast
1927 to the appropriate type for each access. All this is handled in
// Copy a production's code fragment to the output, translating each
// '$N': $0 becomes the 'ret' buffer cast to the head's struct type;
// $N (1-based) becomes body[N-1] cast to a token (for terminals) or
// to the body symbol's struct type; out-of-range or untyped
// references are emitted literally as "$N".
1933 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1936 fprintf(f, "\t\t\t");
1937 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1946 if (*c < '0' || *c > '9') {
// Accumulate the multi-digit index following '$'.
1951 while (c[1] >= '0' && c[1] <= '9') {
1953 n = n * 10 + *c - '0';
1956 fprintf(f, "(*(struct %.*s*)ret)",
1957 p->head->struct_name.len,
1958 p->head->struct_name.txt);
1959 else if (n > p->body_size)
1960 fprintf(f, "$%d", n);
1961 else if (p->body[n-1]->type == Terminal)
1962 fprintf(f, "(*(struct token *)body[%d])",
1964 else if (p->body[n-1]->struct_name.txt == NULL)
1965 fprintf(f, "$%d", n);
1967 fprintf(f, "(*(struct %.*s*)body[%d])",
1968 p->body[n-1]->struct_name.len,
1969 p->body[n-1]->struct_name.txt, n-1);
// Emit do_reduce(): a switch over production numbers running each
// production's translated code fragment, returning how many bytes
// of 'ret' were filled in (sizeof the head's struct, or 0 when the
// head has no struct type).
1976 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1979 fprintf(f, "#line 0 \"gen_reduce\"\n");
1980 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
1982 fprintf(f, "\tint ret_size = 0;\n");
1984 fprintf(f, "\tswitch(prod) {\n");
1985 for (i = 0; i < g->production_count; i++) {
1986 struct production *p = g->productions[i];
1987 fprintf(f, "\tcase %d:\n", i);
1992 if (p->head->struct_name.txt)
1993 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1994 p->head->struct_name.len,
1995 p->head->struct_name.txt);
1997 fprintf(f, "\t\tbreak;\n");
1999 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2004 As each non-terminal can potentially cause a different type of data
2005 structure to be allocated and filled in, we need to be able to free it when
2008 It is particularly important to have fine control over freeing during error
2009 recovery where individual stack frames might need to be freed.
2011 For this, the grammar author is required to define a `free_XX` function for
2012 each structure that is used by a non-terminal. `do_free` will call whichever
2013 is appropriate given a symbol number, and will call `free` (as is
2014 appropriate for tokens) on any terminal symbol.
// Emit do_free(): terminals (sym below first_nonterm) are plain
// tokens and just free()d; each non-terminal with a struct type
// gets a case calling the author-supplied free_<struct>() function.
2018 static void gen_free(FILE *f, struct grammar *g)
2022 fprintf(f, "#line 0 \"gen_free\"\n");
2023 fprintf(f, "static void do_free(short sym, void *asn)\n");
2025 fprintf(f, "\tif (!asn) return;\n");
2026 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2027 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2028 fprintf(f, "\tswitch(sym) {\n");
2030 for (i = 0; i < g->num_syms; i++) {
2031 struct symbol *s = g->symtab[i];
2033 s->type != Nonterminal ||
2034 s->struct_name.txt == NULL)
2037 fprintf(f, "\tcase %d:\n", s->num);
2038 fprintf(f, "\t\tfree_%.*s(asn);\n",
2040 s->struct_name.txt);
2041 fprintf(f, "\t\tbreak;\n");
2043 fprintf(f, "\t}\n}\n\n");
2046 ## The main routine.
2048 There are three key parts to the "main" function of parsergen: processing
2049 the arguments, loading the grammar file, and dealing with the grammar.
2051 ### Argument processing.
2053 Command line options allow the selection of analysis mode, name of output
2054 file, and whether or not a report should be generated. By default we create
2055 a report only if no code output was requested.
2057 The `parse_XX` function name uses the basename of the output file which
2058 should not have a suffix (`.c`). `.c` is added to the given name for the
2059 code, and `.h` is added for the header (if header text is specified in the
// Command line options: one flag per analysis level, plus
// -R/--report and -o/--output (which takes the output basename).
2066 static const struct option long_options[] = {
2067 { "LR0", 0, NULL, '0' },
2068 { "LR05", 0, NULL, '5' },
2069 { "SLR", 0, NULL, 'S' },
2070 { "LALR", 0, NULL, 'L' },
2071 { "LR1", 0, NULL, '1' },
2072 { "report", 0, NULL, 'R' },
2073 { "output", 1, NULL, 'o' },
2074 { NULL, 0, NULL, 0 }
2076 const char *options = "05SL1Ro:";
2078 ###### process arguments
// Parse options (analysis level defaults to LR05), then take the
// input file name; the parse_XX name is the output basename with
// any directory prefix stripped.
2080 char *outfile = NULL;
2084 enum grammar_type type = LR05;
2085 while ((opt = getopt_long(argc, argv, options,
2086 long_options, NULL)) != -1) {
2101 outfile = optarg; break;
2103 fprintf(stderr, "Usage: parsergen ...\n");
2108 infile = argv[optind++];
2110 fprintf(stderr, "No input file given\n");
2113 if (outfile && report == 1)
2116 if (name && strchr(name, '/'))
2117 name = strrchr(name, '/')+1;
2119 if (optind < argc) {
2120 fprintf(stderr, "Excess command line arguments\n");
2124 ### Loading the grammar file
2126 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2129 Once we have extracted the code (with `mdcode`) we expect to find three
2130 sections: header, code, and grammar. Anything else is an error.
2132 "header" and "code" are optional, though it is hard to build a working
2133 parser with neither. "grammar" must be provided.
2137 #include <sys/mman.h>
// Error callback handed to code_extract(): report to stderr.
2142 static void pr_err(char *msg)
2145 fprintf(stderr, "%s\n", msg);
// mmap the input file, extract its mdcode sections, and sort them
// into the three we understand: header, code, and grammar.
2149 struct section *table;
2153 fd = open(infile, O_RDONLY);
2155 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2156 infile, strerror(errno));
// lseek to the end (whence 2 == SEEK_END) yields the file length.
2159 len = lseek(fd, 0, 2);
2160 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2161 table = code_extract(file, file+len, pr_err);
2163 struct code_node *hdr = NULL;
2164 struct code_node *code = NULL;
2165 struct code_node *gram = NULL;
2166 for (s = table; s; s = s->next) {
2167 if (text_is(s->section, "header"))
2169 else if (text_is(s->section, "code"))
2171 else if (text_is(s->section, "grammar"))
2174 fprintf(stderr, "Unknown content section: %.*s\n",
2175 s->section.len, s->section.txt);
2180 ### Processing the input
2182 As we need to append an extension to a filename and then open it for
2183 writing, and we need to do this twice, it helps to have a separate function.
// Build "<base><ext>" in a fresh buffer (the +1 covers the NUL) and
// open the resulting file for writing (per the section prose).
2187 static FILE *open_ext(char *base, char *ext)
2189 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2191 strcat(strcpy(fn, base), ext);
2197 If we can read the grammar, then we analyse and optionally report on it. If
2198 the report finds conflicts we will exit with an error status.
2200 ###### process input
2201 struct grammar *g = NULL;
2203 fprintf(stderr, "No grammar section provided\n");
2206 g = grammar_read(gram);
2208 fprintf(stderr, "Failure to parse grammar\n");
2213 grammar_analyse(g, type);
2215 if (grammar_report(g, type))
2219 If a headers section is defined, we write it out and include a declaration
2220 for the `parse_XX` function so it can be used from separate code.
// Write the .h file: the literal header section plus a prototype
// for the generated parse_XX() function.
2222 if (rv == 0 && hdr && outfile) {
2223 FILE *f = open_ext(outfile, ".h");
2225 code_node_print(f, hdr, infile);
2226 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2230 fprintf(stderr, "Cannot create %s.h\n",
2236 And if all goes well and an output file was provided, we create the `.c`
2237 file with the code section (if any) and the parser tables and function.
// Write the .c file: the literal code section followed by the
// generated tables and parse_XX() function.
2239 if (rv == 0 && outfile) {
2240 FILE *f = open_ext(outfile, ".c");
2243 code_node_print(f, code, infile);
2244 gen_parser(f, g, infile, name);
2247 fprintf(stderr, "Cannot create %s.c\n",
2253 And that about wraps it up. We need to set the locale so that UTF-8 is
2254 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2256 ###### File: parsergen.mk
2257 parsergen : parsergen.o libscanner.o libmdcode.o
2258 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
// Entry point: set the locale so UTF-8 input is handled correctly,
// then process arguments, load the file, and deal with the grammar.
2265 int main(int argc, char *argv[])
2270 setlocale(LC_ALL,"");
2272 ## process arguments
2279 ## The SHIFT/REDUCE parser
2281 Having analysed the grammar and generated all the tables, we only need the
2282 shift/reduce engine to bring it all together.
2284 ### Goto table lookup
2286 The parser generator has nicely provided us with goto tables sorted by
2287 symbol number. We need a binary search function to find a symbol in the
2290 ###### parser functions
// Binary search the state's goto table (sorted by symbol number)
// for 'sym'; return the target state, or a negative value when the
// symbol has no goto entry here (callers test for < 0).
2292 static int search(const struct state *l, int sym)
2295 int hi = l->go_to_cnt;
2299 while (lo + 1 < hi) {
2300 int mid = (lo + hi) / 2;
2301 if (l->go_to[mid].sym <= sym)
2306 if (l->go_to[lo].sym == sym)
2307 return l->go_to[lo].state;
2312 ### The state stack.
2314 The core data structure for the parser is the stack. This tracks all the
2315 symbols that have been recognised or partially recognised.
2317 The stack usually won't grow very large - maybe a few tens of entries. So
2318 we dynamically resize the array as required but never bother to shrink it
2321 We keep the stack as two separate allocations. One, `asn_stack` stores the
2322 "abstract syntax nodes" which are created by each reduction. When we call
2323 `do_reduce` we need to pass an array of the `asn`s of the body of the
2324 production, and by keeping a separate `asn` stack, we can just pass a
2325 pointer into this stack.
2327 The other allocation stores all other stack fields of which there are four.
2328 The `state` is the most important one and guides the parsing process. The
2329 `sym` is nearly unnecessary. However when we want to free entries from the
2330 `asn_stack`, it helps to know what type they are so we can call the right
2331 freeing function. The symbol leads us to the right free function through
2334 The `indents` count and the `starts_indented` flag track the line
2335 indents in the symbol. These are used to allow indent information to
2336 guide parsing and error recovery.
2338 As well as the stack of frames we have a `next` frame which is
2339 assembled from the incoming token and other information prior to
2340 pushing it onto the stack.
2342 ###### parser functions
2348 short starts_indented;
2350 short newline_permitted;
2359 Two operations are needed on the stack - shift (which is like push) and pop.
2361 Shift applies not only to terminals but also to non-terminals. When we
2362 reduce a production we will pop off entries corresponding to the body
2363 symbols, then push on an item for the head of the production. This last is
2364 exactly the same process as shifting in a terminal so we use the same
2367 To simplify other code we arrange for `shift` to fail if there is no `goto`
2368 state for the symbol. This is useful in basic parsing due to our design
2369 that we shift when we can, and reduce when we cannot. So the `shift`
2370 function reports if it could.
2372 So `shift` finds the next state. If that succeeds it extends the allocations
2373 if needed and pushes all the information onto the stacks.
2375 ###### parser functions
// Try to shift the 'next' frame (with 'asn' as its abstract syntax
// node). Fails - changing nothing - when the current state has no
// goto entry for next.sym; otherwise grows the stacks if needed,
// pushes, and re-initialises 'next' for the new state.
2377 static int shift(struct parser *p,
2379 const struct state states[])
2381 // Push an entry onto the stack
2382 int newstate = search(&states[p->next.state], p->next.sym);
2385 if (p->tos >= p->stack_size) {
2386 p->stack_size += 10;
// NOTE(review): realloc results are assigned straight back; an
// OOM here would lose the old pointers - confirm that is accepted.
2387 p->stack = realloc(p->stack, p->stack_size
2388 * sizeof(p->stack[0]));
2389 p->asn_stack = realloc(p->asn_stack, p->stack_size
2390 * sizeof(p->asn_stack[0]));
2392 p->stack[p->tos] = p->next;
2393 p->asn_stack[p->tos] = asn;
2395 p->next.state = newstate;
2396 p->next.indents = 0;
2397 p->next.starts_indented = 0;
2398 // if new state doesn't start a line, we inherit newline_permitted status
2399 if (states[newstate].starts_line)
2400 p->next.newline_permitted = 1;
2404 `pop` simply moves the top of stack (`tos`) back down the required amount
2405 and frees any `asn` entries that need to be freed. It is called _after_ we
2406 reduce a production, just before we `shift` the nonterminal in.
2408 ###### parser functions
// Discard 'num' frames from the top of stack, freeing each asn via
// do_free and accumulating their indent counts into 'next', then
// restore next's state and indent flags from the new top of stack.
2410 static void pop(struct parser *p, int num,
2411 void(*do_free)(short sym, void *asn))
2415 for (i = 0; i < num; i++) {
2416 p->next.indents += p->stack[p->tos+i].indents;
2417 do_free(p->stack[p->tos+i].sym,
2418 p->asn_stack[p->tos+i]);
2422 p->next.state = p->stack[p->tos].state;
2423 p->next.starts_indented = p->stack[p->tos].starts_indented;
2424 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
// An internal (not line-leading) indent forbids a newline here.
2425 if (p->next.indents > p->next.starts_indented)
2426 p->next.newline_permitted = 0;
2430 ### Memory allocation
2432 The `scanner` returns tokens in a local variable - we want them in allocated
2433 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2434 a reduce is in a large buffer. Both of these require some allocation and
2435 copying, hence `memdup` and `tokcopy`.
2437 ###### parser includes
2440 ###### parser functions
// memdup(): copy 'len' bytes of 'm' into freshly allocated memory.
2442 void *memdup(void *m, int len)
2448 memcpy(ret, m, len);
// tok_copy(): move a by-value token into allocated memory so it can
// live on the asn stack.
2452 static struct token *tok_copy(struct token tk)
2454 struct token *new = malloc(sizeof(*new));
2459 ### The heart of the parser.
2461 Now we have the parser. If we can shift, we do. If not and we can reduce
2462 we do. If the production we reduced was production zero, then we have
2463 accepted the input and can finish.
2465 We return whatever `asn` was returned by reducing production zero.
2467 If we can neither shift nor reduce we have an error to handle. We pop
2468 single entries off the stack until we can shift the `TK_error` symbol, then
2469 drop input tokens until we find one we can shift into the new error state.
2471 When we find `TK_in` and `TK_out` tokens which report indents we need
2472 to handle them directly as the grammar cannot express what we want to
2475 `TK_in` tokens are easy: we simply update the `next` stack frame to
2476 record how many indents there are and that the next token started with
2479 `TK_out` tokens must either be counted off against any pending indent,
2480 or must force reductions until there is a pending indent which isn't
2481 at the start of a production.
2483 ###### parser includes
// The shift/reduce engine: shift when possible, otherwise reduce;
// handle TK_in/TK_out indent tokens and TK_newline directly; on
// neither shift nor reduce, run error recovery. Returns the asn
// produced by reducing production zero (or NULL-ish on failure).
2486 void *parser_run(struct token_state *tokens,
2487 const struct state states[],
2488 int (*do_reduce)(int, void**, void*),
2489 void (*do_free)(short, void*),
2490 FILE *trace, const char *non_term[], int knowns)
2492 struct parser p = { 0 };
2493 struct token *tk = NULL;
2497 p.next.newline_permitted = states[0].starts_line;
2499 struct token *err_tk;
2501 tk = tok_copy(token_next(tokens));
2502 p.next.sym = tk->num;
2504 parser_trace(trace, &p, tk, states, non_term, knowns);
// TK_in: record that the next token starts an indented region.
2506 if (p.next.sym == TK_in) {
2507 p.next.starts_indented = 1;
// TK_out: count off against a pending indent, or force reductions
// until a suitable indent is found.
2513 if (p.next.sym == TK_out) {
2514 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2515 (p.stack[p.tos-1].indents == 1 &&
2516 states[p.next.state].reduce_size > 1)) {
2517 p.stack[p.tos-1].indents -= 1;
2518 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2519 // no internal indent any more, reassess 'newline_permitted'
2520 if (states[p.stack[p.tos-1].state].starts_line)
2521 p.stack[p.tos-1].newline_permitted = 1;
2523 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2529 // fall through and force a REDUCE (as 'shift'
2532 if (p.next.sym == TK_newline) {
2533 if (! p.stack[p.tos-1].newline_permitted) {
2539 if (shift(&p, tk, states)) {
// Cannot shift: reduce the state's chosen production, running its
// code into 'buf', then pop the body and shift the head symbol.
2543 if (states[p.next.state].reduce_prod >= 0) {
2545 int prod = states[p.next.state].reduce_prod;
2546 int size = states[p.next.state].reduce_size;
2548 static char buf[16*1024];
2549 p.next.sym = states[p.next.state].reduce_sym;
2551 body = p.asn_stack +
2552 (p.tos - states[p.next.state].reduce_size);
2554 bufsize = do_reduce(prod, body, buf);
2556 pop(&p, size, do_free);
2557 shift(&p, memdup(buf, bufsize), states);
2562 if (tk->num == TK_out) {
2563 // Indent problem - synthesise tokens to get us
2565 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2566 p.next.sym = states[p.next.state].shift_sym;
2567 shift(&p, tok_copy(*tk), states);
2568 // FIXME need to report this error somehow
2571 /* Error. We walk up the stack until we
2572 * find a state which will accept TK_error.
2573 * We then shift in TK_error and see what state
2574 * that takes us to.
2575 * Then we discard input tokens until
2576 * we find one that is acceptable.
2579 err_tk = tok_copy(*tk);
2580 p.next.sym = TK_error;
2581 while (shift(&p, err_tk, states) == 0
2583 // discard this state
2584 pop(&p, 1, do_free);
2587 // no state accepted TK_error
// Discard input until a token the error state can accept (keeping
// the indent bookkeeping balanced), or until EOF.
2590 while (search(&states[p.next.state], tk->num) < 0 &&
2591 tk->num != TK_eof) {
2593 tk = tok_copy(token_next(tokens));
2594 if (tk->num == TK_in)
2595 p.next.indents += 1;
2596 if (tk->num == TK_out) {
2597 if (p.next.indents == 0)
2599 p.next.indents -= 1;
2602 if (p.tos == 0 && tk->num == TK_eof)
2607 ret = p.asn_stack[0];
2609 pop(&p, p.tos, do_free);
2615 ###### exported functions
2616 void *parser_run(struct token_state *tokens,
2617 const struct state states[],
2618 int (*do_reduce)(int, void**, void*),
2619 void (*do_free)(short, void*),
2620 FILE *trace, const char *non_term[], int knowns);
2624 Being able to visualize the parser in action can be invaluable when
2625 debugging the parser code, or trying to understand how the parse of a
2626 particular grammar progresses. The stack contains all the important
2627 state, so just printing out the stack every time around the parse loop
2628 can make it possible to see exactly what is happening.
2630 This doesn't explicitly show each SHIFT and REDUCE action. However they
2631 are easily deduced from the change between consecutive lines, and the
2632 details of each state can be found by cross referencing the states list
2633 in the stack with the "report" that parsergen can generate.
2635 For terminal symbols, we just dump the token. For non-terminals we
2636 print the name of the symbol. The look ahead token is reported at the
2637 end inside square brackets.
2639 ###### parser functions
// Names for the reserved token numbers, used when tracing.
2641 static char *reserved_words[] = {
2642 [TK_error] = "ERROR",
2645 [TK_newline] = "NEWLINE",
// Print one stack frame: "(state" then the indent info (':' when
// the frame starts indented, '.' otherwise), 's' when the state
// starts a line, and 'n' when a newline is currently permitted.
2648 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2650 fprintf(trace, "(%d", f->state);
2652 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2654 if (states[f->state].starts_line)
2655 fprintf(trace, "s");
2656 if (f->newline_permitted)
2657 fprintf(trace, "n");
2658 fprintf(trace, ") ");
// Dump the whole stack: each frame's state followed by its symbol
// (reserved word name, token text for knowns, or non-terminal
// name), then the 'next' frame and the look-ahead token in square
// brackets.
2661 void parser_trace(FILE *trace, struct parser *p,
2662 struct token *tk, const struct state states[],
2663 const char *non_term[], int knowns)
2666 for (i = 0; i < p->tos; i++) {
2667 int sym = p->stack[i].sym;
2668 parser_trace_state(trace, &p->stack[i], states);
2669 if (sym < TK_reserved &&
2670 reserved_words[sym] != NULL)
2671 fputs(reserved_words[sym], trace);
2672 else if (sym < TK_reserved + knowns) {
2673 struct token *t = p->asn_stack[i];
2674 text_dump(trace, t->txt, 20);
2676 fputs(non_term[sym - TK_reserved - knowns],
2680 parser_trace_state(trace, &p->next, states);
2681 fprintf(trace, " [");
2682 if (tk->num < TK_reserved &&
2683 reserved_words[tk->num] != NULL)
2684 fputs(reserved_words[tk->num], trace);
2686 text_dump(trace, tk->txt, 20);
2687 fputs("]\n", trace);
2692 The obvious example for a parser is a calculator.
2694 As `scanner` provides a number-parsing function using `libgmp`, it is
2695 not much work to perform arbitrary rational-number calculations.
2697 This calculator takes one expression, or an equality test, per line. The
2698 results are printed and, if any equality test fails, the program exits with
2701 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2702 better approach, but as the grammar file must have 3 components I need
2703 something like this.
2705 ###### File: parsergen.mk
2706 calc.c : parsergen calc.cgm
2707 ./parsergen -o calc calc.cgm
2708 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2709 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2717 // what do we use for a demo-grammar? A calculator of course.
2731 #include <sys/mman.h>
2736 #include "scanner.h"
/* Destructor for a parsed number's semantic value (body elided in
 * this fragment). */
2742 static void free_number(struct number *n)
/* Demo driver: mmap the file named by argv[1], extract its mdcode
 * sections, and run the calculator over the "code" section.  A
 * second command-line argument turns on parse tracing to stderr.
 * NOTE(review): open/lseek/mmap results are unchecked — acceptable
 * for a demo, but worth hardening for real use. */
2748 int main(int argc, char *argv[])
2750 int fd = open(argv[1], O_RDONLY);
/* The magic `2` is SEEK_END, so `len` becomes the file size. */
2751 int len = lseek(fd, 0, 2);
2752 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2753 struct section *s = code_extract(file, file+len, NULL);
2754 struct token_config config = {
2755 .ignored = (1 << TK_line_comment)
2756 | (1 << TK_block_comment)
2759 .number_chars = ".,_+-",
2763 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2771 Session -> Session Line
2774 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2775 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2776 gmp_printf(" or as a decimal: %Fg\n", fl);
2780 | Expression = Expression NEWLINE ${
2781 if (mpq_equal($1.val, $3.val))
2782 gmp_printf("Both equal %Qd\n", $1.val);
2784 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2789 | NEWLINE ${ printf("Blank line\n"); }$
2790 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2793 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2794 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2795 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2797 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2798 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2799 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2801 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2802 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$