1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h`. files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
62 [scanner]: scanner.html
68 ###### parser includes
72 The grammar contains production sets, precedence/associativity
73 declarations, and data type declarations. These are all parsed with
74 _ad hoc_ parsing as we don't have a parser generator yet.
76 The precedence and associativity can be set for each production, but
77 can be inherited from symbols. The data type is potentially defined
78 for each non-terminal and describes what C structure is used to store
79 information about each symbol.
82 enum assoc {Left, Right, Non};
83 char *assoc_names[] = {"Left","Right","Non"};
86 struct text struct_name;
88 unsigned short precedence;
92 unsigned short precedence;
100 The strings reported by `mdcode` and `scanner` are `struct text` which have
101 length rather than being null terminated. To help with printing and
102 comparing we define `text_is` and `prtxt`, which should possibly go in
103 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
104 which might contain control characters.
/* text_is: does the length-counted text `t` equal the NUL-terminated
 * string `s`?  Lengths must match exactly, then bytes are compared. */
107 static int text_is(struct text t, char *s)
109 return (strlen(s) == t.len &&
110 strncmp(s, t.txt, t.len) == 0);
/* prtxt: print a length-counted text (no NUL terminator) to stdout
 * using a precision-limited %.*s format. */
112 static void prtxt(struct text t)
114 printf("%.*s", t.len, t.txt);
119 Productions are comprised primarily of symbols - terminal and
120 non-terminal. We do not make a syntactic distinction between the two,
121 though non-terminals must be identifiers. Non-terminal symbols are
122 those which appear in the head of a production, terminal symbols are
123 those which don't. There are also "virtual" symbols used for precedence
124 marking discussed later, and sometimes we won't know what type a symbol
127 ###### forward declarations
128 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
129 char *symtypes = "UVTN";
133 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
134 table of known symbols and the resulting parser will report them as
135 `TK_reserved + N`. A small set of identifiers are reserved for the
136 different token types that `scanner` can report.
139 static char *reserved_words[] = {
140 [TK_error] = "ERROR",
141 [TK_number] = "NUMBER",
142 [TK_ident] = "IDENTIFIER",
144 [TK_string] = "STRING",
145 [TK_multi_string] = "MULTI_STRING",
148 [TK_newline] = "NEWLINE",
154 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
155 recognised. The former is automatically expected at the end of the text
156 being parsed. The latter are always ignored by the parser.
158 All of these symbols are stored in a simple symbol table. We use the
159 `struct text` from `mdcode` to store the name and link them together
160 into a sorted list using an insertion sort.
162 We don't have separate `find` and `insert` functions as any symbol we
163 find needs to be remembered. We simply expect `find` to always return a
164 symbol, but its type might be `Unknown`.
173 ###### grammar fields
/* text_cmp: three-way comparison of two texts, used to keep the symbol
 * list sorted.  The shared prefix is compared first; on a tie the
 * shorter text sorts first.  (Excerpt: the computation of `len` and
 * the early return on a non-zero `cmp` are on elided lines.) */
178 static int text_cmp(struct text a, struct text b)
183 int cmp = strncmp(a.txt, b.txt, len);
187 return a.len - b.len;
/* sym_find: find-or-insert `s` in the grammar's sorted linked symbol
 * list.  Always returns a symbol; a freshly inserted one is zero-filled
 * by the calloc below, i.e. its type is Unknown.  (Excerpt: the list
 * walk and insertion plumbing are on elided lines.) */
190 static struct symbol *sym_find(struct grammar *g, struct text s)
192 struct symbol **l = &g->syms;
197 (cmp = text_cmp((*l)->name, s)) < 0)
201 n = calloc(1, sizeof(*n));
/* symbols_init: pre-register every reserved token name so the parser
 * can report them as TK_reserved + N.  Each name is wrapped into a
 * struct text; the insertion itself (presumably via sym_find) is on
 * elided lines — confirm against the full file. */
210 static void symbols_init(struct grammar *g)
212 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
214 for (i = 0; i < entries; i++) {
217 t.txt = reserved_words[i];
220 t.len = strlen(t.txt);
227 ### Data types and precedence.
229 Data type specification and precedence specification are both
230 introduced by a dollar sign at the start of the line. If the next
231 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
232 precedence, otherwise it specifies a data type.
234 The data type name is simply stored and applied to the head of all
235 subsequent productions. It must be the name of a structure, so `$expr`
236 maps to `struct expr`.
238 Any productions given before the first data type will have no data type
239 and can carry no information. In order to allow other non-terminals to
240 have no type, the data type `$void` can be given. This does *not* mean
241 that `struct void` will be used, but rather that no type will be
242 associated with future non-terminals.
244 The precedence line must contain a list of symbols - typically
245 terminal symbols, but not necessarily. It can only contain symbols
246 that have not been seen yet, so precedence declaration must precede
247 the productions that they affect.
249 A precedence line may also contain a "virtual" symbol which is an
250 identifier preceded by `$$`. Virtual terminals carry precedence
251 information but are not included in the grammar. A production can
252 declare that it inherits the precedence of a given virtual symbol.
254 This use for `$$` precludes it from being used as a symbol in the
255 described language. Two other symbols: `${` and `}$` are also
258 Each new precedence line introduces a new precedence level and
259 declares how it associates. This level is stored in each symbol
260 listed and may be inherited by any production which uses the symbol. A
261 production inherits from the last symbol which has a precedence.
263 ###### grammar fields
264 struct text current_type;
268 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
269 static const char *known[] = { "$$", "${", "}$" };
/* dollar_line: handle a line beginning with `$`.  If the first word is
 * LEFT/RIGHT/NON it opens a new precedence level and lists the symbols
 * that get it; otherwise the word names the data type applied to
 * subsequent production heads ($void clears it).  Returns an error
 * string; the success return and several control-flow lines are elided
 * in this excerpt. */
272 static char *dollar_line(struct token_state *ts, struct grammar *g)
274 struct token t = token_next(ts);
279 if (t.num != TK_ident) {
280 err = "type or assoc expected after '$'";
283 if (text_is(t.txt, "LEFT"))
285 else if (text_is(t.txt, "RIGHT"))
287 else if (text_is(t.txt, "NON"))
/* not an assoc keyword: this is a data-type declaration */
290 g->current_type = t.txt;
291 if (text_is(t.txt, "void"))
292 g->current_type.txt = NULL;
294 if (t.num != TK_newline) {
295 err = "Extra tokens after type name";
301 // This is a precedence line, need some symbols.
305 while (t.num != TK_newline) {
306 enum symtype type = Terminal;
308 if (t.num == TK_virtual) {
311 if (t.num != TK_ident) {
312 err = "$$ must be followed by a word";
315 } else if (t.num != TK_ident &&
317 err = "Illegal token in precedence line";
/* symbols on a precedence line must be new (Unknown) */
320 s = sym_find(g, t.txt);
321 if (s->type != Unknown) {
322 err = "Symbols in precedence line must not already be known.";
326 s->precedence = g->prec_levels;
331 err = "No symbols given on precedence line";
/* on error: consume the rest of the line before returning */
335 while (t.num != TK_newline && t.num != TK_eof)
342 A production either starts with an identifier which is the head
343 non-terminal, or a vertical bar (`|`) in which case this production
344 uses the same head as the previous one. The identifier must be
345 followed by a `->` mark. All productions for a given non-terminal must
346 be grouped together with the `nonterminal ->` given only once.
348 After this a (possibly empty) sequence of identifiers and marks form
349 the body of the production. A virtual symbol may be given after the
350 body by preceding it with `$$`. If a virtual symbol is given, the
351 precedence of the production is that for the virtual symbol. If none
352 is given, the precedence is inherited from the last symbol in the
353 production which has a precedence specified.
355 After the optional precedence may come the `${` mark. This indicates
356 the start of a code fragment. If present, this must be on the same
357 line as the start of the production.
359 All of the text from the `${` through to the matching `}$` is
360 collected and forms the code-fragment for the production. It must all
361 be in one `code_node` of the literate code. The `}$` must be
362 at the end of a line.
364 Text in the code fragment will undergo substitutions where `$N` for
365 some numeric `N` will be replaced with a variable holding the parse
366 information for the particular symbol in the production. `$0` is the
367 head of the production, `$1` is the first symbol of the body, etc.
368 The type of `$N` for a terminal symbol is `struct token`. For
369 a non-terminal, it is whatever has been declared for that symbol.
371 While building productions we will need to add to an array which needs to
/* array_add: append `new` to a growable array of pointers.  Capacity
 * is not stored; it is derived from the count as ((cnt-1)|(step-1))+1,
 * so realloc only happens when *cnt crosses a step boundary.
 * (Excerpt: `step`'s definition and the count increment are on elided
 * lines.) */
375 static void array_add(void *varray, int *cnt, void *new)
377 void ***array = varray;
380 current = ((*cnt-1) | (step-1))+1;
381 if (*cnt == current) {
384 *array = realloc(*array, current * sizeof(void*));
386 (*array)[*cnt] = new;
390 Collecting the code fragment simply involves reading tokens until we
391 hit the end token `}$`, and noting the character position of the start and
/* collect_code: starting just past the `${` token (`start`), skip
 * tokens until the matching `}$` (TK_close) in the same code_node and
 * return the spanned text.  code.len is only set when the close token
 * was found.  NOTE(review): the failure path that leaves code.txt NULL
 * (which callers test for) is on elided lines — confirm. */
395 static struct text collect_code(struct token_state *state,
400 code.txt = start.txt.txt + start.txt.len;
402 t = token_next(state);
403 while (t.node == start.node &&
404 t.num != TK_close && t.num != TK_error &&
406 if (t.num == TK_close && t.node == start.node)
407 code.len = t.txt.txt - code.txt;
413 Now we have all the bits we need to parse a full production.
418 ###### grammar fields
419 struct production **productions;
420 int production_count;
422 ###### production fields
424 struct symbol **body;
429 int first_production;
/* parse_production: parse one production body for `head` (the head
 * symbol and the "->" mark have already been consumed by the caller).
 * Gathers body symbols, an optional `$$virtual` precedence override,
 * and an optional ${...}$ code fragment, then appends a heap copy of
 * the production to g->productions.  Returns an error string; the
 * success return and several lines are elided in this excerpt. */
432 static char *parse_production(struct grammar *g,
434 struct token_state *state)
436 /* Head has already been parsed. */
439 struct production p, *pp;
441 memset(&p, 0, sizeof(p));
443 tk = token_next(state);
444 while (tk.num == TK_ident || tk.num == TK_mark) {
445 struct symbol *bs = sym_find(g, tk.txt);
446 if (bs->type == Unknown)
448 if (bs->type == Virtual) {
449 err = "Virtual symbol not permitted in production";
/* precedence is inherited from the last body symbol that has one */
452 if (bs->precedence) {
453 p.precedence = bs->precedence;
456 array_add(&p.body, &p.body_size, bs);
457 tk = token_next(state);
/* optional `$$ virtual` — overrides any inherited precedence */
459 if (tk.num == TK_virtual) {
461 tk = token_next(state);
462 if (tk.num != TK_ident) {
463 err = "word required after $$";
466 vs = sym_find(g, tk.txt);
467 if (vs->type != Virtual) {
468 err = "symbol after $$ must be virtual";
471 p.precedence = vs->precedence;
473 tk = token_next(state);
/* optional ${ ... }$ code fragment; must start on this line */
475 if (tk.num == TK_open) {
476 p.code = collect_code(state, tk);
477 if (p.code.txt == NULL) {
478 err = "code fragment not closed properly";
481 tk = token_next(state);
483 if (tk.num != TK_newline && tk.num != TK_eof) {
484 err = "stray tokens at end of line";
487 pp = malloc(sizeof(*pp));
489 array_add(&g->productions, &g->production_count, pp);
/* error path: skip to the end of the line */
492 while (tk.num != TK_newline && tk.num != TK_eof)
493 tk = token_next(state);
497 With the ability to parse productions and dollar-lines, we have nearly all
498 that we need to parse a grammar from a `code_node`.
500 The head of the first production will effectively be the `start` symbol of
501 the grammar. However it won't _actually_ be so. Processing the grammar is
502 greatly simplified if the real start symbol only has a single production,
503 and expects `$eof` as the final terminal. So when we find the first
504 explicit production we insert an extra production as production zero which
507 ###### Example: production 0
510 where `START` is the first non-terminal given.
512 ###### create production zero
/* Create the implicit production zero:  $start -> HEAD $eof
 * so that the real start symbol has exactly one production and the
 * parse is terminated by $eof.  `head` is the first user non-terminal.
 * NOTE(review): symbol numbers appear to be assigned later in
 * grammar_analyse — confirm p->head->num is meaningful at this point. */
513 struct production *p = calloc(1,sizeof(*p));
514 struct text start = {"$start",6};
515 struct text eof = {"$eof",4};
516 p->head = sym_find(g, start);
517 p->head->type = Nonterminal;
518 array_add(&p->body, &p->body_size, head);
519 array_add(&p->body, &p->body_size, sym_find(g, eof));
520 g->start = p->head->num;
521 p->head->first_production = g->production_count;
522 array_add(&g->productions, &g->production_count, p);
524 Now we are ready to read in the grammar.
526 ###### grammar fields
527 int start; // the 'start' symbol.
/* grammar_read: tokenise a `grammar` code_node (comments, strings and
 * numbers ignored per `conf`) and build a struct grammar.  A line
 * starting with an identifier opens a new non-terminal head — the very
 * first one also triggers "create production zero"; `|` adds another
 * production to the current head; `$` defers to dollar_line.
 * (Excerpt: error reporting and cleanup lines are elided.) */
530 static struct grammar *grammar_read(struct code_node *code)
532 struct token_config conf = {
535 .words_marks = known,
536 .known_count = sizeof(known)/sizeof(known[0]),
538 .ignored = (1 << TK_line_comment)
539 | (1 << TK_block_comment)
542 | (1 << TK_multi_string)
547 struct token_state *state = token_open(code, &conf);
548 struct token tk = token_next(state);
549 struct symbol *head = NULL;
553 g = calloc(1, sizeof(*g));
556 for (tk = token_next(state); tk.num != TK_eof;
557 tk = token_next(state)) {
558 if (tk.num == TK_newline)
560 if (tk.num == TK_ident) {
562 head = sym_find(g, tk.txt);
563 if (head->type == Nonterminal)
/* NOTE(review): message typo — should read "already been used" */
564 err = "This non-terminal has already be used.";
565 else if (head->type == Virtual)
566 err = "Virtual symbol not permitted in head of production";
568 head->type = Nonterminal;
569 head->struct_name = g->current_type;
571 ## create production zero
573 head->first_production = g->production_count;
574 tk = token_next(state);
575 if (tk.num == TK_mark &&
576 text_is(tk.txt, "->"))
577 err = parse_production(g, head, state);
579 err = "'->' missing in production";
581 } else if (tk.num == TK_mark
582 && text_is(tk.txt, "|")) {
583 // another production for same non-term
585 err = parse_production(g, head, state);
587 err = "First production must have a head";
588 } else if (tk.num == TK_mark
589 && text_is(tk.txt, "$")) {
590 err = dollar_line(state, g);
592 err = "Unrecognised token at start of line.";
600 fprintf(stderr, "Error at line %d: %s\n",
607 ## Analysing the grammar
609 The central task in analysing the grammar is to determine a set of
610 states to drive the parsing process. This will require finding
611 various sets of symbols and of "items". Some of these sets will need
612 to attach information to each element in the set, so they are more
615 Each "item" may have a 'look-ahead' or `LA` set associated with
616 it. Many of these will be the same as each other. To avoid memory
617 wastage, and to simplify some comparisons of sets, these sets will be
618 stored in a list of unique sets, each assigned a number.
620 Once we have the data structures in hand to manage these sets and
621 lists, we can start setting the 'nullable' flag, build the 'FIRST'
622 sets, and then create the item sets which define the various states.
626 Though we don't only store symbols in these sets, they are the main
627 things we store, so they are called symbol sets or "symsets".
629 A symset has a size, an array of shorts, and an optional array of data
630 values, which are also shorts. If the array of data is not present,
631 we store an impossible pointer, as `NULL` is used to indicate that no
632 memory has been allocated yet;
637 unsigned short *syms, *data;
639 #define NO_DATA ((unsigned short *)1)
640 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
641 const struct symset INIT_DATASET = { 0, NULL, NULL };
643 The arrays of shorts are allocated in blocks of 8 and are kept sorted
644 by using an insertion sort. We don't explicitly record the amount of
645 allocated space as it can be derived directly from the current `cnt` using
646 `((cnt - 1) | 7) + 1`.
/* symset_add: insert `key` (and, when the set carries data, `val`)
 * keeping s->syms sorted.  Capacity grows in blocks of 8 and is
 * derived from the count as ((cnt-1)|7)+1 rather than stored. */
649 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
652 int current = ((s->cnt-1) | 7) + 1;
653 if (current == s->cnt) {
655 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
656 if (s->data != NO_DATA)
657 s->data = realloc(s->data, sizeof(*s->data) * current);
/* insertion sort: shift larger keys (and their data) up one slot */
660 while (i > 0 && s->syms[i-1] > key) {
661 s->syms[i] = s->syms[i-1];
662 if (s->data != NO_DATA)
663 s->data[i] = s->data[i-1];
667 if (s->data != NO_DATA)
672 Finding a symbol (or item) in a `symset` uses a simple binary search.
673 We return the index where the value was found (so data can be accessed),
674 or `-1` to indicate failure.
/* symset_find: binary search for `key`; returns its index so callers
 * can reach the paired data value, or -1 (on an elided line) when the
 * key is absent. */
676 static int symset_find(struct symset *ss, unsigned short key)
683 while (lo + 1 < hi) {
684 int mid = (lo + hi) / 2;
685 if (ss->syms[mid] <= key)
690 if (ss->syms[lo] == key)
695 We will often want to form the union of two symsets. When we do, we
696 will often want to know if anything changed (as they might mean there
697 is more work to do). So `symset_union` reports whether anything was
698 added to the first set. We use a slow quadratic approach as these
699 sets don't really get very big. If profiling shows this to be a problem it
700 can be optimised later.
/* symset_union: add to `a` every member of `b` it lacks (copying b's
 * data value when b carries data).  Returns whether `a` changed —
 * the fixed-point loops use this as their "keep going" signal.
 * Quadratic (find per element), as the prose above accepts. */
702 static int symset_union(struct symset *a, struct symset *b)
706 for (i = 0; i < b->cnt; i++)
707 if (symset_find(a, b->syms[i]) < 0) {
708 unsigned short data = 0;
709 if (b->data != NO_DATA)
711 symset_add(a, b->syms[i], data);
717 And of course we must be able to free a symset.
/* symset_free: release the syms array and, when present, the data
 * array — NO_DATA is a sentinel pointer, not an allocation. */
719 static void symset_free(struct symset ss)
722 if (ss.data != NO_DATA)
728 Some symsets are simply stored somewhere appropriate in the `grammar`
729 data structure, others need a bit of indirection. This applies
730 particularly to "LA" sets. These will be paired with "items" in an
731 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
732 stored in the `data` field, so we need a mapping from a small (short)
733 number to an LA `symset`.
735 As mentioned earlier this involves creating a list of unique symsets.
737 For now, we just use a linear list sorted by insertion. A skiplist
738 would be more efficient and may be added later.
745 struct setlist *next;
748 ###### grammar fields
749 struct setlist *sets;
/* ss_cmp: total order on symsets for the sorted set-list: first by
 * element count, then lexicographically over the sorted sym values. */
754 static int ss_cmp(struct symset a, struct symset b)
757 int diff = a.cnt - b.cnt;
760 for (i = 0; i < a.cnt; i++) {
761 diff = (int)a.syms[i] - (int)b.syms[i];
/* save_set: intern a symset.  Walks the ss_cmp-sorted setlist and (on
 * elided lines) either reuses an equal set's number or inserts a new
 * entry — giving every unique LA set a small integer id that fits in
 * a symset's `data` slot. */
768 static int save_set(struct grammar *g, struct symset ss)
770 struct setlist **sl = &g->sets;
774 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
781 s = malloc(sizeof(*s));
790 Finding a set by number is currently performed by a simple linear search.
791 If this turns out to hurt performance, we can store the sets in a dynamic
792 array like the productions.
/* set_find: linear lookup of an interned symset by its number (the
 * prose above notes this could become an array if it ever hurts). */
794 static struct symset set_find(struct grammar *g, int num)
796 struct setlist *sl = g->sets;
797 while (sl && sl->num != num)
803 ### Setting `nullable`
805 We set `nullable` on the head symbol for any production for which all
806 the body symbols (if any) are nullable. As this is a recursive
807 definition, any change in the `nullable` setting means that we need to
808 re-evaluate where it needs to be set.
810 We simply loop around performing the same calculations until no more
/* set_nullable: fixed-point computation — mark a head nullable when
 * every body symbol of one of its productions is nullable; repeat
 * whole passes until nothing changes.  (check_again's initialisation
 * and update are on elided lines.) */
817 static void set_nullable(struct grammar *g)
820 while (check_again) {
823 for (p = 0; p < g->production_count; p++) {
824 struct production *pr = g->productions[p];
/* already known nullable — nothing more to learn from this head */
827 if (pr->head->nullable)
829 for (s = 0; s < pr->body_size; s++)
830 if (! pr->body[s]->nullable)
/* loop ran to completion: every body symbol was nullable */
832 if (s == pr->body_size) {
833 pr->head->nullable = 1;
840 ### Setting `can_eol`
842 In order to be able to ignore newline tokens when not relevant, but
843 still include them in the parse when needed, we will need to know
844 which states can start a "line-like" section of code. We ignore
845 newlines when there is an indent since the most recent start of a
848 To know what is line-like, we first need to know which symbols can end
849 a line-like section, which is precisely those which can end with a
850 newline token. These symbols don't necessarily always end with a
851 newline, but they can. Hence they are not described as "lines" but
854 Clearly the `TK_newline` token can end with a newline. Any symbol
855 which is the head of a production that contains a line-ending symbol
856 followed only by nullable symbols is also a line-ending symbol. We
857 use a new field `can_eol` to record this attribute of symbols, and
858 compute it in a repetitive manner similar to `set_nullable`.
/* set_can_eol: fixed-point like set_nullable.  TK_newline seeds the
 * property; a head becomes can_eol when some body symbol is can_eol
 * and is followed only by nullable symbols — hence the backwards scan
 * that stops at the first non-nullable symbol. */
864 static void set_can_eol(struct grammar *g)
867 g->symtab[TK_newline]->can_eol = 1;
868 while (check_again) {
871 for (p = 0; p < g->production_count; p++) {
872 struct production *pr = g->productions[p];
875 if (pr->head->can_eol)
878 for (s = pr->body_size - 1; s >= 0; s--) {
879 if (pr->body[s]->can_eol) {
880 pr->head->can_eol = 1;
/* a non-nullable symbol blocks anything earlier from ending the line */
884 if (!pr->body[s]->nullable)
891 ### Building the `first` sets
893 When calculating what can follow a particular non-terminal, we will need to
894 know what the "first" terminal in a subsequent non-terminal might be. So
895 we calculate the `first` set for every non-terminal and store them in an
896 array. We don't bother recording the "first" set for terminals as they are
899 As the "first" for one symbol might depend on the "first" of another,
900 we repeat the calculations until no changes happen, just like with
901 `set_nullable`. This makes use of the fact that `symset_union`
902 reports if any change happens.
904 The core of this which finds the "first" of part of a production body
905 will be reused for computing the "follow" sets, so we split it out
906 into a separate function.
908 ###### grammar fields
909 struct symset *first;
/* add_first: union into `target` the FIRST set of the production tail
 * pr->body[start..].  A terminal contributes itself; a non-terminal
 * contributes its FIRST set; the scan stops (on elided lines) at the
 * first non-nullable symbol.  *to_end (when non-NULL) reports whether
 * the whole tail was consumed, i.e. was entirely nullable.  Returns
 * whether `target` changed, for the callers' fixed-point loops. */
913 static int add_first(struct production *pr, int start,
914 struct symset *target, struct grammar *g,
919 for (s = start; s < pr->body_size; s++) {
920 struct symbol *bs = pr->body[s];
921 if (bs->type == Terminal) {
922 if (symset_find(target, bs->num) < 0) {
923 symset_add(target, bs->num, 0);
927 } else if (symset_union(target, &g->first[bs->num]))
933 *to_end = (s == pr->body_size);
/* build_first: allocate an (initially empty) FIRST set per symbol,
 * then iterate add_first over every production until no head's set
 * grows (check_again management is on elided lines). */
937 static void build_first(struct grammar *g)
941 g->first = calloc(g->num_syms, sizeof(g->first[0]));
942 for (s = 0; s < g->num_syms; s++)
943 g->first[s] = INIT_SYMSET;
945 while (check_again) {
948 for (p = 0; p < g->production_count; p++) {
949 struct production *pr = g->productions[p];
950 struct symset *head = &g->first[pr->head->num];
952 if (add_first(pr, 0, head, g, NULL))
958 ### Building the `follow` sets.
960 There are two different situations when we will want to generate "follow"
961 sets. If we are doing an SLR analysis, we want to generate a single
962 "follow" set for each non-terminal in the grammar. That is what is
963 happening here. If we are doing an LALR or LR analysis we will want
964 to generate a separate "LA" set for each item. We do that later
967 There are two parts to generating a "follow" set. Firstly we look at
968 every place that any non-terminal appears in the body of any
969 production, and we find the set of possible "first" symbols after
970 there. This is done using `add_first` above and only needs to be done
971 once as the "first" sets are now stable and will not change.
975 for (p = 0; p < g->production_count; p++) {
976 struct production *pr = g->productions[p];
979 for (b = 0; b < pr->body_size - 1; b++) {
980 struct symbol *bs = pr->body[b];
981 if (bs->type == Terminal)
983 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
987 The second part is to add the "follow" set of the head of a production
988 to the "follow" sets of the final symbol in the production, and any
989 other symbol which is followed only by `nullable` symbols. As this
990 depends on "follow" itself we need to repeatedly perform the process
991 until no further changes happen.
995 for (again = 0, p = 0;
996 p < g->production_count;
997 p < g->production_count-1
998 ? p++ : again ? (again = 0, p = 0)
1000 struct production *pr = g->productions[p];
1003 for (b = pr->body_size - 1; b >= 0; b--) {
1004 struct symbol *bs = pr->body[b];
1005 if (bs->type == Terminal)
1007 if (symset_union(&g->follow[bs->num],
1008 &g->follow[pr->head->num]))
1015 We now just need to create and initialise the `follow` list to get a
1018 ###### grammar fields
1019 struct symset *follow;
/* build_follow: allocate empty FOLLOW sets for all symbols; the two
 * passes that fill them (FIRST-after-each-occurrence, then repeated
 * head-to-tail propagation) are literate fragments included on elided
 * lines. */
1022 static void build_follow(struct grammar *g)
1025 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1026 for (s = 0; s < g->num_syms; s++)
1027 g->follow[s] = INIT_SYMSET;
1031 ### Building itemsets and states
1033 There are three different levels of detail that can be chosen for
1034 building the itemsets and states for the LR grammar. They are:
1036 1. LR(0) or SLR(1), where no look-ahead is considered.
1037 2. LALR(1) where we build look-ahead sets with each item and merge
1038 the LA sets when we find two paths to the same "kernel" set of items.
1039 3. LR(1) where different look-ahead for any item in the set means
1040 a different state must be created.
1042 ###### forward declarations
1043 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1045 We need to be able to look through existing states to see if a newly
1046 generated state already exists. For now we use a simple sorted linked
1049 An item is a pair of numbers: the production number and the position of
1050 "DOT", which is an index into the body of the production.
1051 As the numbers are not enormous we can combine them into a single "short"
1052 and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1053 production number (so 2000 productions with maximum of 31 symbols in the
1056 Comparing the itemsets will be a little different to comparing symsets
1057 as we want to do the lookup after generating the "kernel" of an
1058 itemset, so we need to ignore the offset=zero items which are added during
1061 To facilitate this, we modify the "DOT" number so that "0" sorts to
1062 the end of the list in the symset, and then only compare items before
/* Pack/unpack an LR "item" into one unsigned short: the low 11 bits
 * hold the production number (mask 0x7ff) and the top 5 bits hold
 * (31 - index), so DOT position 0 sorts to the end of a symset and
 * kernel items (DOT > 0) come first. */
1066 static inline unsigned short item_num(int production, int index)
1068 return production | ((31-index) << 11);
1070 static inline int item_prod(unsigned short item)
1072 return item & 0x7ff;
1074 static inline int item_index(unsigned short item)
1076 return (31-(item >> 11)) & 0x1f;
1079 For LR(1) analysis we need to compare not just the itemset in a state
1080 but also the LA sets. As we assign each unique LA set a number, we
1081 can just compare the symset and the data values together.
/* itemset_cmp: compare two itemsets for state lookup.  Only kernel
 * items (item_index > 0, which sort first by construction) take part;
 * for LR1 the interned LA-set numbers in .data must also match.
 * (Excerpt: loop framing and several returns are on elided lines.) */
1084 static int itemset_cmp(struct symset a, struct symset b,
1085 enum grammar_type type)
1091 i < a.cnt && i < b.cnt &&
1092 item_index(a.syms[i]) > 0 &&
1093 item_index(b.syms[i]) > 0;
1095 int diff = a.syms[i] - b.syms[i];
/* LR1 only: the interned LA-set ids must agree as well */
1099 diff = a.data[i] - b.data[i];
/* ran off one kernel: decide order by who ended first */
1104 if (i == a.cnt || item_index(a.syms[i]) == 0)
1108 if (i == b.cnt || item_index(b.syms[i]) == 0)
1114 if (type < LR1 || av == -1)
1117 a.data[i] - b.data[i];
1120 And now we can build the list of itemsets. The lookup routine returns
1121 both a success flag and a pointer to where in the list an insert
1122 should happen, so we don't need to search a second time.
1126 struct itemset *next;
1128 struct symset items;
1129 struct symset go_to;
1133 ###### grammar fields
1134 struct itemset *items;
/* itemset_find: walk the sorted itemset list comparing kernels with
 * itemset_cmp.  Leaves *where at the match / insertion point so
 * add_itemset need not search a second time; the found/not-found
 * return is on elided lines. */
1138 static int itemset_find(struct grammar *g, struct itemset ***where,
1139 struct symset kernel, enum grammar_type type)
1141 struct itemset **ip;
1143 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1144 struct itemset *i = *ip;
1146 diff = itemset_cmp(i->items, kernel, type);
1159 Adding an itemset may require merging the LA sets if LALR analysis is
1160 happening. If any new LA set adds symbols that weren't in the old LA set, we
1161 clear the `completed` flag so that the dependants of this itemset will be
1162 recalculated and their LA sets updated.
1164 `add_itemset` must consume the symsets it is passed, either by adding
1165 them to a data structure, or freeing them.
/* add_itemset: insert `ss` as a new state, or — when an equal kernel
 * already exists and LALR merging applies — union the LA sets item by
 * item, re-interning any set that grew (which, per the prose above,
 * also clears `completed` on an elided line so dependants get
 * recalculated).  Consumes the symset it is passed. */
1167 static int add_itemset(struct grammar *g, struct symset ss,
1168 enum grammar_type type)
1170 struct itemset **where, *is;
1172 int found = itemset_find(g, &where, ss, type);
/* not found: allocate a fresh itemset / state number */
1174 is = calloc(1, sizeof(*is));
1175 is->state = g->states;
1179 is->go_to = INIT_DATASET;
/* found: merge each item's incoming LA set into the existing one */
1188 for (i = 0; i < ss.cnt; i++) {
1189 struct symset temp = INIT_SYMSET, s;
1190 if (ss.data[i] == is->items.data[i])
1192 s = set_find(g, is->items.data[i]);
1193 symset_union(&temp, &s);
1194 s = set_find(g, ss.data[i]);
1195 if (symset_union(&temp, &s)) {
1196 is->items.data[i] = save_set(g, temp);
1207 To build all the itemsets, we first insert the initial itemset made from the
1208 start symbol, complete each itemset, and then generate new itemsets from old
1209 until no new ones can be made.
1211 Completing an itemset means finding all the items where "DOT" is followed by
1212 a nonterminal and adding "DOT=0" items for every production from that
1213 non-terminal - providing each item hasn't already been added.
1215 If LA sets are needed, the LA set for each new item is found using
1216 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1217 be supplemented by the LA set for the item which produces the new item.
1219 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1220 is used in the next stage.
1222 NOTE: precedence handling should happen here - I haven't written this yet
1225 ###### complete itemset
/* Complete the itemset: for every item whose DOT precedes a symbol,
 * record that symbol in `done`; when it is a non-terminal, add DOT=0
 * items for each of its productions.  For LALR/LR1 the new items' LA
 * set is FIRST of the tail after the symbol, plus the parent item's LA
 * when the tail is nullable (to_end). */
1226 for (i = 0; i < is->items.cnt; i++) {
1227 int p = item_prod(is->items.syms[i]);
1228 int bs = item_index(is->items.syms[i]);
1229 struct production *pr = g->productions[p];
1232 struct symset LA = INIT_SYMSET;
1233 unsigned short sn = 0;
/* DOT at end of body: nothing follows, item is reducible */
1235 if (bs == pr->body_size)
1238 if (symset_find(&done, s->num) < 0)
1239 symset_add(&done, s->num, 0);
1240 if (s->type != Nonterminal)
1246 add_first(pr, bs+1, &LA, g, &to_end);
/* tail was nullable: parent's LA also applies */
1248 struct symset ss = set_find(g, is->items.data[i]);
1249 symset_union(&LA, &ss);
1251 sn = save_set(g, LA);
1252 LA = set_find(g, sn);
1254 /* Add productions for this symbol */
1255 for (p2 = s->first_production;
1256 p2 < g->production_count &&
1257 g->productions[p2]->head == s;
1259 int itm = item_num(p2, 0);
1260 int pos = symset_find(&is->items, itm);
1262 symset_add(&is->items, itm, sn);
1263 /* Will have re-ordered, so start
1264 * from beginning again */
1266 } else if (type >= LALR) {
/* item already present: merge the new LA into its existing set */
1267 struct symset ss = set_find(g, is->items.data[pos]);
1268 struct symset tmp = INIT_SYMSET;
1270 symset_union(&tmp, &ss);
1271 if (symset_union(&tmp, &LA)) {
1272 is->items.data[pos] = save_set(g, tmp);
1280 For each symbol we found (and placed in `done`) we collect all the items for
1281 which this symbol is next, and create a new itemset with "DOT" advanced over
1282 the symbol. This is then added to the collection of itemsets (or merged
1283 with a pre-existing itemset).
1285 ###### derive itemsets
/* For each symbol collected in `done`, build the goto itemset by
 * advancing DOT over that symbol in every applicable item (merging LA
 * sets for LALR/LR1), then register it via add_itemset and record the
 * transition in this state's go_to map. */
1286 // Now we have a completed itemset, so we need to
1287 // create all the 'next' itemsets and create them
1288 // if they don't exist.
1289 for (i = 0; i < done.cnt; i++) {
1291 unsigned short state;
1292 struct symset newitemset = INIT_SYMSET;
1294 newitemset = INIT_DATASET;
1296 for (j = 0; j < is->items.cnt; j++) {
1297 int itm = is->items.syms[j];
1298 int p = item_prod(itm);
1299 int bp = item_index(itm);
1300 struct production *pr = g->productions[p];
1301 unsigned short la = 0;
/* skip items that are complete or not waiting on this symbol */
1304 if (bp == pr->body_size)
1306 if (pr->body[bp]->num != done.syms[i])
1309 la = is->items.data[j];
/* NOTE(review): entries of newitemset are keyed by item_num(p, bp+1)
 * (see the add below), yet this lookup uses pr->head->num — that looks
 * inconsistent and may defeat the LALR merge branch; verify against
 * the full file. */
1310 pos = symset_find(&newitemset, pr->head->num);
1312 symset_add(&newitemset, item_num(p, bp+1), la);
1313 else if (type >= LALR) {
1314 // Need to merge la set.
1315 int la2 = newitemset.data[pos];
1317 struct symset ss = set_find(g, la2);
1318 struct symset LA = INIT_SYMSET;
1319 symset_union(&LA, &ss);
1320 ss = set_find(g, la);
1321 if (symset_union(&LA, &ss))
1322 newitemset.data[pos] = save_set(g, LA);
1328 state = add_itemset(g, newitemset, type);
1329 if (symset_find(&is->go_to, done.syms[i]) < 0)
1330 symset_add(&is->go_to, done.syms[i], state);
1333 All that is left is to create the initial itemset from production zero, and
1334 with `TK_eof` as the LA set.
1337 static void build_itemsets(struct grammar *g, enum grammar_type type)
1339 struct symset first = INIT_SYMSET;
1342 unsigned short la = 0;
1344 // LA set just has eof
1345 struct symset eof = INIT_SYMSET;
1346 symset_add(&eof, TK_eof, 0);
1347 la = save_set(g, eof);
1348 first = INIT_DATASET;
1350 // production 0, offset 0 (with no data)
1351 symset_add(&first, item_num(0, 0), la);
1352 add_itemset(g, first, type);
1353 for (again = 0, is = g->items;
1355 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1357 struct symset done = INIT_SYMSET;
1368 ### Completing the analysis.
1370 The exact process of analysis depends on which level was requested. For
1371 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1372 `LR1` we need first, but not follow. For `SLR` we need both.
1374 We don't build the "action" tables that you might expect as the parser
1375 doesn't use them. They will be calculated to some extent if needed for
1378 Once we have built everything we allocate arrays for the two lists:
1379 symbols and itemsets. This allows more efficient access during reporting.
1380 The symbols are grouped as terminals and non-terminals and we record the
1381 changeover point in `first_nonterm`.
1383 ###### grammar fields
1384 struct symbol **symtab;
1385 struct itemset **statetab;
1388 ###### grammar_analyse
1390 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1394 int snum = TK_reserved;
1395 for (s = g->syms; s; s = s->next)
1396 if (s->num < 0 && s->type == Terminal) {
1400 g->first_nonterm = snum;
1401 for (s = g->syms; s; s = s->next)
1407 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1408 for (s = g->syms; s; s = s->next)
1409 g->symtab[s->num] = s;
1419 build_itemsets(g, type);
1421 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1422 for (is = g->items; is ; is = is->next)
1423 g->statetab[is->state] = is;
1426 ## Reporting on the Grammar
1428 The purpose of the report is to give the grammar developer insight into
1429 how the grammar parser will work. It is basically a structured dump of
1430 all the tables that have been generated, plus a description of any conflicts.
1432 ###### grammar_report
1433 static int grammar_report(struct grammar *g, enum grammar_type type)
1439 return report_conflicts(g, type);
1442 Firstly we have the complete list of symbols, together with the "FIRST"
1443 set if that was generated.
1447 static void report_symbols(struct grammar *g)
1451 printf("SYMBOLS + FIRST:\n");
1453 printf("SYMBOLS:\n");
1455 for (n = 0; n < g->num_syms; n++) {
1456 struct symbol *s = g->symtab[n];
1460 printf(" %c%c%3d%c: ",
1461 s->nullable ? '.':' ',
1462 s->can_eol ? '>':' ',
1463 s->num, symtypes[s->type]);
1466 printf(" (%d%s)", s->precedence,
1467 assoc_names[s->assoc]);
1469 if (g->first && s->type == Nonterminal) {
1472 for (j = 0; j < g->first[n].cnt; j++) {
1475 prtxt(g->symtab[g->first[n].syms[j]]->name);
1482 Then we have the follow sets if they were computed.
1484 static void report_follow(struct grammar *g)
1487 printf("FOLLOW:\n");
1488 for (n = 0; n < g->num_syms; n++)
1489 if (g->follow[n].cnt) {
1493 prtxt(g->symtab[n]->name);
1494 for (j = 0; j < g->follow[n].cnt; j++) {
1497 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1503 And finally the item sets. These include the GO TO tables and, for
1504 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1505 it up a bit. First the items, with production number and associativity.
1507 static void report_item(struct grammar *g, int itm)
1509 int p = item_prod(itm);
1510 int dot = item_index(itm);
1511 struct production *pr = g->productions[p];
1515 prtxt(pr->head->name);
1517 for (i = 0; i < pr->body_size; i++) {
1518 printf(" %s", (dot == i ? ". ": ""));
1519 prtxt(pr->body[i]->name);
1521 if (dot == pr->body_size)
1525 printf(" (%d%s)", pr->precedence,
1526 assoc_names[pr->assoc]);
1530 The LA sets which are (possibly) reported with each item:
1532 static void report_la(struct grammar *g, int lanum)
1534 struct symset la = set_find(g, lanum);
1538 printf(" LOOK AHEAD(%d)", lanum);
1539 for (i = 0; i < la.cnt; i++) {
1542 prtxt(g->symtab[la.syms[i]]->name);
1547 Then the go to sets:
1550 static void report_goto(struct grammar *g, struct symset gt)
1555 for (i = 0; i < gt.cnt; i++) {
1557 prtxt(g->symtab[gt.syms[i]]->name);
1558 printf(" -> %d\n", gt.data[i]);
1562 Now we can report all the item sets complete with items, LA sets, and GO TO.
1564 static void report_itemsets(struct grammar *g)
1567 printf("ITEM SETS(%d)\n", g->states);
1568 for (s = 0; s < g->states; s++) {
1570 struct itemset *is = g->statetab[s];
1571 printf(" Itemset %d:\n", s);
1572 for (j = 0; j < is->items.cnt; j++) {
1573 report_item(g, is->items.syms[j]);
1574 if (is->items.data != NO_DATA)
1575 report_la(g, is->items.data[j]);
1577 report_goto(g, is->go_to);
1581 ### Reporting conflicts
1583 Conflict detection varies a lot among different analysis levels. However
1584 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1585 LR1 are also similar as they have FOLLOW or LA sets.
1589 ## conflict functions
1591 static int report_conflicts(struct grammar *g, enum grammar_type type)
1594 printf("Conflicts:\n");
1596 cnt = conflicts_lr0(g, type);
1598 cnt = conflicts_slr(g, type);
1600 printf(" - no conflicts\n");
1604 LR0 conflicts are any state which has both a reducible item and
1607 LR05 conflicts only occur if two possible reductions exist,
1608 as shifts always over-ride reductions.
1610 ###### conflict functions
1611 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1615 for (i = 0; i < g->states; i++) {
1616 struct itemset *is = g->statetab[i];
1617 int last_reduce = -1;
1618 int prev_reduce = -1;
1619 int last_shift = -1;
1623 for (j = 0; j < is->items.cnt; j++) {
1624 int itm = is->items.syms[j];
1625 int p = item_prod(itm);
1626 int bp = item_index(itm);
1627 struct production *pr = g->productions[p];
1629 if (bp == pr->body_size) {
1630 prev_reduce = last_reduce;
1634 if (pr->body[bp]->type == Terminal)
1637 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1638 printf(" State %d has both SHIFT and REDUCE:\n", i);
1639 report_item(g, is->items.syms[last_shift]);
1640 report_item(g, is->items.syms[last_reduce]);
1643 if (prev_reduce >= 0) {
1644 printf(" State %d has 2 (or more) reducible items\n", i);
1645 report_item(g, is->items.syms[prev_reduce]);
1646 report_item(g, is->items.syms[last_reduce]);
1653 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1654 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1655 in the source of the look ahead set.
1657 We build a dataset mapping terminal to item for possible SHIFTs and then
1658 another for possible REDUCE operations. We report when we get conflicts
1661 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1666 for (i = 0; i < g->states; i++) {
1667 struct itemset *is = g->statetab[i];
1668 struct symset shifts = INIT_DATASET;
1669 struct symset reduce = INIT_DATASET;
1673 /* First collect the shifts */
1674 for (j = 0; j < is->items.cnt; j++) {
1675 unsigned short itm = is->items.syms[j];
1676 int p = item_prod(itm);
1677 int bp = item_index(itm);
1678 struct production *pr = g->productions[p];
1680 if (bp < pr->body_size &&
1681 pr->body[bp]->type == Terminal) {
1683 int sym = pr->body[bp]->num;
1684 if (symset_find(&shifts, sym) < 0)
1685 symset_add(&shifts, sym, itm);
1688 /* Now look for reduction and conflicts */
1689 for (j = 0; j < is->items.cnt; j++) {
1690 unsigned short itm = is->items.syms[j];
1691 int p = item_prod(itm);
1692 int bp = item_index(itm);
1693 struct production *pr = g->productions[p];
1695 if (bp < pr->body_size)
1700 la = g->follow[pr->head->num];
1702 la = set_find(g, is->items.data[j]);
1704 for (k = 0; k < la.cnt; k++) {
1705 int pos = symset_find(&shifts, la.syms[k]);
1707 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1708 prtxt(g->symtab[la.syms[k]]->name);
1710 report_item(g, shifts.data[pos]);
1711 report_item(g, itm);
1714 pos = symset_find(&reduce, la.syms[k]);
1716 symset_add(&reduce, la.syms[k], itm);
1719 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1720 prtxt(g->symtab[la.syms[k]]->name);
1722 report_item(g, itm);
1723 report_item(g, reduce.data[pos]);
1727 symset_free(shifts);
1728 symset_free(reduce);
1734 ## Generating the parser
1736 The exported part of the parser is the `parse_XX` function, where the name
1737 `XX` is based on the name of the parser files.
1739 This takes a `code_node`, a partially initialized `token_config`, and an
1740 optional `FILE` to send tracing to. The `token_config` gets the list of
1741 known words added and then is used with the `code_node` to initialize the
1744 `parse_XX` then calls the library function `parser_run` to actually complete
1745 the parse. This needs the `states` table and function to call the various
1746 pieces of code provided in the grammar file, so they are generated first.
1748 ###### parser_generate
1750 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1756 gen_reduce(f, g, file);
1759 fprintf(f, "#line 0 \"gen_parser\"\n");
1760 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1763 fprintf(f, "\tstruct token_state *tokens;\n");
1764 fprintf(f, "\tconfig->words_marks = known;\n");
1765 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1766 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1767 fprintf(f, "\ttokens = token_open(code, config);\n");
1768 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1769 fprintf(f, "\ttoken_close(tokens);\n");
1770 fprintf(f, "\treturn rv;\n");
1771 fprintf(f, "}\n\n");
1774 ### Table words table
1776 The known words table is simply an array of terminal symbols.
1777 The table of nonterminals used for tracing is a similar array.
1781 static void gen_known(FILE *f, struct grammar *g)
1784 fprintf(f, "#line 0 \"gen_known\"\n");
1785 fprintf(f, "static const char *known[] = {\n");
1786 for (i = TK_reserved;
1787 i < g->num_syms && g->symtab[i]->type == Terminal;
1789 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1790 g->symtab[i]->name.txt);
1791 fprintf(f, "};\n\n");
1794 static void gen_non_term(FILE *f, struct grammar *g)
1797 fprintf(f, "#line 0 \"gen_non_term\"\n");
1798 fprintf(f, "static const char *non_term[] = {\n");
1799 for (i = TK_reserved;
1802 if (g->symtab[i]->type == Nonterminal)
1803 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1804 g->symtab[i]->name.txt);
1805 fprintf(f, "};\n\n");
1808 ### States and the goto tables.
1810 For each state we record the goto table, the reducible production if
1811 there is one, or a symbol to shift for error recovery.
1812 Some of the details of the reducible production are stored in the
1813 `do_reduce` function to come later. Here we store the production number,
1814 the body size (useful for stack management) and the resulting symbol (useful
1815 for knowing how to free data later).
1817 The go to table is stored in a simple array of `sym` and corresponding
1820 ###### exported types
1828 const struct lookup * go_to;
1838 static void gen_goto(FILE *f, struct grammar *g)
1841 fprintf(f, "#line 0 \"gen_goto\"\n");
1842 for (i = 0; i < g->states; i++) {
1844 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1846 struct symset gt = g->statetab[i]->go_to;
1847 for (j = 0; j < gt.cnt; j++)
1848 fprintf(f, "\t{ %d, %d },\n",
1849 gt.syms[j], gt.data[j]);
1856 static void gen_states(FILE *f, struct grammar *g)
1859 fprintf(f, "#line 0 \"gen_states\"\n");
1860 fprintf(f, "static const struct state states[] = {\n");
1861 for (i = 0; i < g->states; i++) {
1862 struct itemset *is = g->statetab[i];
1863 int j, prod = -1, prod_len;
1865 int shift_len = 0, shift_remain = 0;
1866 for (j = 0; j < is->items.cnt; j++) {
1867 int itm = is->items.syms[j];
1868 int p = item_prod(itm);
1869 int bp = item_index(itm);
1870 struct production *pr = g->productions[p];
1872 if (bp < pr->body_size) {
1873 if (shift_sym < 0 ||
1874 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1875 shift_sym = pr->body[bp]->num;
1877 shift_remain = pr->body_size - bp;
1881 /* This is what we reduce */
1882 if (prod < 0 || prod_len < pr->body_size) {
1884 prod_len = pr->body_size;
1889 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0 },\n",
1890 i, is->go_to.cnt, i, prod,
1891 g->productions[prod]->body_size,
1892 g->productions[prod]->head->num);
1894 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d },\n",
1895 i, is->go_to.cnt, i, shift_sym);
1897 fprintf(f, "};\n\n");
1900 ### The `do_reduce` function and the code
1902 When the parser engine decides to reduce a production, it calls `do_reduce`.
1903 This has two functions.
1905 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1906 production being reduced. Secondly it runs the code that was included with
1907 the production if any.
1909 This code needs to be able to store data somewhere. Rather than requiring
1910 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1911 `do_reduce` return the size to be saved.
1913 The code fragment requires translation when written out. Any `$N` needs to
1914 be converted to a reference either to that buffer (if `$0`) or to the
1915 structure returned by a previous reduction. These pointers need to be cast
1916 to the appropriate type for each access. All this is handled in
1922 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1925 fprintf(f, "\t\t\t");
1926 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1935 if (*c < '0' || *c > '9') {
1940 while (c[1] >= '0' && c[1] <= '9') {
1942 n = n * 10 + *c - '0';
1945 fprintf(f, "(*(struct %.*s*)ret)",
1946 p->head->struct_name.len,
1947 p->head->struct_name.txt);
1948 else if (n > p->body_size)
1949 fprintf(f, "$%d", n);
1950 else if (p->body[n-1]->type == Terminal)
1951 fprintf(f, "(*(struct token *)body[%d])",
1953 else if (p->body[n-1]->struct_name.txt == NULL)
1954 fprintf(f, "$%d", n);
1956 fprintf(f, "(*(struct %.*s*)body[%d])",
1957 p->body[n-1]->struct_name.len,
1958 p->body[n-1]->struct_name.txt, n-1);
1965 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1968 fprintf(f, "#line 0 \"gen_reduce\"\n");
1969 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
1971 fprintf(f, "\tint ret_size = 0;\n");
1973 fprintf(f, "\tswitch(prod) {\n");
1974 for (i = 0; i < g->production_count; i++) {
1975 struct production *p = g->productions[i];
1976 fprintf(f, "\tcase %d:\n", i);
1981 if (p->head->struct_name.txt)
1982 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1983 p->head->struct_name.len,
1984 p->head->struct_name.txt);
1986 fprintf(f, "\t\tbreak;\n");
1988 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1993 As each non-terminal can potentially cause a different type of data
1994 structure to be allocated and filled in, we need to be able to free it when
1997 It is particularly important to have fine control over freeing during error
1998 recovery where individual stack frames might need to be freed.
2000 For this, the grammar author is required to define a `free_XX` function for
2001 each structure that is used by a non-terminal. `do_free` will call whichever
2002 is appropriate given a symbol number, and will call `free` (as is
2003 appropriate for tokens) on any terminal symbol.
2007 static void gen_free(FILE *f, struct grammar *g)
2011 fprintf(f, "#line 0 \"gen_free\"\n");
2012 fprintf(f, "static void do_free(short sym, void *asn)\n");
2014 fprintf(f, "\tif (!asn) return;\n");
2015 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2016 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2017 fprintf(f, "\tswitch(sym) {\n");
2019 for (i = 0; i < g->num_syms; i++) {
2020 struct symbol *s = g->symtab[i];
2022 s->type != Nonterminal ||
2023 s->struct_name.txt == NULL)
2026 fprintf(f, "\tcase %d:\n", s->num);
2027 fprintf(f, "\t\tfree_%.*s(asn);\n",
2029 s->struct_name.txt);
2030 fprintf(f, "\t\tbreak;\n");
2032 fprintf(f, "\t}\n}\n\n");
2035 ## The main routine.
2037 There are three key parts to the "main" function of parsergen: processing
2038 the arguments, loading the grammar file, and dealing with the grammar.
2040 ### Argument processing.
2042 Command line options allow the selection of analysis mode, name of output
2043 file, and whether or not a report should be generated. By default we create
2044 a report only if no code output was requested.
2046 The `parse_XX` function name uses the basename of the output file which
2047 should not have a suffix (`.c`). `.c` is added to the given name for the
2048 code, and `.h` is added for the header (if header text is specified in the
2055 static const struct option long_options[] = {
2056 { "LR0", 0, NULL, '0' },
2057 { "LR05", 0, NULL, '5' },
2058 { "SLR", 0, NULL, 'S' },
2059 { "LALR", 0, NULL, 'L' },
2060 { "LR1", 0, NULL, '1' },
2061 { "report", 0, NULL, 'R' },
2062 { "output", 1, NULL, 'o' },
2063 { NULL, 0, NULL, 0 }
2065 const char *options = "05SL1Ro:";
2067 ###### process arguments
2069 char *outfile = NULL;
2073 enum grammar_type type = LR05;
2074 while ((opt = getopt_long(argc, argv, options,
2075 long_options, NULL)) != -1) {
2090 outfile = optarg; break;
2092 fprintf(stderr, "Usage: parsergen ...\n");
2097 infile = argv[optind++];
2099 fprintf(stderr, "No input file given\n");
2102 if (outfile && report == 1)
2105 if (name && strchr(name, '/'))
2106 name = strrchr(name, '/')+1;
2108 if (optind < argc) {
2109 fprintf(stderr, "Excess command line arguments\n");
2113 ### Loading the grammar file
2115 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2118 Once we have extracted the code (with `mdcode`) we expect to find three
2119 sections: header, code, and grammar. Anything else is an error.
2121 "header" and "code" are optional, though it is hard to build a working
2122 parser with neither. "grammar" must be provided.
2126 #include <sys/mman.h>
2131 static void pr_err(char *msg)
2134 fprintf(stderr, "%s\n", msg);
2138 struct section *table;
2142 fd = open(infile, O_RDONLY);
2144 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2145 infile, strerror(errno));
2148 len = lseek(fd, 0, 2);
2149 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2150 table = code_extract(file, file+len, pr_err);
2152 struct code_node *hdr = NULL;
2153 struct code_node *code = NULL;
2154 struct code_node *gram = NULL;
2155 for (s = table; s; s = s->next) {
2156 if (text_is(s->section, "header"))
2158 else if (text_is(s->section, "code"))
2160 else if (text_is(s->section, "grammar"))
2163 fprintf(stderr, "Unknown content section: %.*s\n",
2164 s->section.len, s->section.txt);
2169 ### Processing the input
2171 As we need to append an extension to a filename and then open it for
2172 writing, and we need to do this twice, it helps to have a separate function.
2176 static FILE *open_ext(char *base, char *ext)
2178 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2180 strcat(strcpy(fn, base), ext);
2186 If we can read the grammar, then we analyse and optionally report on it. If
2187 the report finds conflicts we will exit with an error status.
2189 ###### process input
2190 struct grammar *g = NULL;
2192 fprintf(stderr, "No grammar section provided\n");
2195 g = grammar_read(gram);
2197 fprintf(stderr, "Failure to parse grammar\n");
2202 grammar_analyse(g, type);
2204 if (grammar_report(g, type))
2208 If a headers section is defined, we write it out and include a declaration
2209 for the `parse_XX` function so it can be used from separate code.
2211 if (rv == 0 && hdr && outfile) {
2212 FILE *f = open_ext(outfile, ".h");
2214 code_node_print(f, hdr, infile);
2215 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2219 fprintf(stderr, "Cannot create %s.h\n",
2225 And if all goes well and an output file was provided, we create the `.c`
2226 file with the code section (if any) and the parser tables and function.
2228 if (rv == 0 && outfile) {
2229 FILE *f = open_ext(outfile, ".c");
2232 code_node_print(f, code, infile);
2233 gen_parser(f, g, infile, name);
2236 fprintf(stderr, "Cannot create %s.c\n",
2242 And that about wraps it up. We need to set the locale so that UTF-8 is
2243 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2245 ###### File: parsergen.mk
2246 parsergen : parsergen.o libscanner.o libmdcode.o
2247 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2254 int main(int argc, char *argv[])
2259 setlocale(LC_ALL,"");
2261 ## process arguments
2268 ## The SHIFT/REDUCE parser
2270 Having analysed the grammar and generated all the tables, we only need the
2271 shift/reduce engine to bring it all together.
2273 ### Goto table lookup
2275 The parser generator has nicely provided us with goto tables sorted by
2276 symbol number. We need a binary search function to find a symbol in the
2279 ###### parser functions
2281 static int search(const struct state *l, int sym)
2284 int hi = l->go_to_cnt;
2288 while (lo + 1 < hi) {
2289 int mid = (lo + hi) / 2;
2290 if (l->go_to[mid].sym <= sym)
2295 if (l->go_to[lo].sym == sym)
2296 return l->go_to[lo].state;
2301 ### The state stack.
2303 The core data structure for the parser is the stack. This tracks all the
2304 symbols that have been recognised or partially recognised.
2306 The stack usually won't grow very large - maybe a few tens of entries. So
2307 we dynamically resize an array as required but never bother to shrink it
2310 We keep the stack as two separate allocations. One, `asn_stack` stores the
2311 "abstract syntax nodes" which are created by each reduction. When we call
2312 `do_reduce` we need to pass an array of the `asn`s of the body of the
2313 production, and by keeping a separate `asn` stack, we can just pass a
2314 pointer into this stack.
2316 The other allocation stores all other stack fields of which there are four.
2317 The `state` is the most important one and guides the parsing process. The
2318 `sym` is nearly unnecessary. However when we want to free entries from the
2319 `asn_stack`, it helps to know what type they are so we can call the right
2320 freeing function. The symbol leads us to the right free function through
2323 The `indents` count and the `starts_indented` flag track the line
2324 indents in the symbol. These are used to allow indent information to
2325 guide parsing and error recovery.
2327 As well as the stack of frames we have a `next` frame which is
2328 assembled from the incoming token and other information prior to
2329 pushing it onto the stack.
2331 ###### parser functions
2337 short starts_indented;
2347 Two operations are needed on the stack - shift (which is like push) and pop.
2349 Shift applies not only to terminals but also to non-terminals. When we
2350 reduce a production we will pop off entries corresponding to the body
2351 symbols, then push on an item for the head of the production. This last is
2352 exactly the same process as shifting in a terminal so we use the same
2355 To simplify other code we arrange for `shift` to fail if there is no `goto`
2356 state for the symbol. This is useful in basic parsing due to our design
2357 that we shift when we can, and reduce when we cannot. So the `shift`
2358 function reports if it could.
2360 So `shift` finds the next state. If that succeeds it extends the allocations
2361 if needed and pushes all the information onto the stacks.
2363 ###### parser functions
2365 static int shift(struct parser *p,
2367 const struct state states[])
2369 // Push an entry onto the stack
2370 int newstate = search(&states[p->next.state], p->next.sym);
2373 if (p->tos >= p->stack_size) {
2374 p->stack_size += 10;
2375 p->stack = realloc(p->stack, p->stack_size
2376 * sizeof(p->stack[0]));
2377 p->asn_stack = realloc(p->asn_stack, p->stack_size
2378 * sizeof(p->asn_stack[0]));
2380 p->stack[p->tos] = p->next;
2381 p->asn_stack[p->tos] = asn;
2383 p->next.state = newstate;
2384 p->next.indents = 0;
2385 p->next.starts_indented = 0;
2389 `pop` simply moves the top of stack (`tos`) back down the required amount
2390 and frees any `asn` entries that need to be freed. It is called _after_ we
2391 reduce a production, just before we `shift` the nonterminal in.
2393 ###### parser functions
2395 static void pop(struct parser *p, int num,
2396 void(*do_free)(short sym, void *asn))
2400 for (i = 0; i < num; i++) {
2401 p->next.indents += p->stack[p->tos+i].indents;
2402 do_free(p->stack[p->tos+i].sym,
2403 p->asn_stack[p->tos+i]);
2407 p->next.state = p->stack[p->tos].state;
2408 p->next.starts_indented = p->stack[p->tos].starts_indented;
2412 ### Memory allocation
2414 The `scanner` returns tokens in a local variable - we want them in allocated
2415 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2416 a reduce is in a large buffer. Both of these require some allocation and
2417 copying, hence `memdup` and `tokcopy`.
2419 ###### parser includes
2422 ###### parser functions
2424 void *memdup(void *m, int len)
2430 memcpy(ret, m, len);
2434 static struct token *tok_copy(struct token tk)
2436 struct token *new = malloc(sizeof(*new));
2441 ### The heart of the parser.
2443 Now we have the parser. If we can shift, we do. If not and we can reduce
2444 we do. If the production we reduced was production zero, then we have
2445 accepted the input and can finish.
2447 We return whatever `asn` was returned by reducing production zero.
2449 If we can neither shift nor reduce we have an error to handle. We pop
2450 single entries off the stack until we can shift the `TK_error` symbol, then
2451 drop input tokens until we find one we can shift into the new error state.
2453 When we find `TK_in` and `TK_out` tokens which report indents we need
2454 to handle them directly as the grammar cannot express what we want to
2457 `TK_in` tokens are easy: we simply update the `next` stack frame to
2458 record how many indents there are and that the next token started with
2461 `TK_out` tokens must either be counted off against any pending indent,
2462 or must force reductions until there is a pending indent which isn't
2463 at the start of a production.
2465 ###### parser includes
2468 void *parser_run(struct token_state *tokens,
2469 const struct state states[],
2470 int (*do_reduce)(int, void**, void*),
2471 void (*do_free)(short, void*),
2472 FILE *trace, const char *non_term[], int knowns)
2474 struct parser p = { 0 };
2475 struct token *tk = NULL;
2480 struct token *err_tk;
2482 tk = tok_copy(token_next(tokens));
2483 p.next.sym = tk->num;
2485 parser_trace(trace, &p, tk, states, non_term, knowns);
2487 if (p.next.sym == TK_in) {
2488 p.next.starts_indented = 1;
2494 if (p.next.sym == TK_out) {
2495 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2496 (p.stack[p.tos-1].indents == 1 &&
2497 states[p.next.state].reduce_size > 1)) {
2498 p.stack[p.tos-1].indents -= 1;
2503 // fall through and force a REDUCE (as 'shift'
2506 if (shift(&p, tk, states)) {
2510 if (states[p.next.state].reduce_prod >= 0) {
2512 int prod = states[p.next.state].reduce_prod;
2513 int size = states[p.next.state].reduce_size;
2515 static char buf[16*1024];
2516 p.next.sym = states[p.next.state].reduce_sym;
2518 body = p.asn_stack +
2519 (p.tos - states[p.next.state].reduce_size);
2521 bufsize = do_reduce(prod, body, buf);
2523 pop(&p, size, do_free);
2524 shift(&p, memdup(buf, bufsize), states);
2529 if (tk->num == TK_out) {
2530 // Indent problem - synthesise tokens to get us
2532 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2533 p.next.sym = states[p.next.state].shift_sym;
2534 shift(&p, tok_copy(*tk), states);
2535 // FIXME need to report this error somehow
2538 /* Error. We walk up the stack until we
2539 * find a state which will accept TK_error.
2540 * We then shift in TK_error and see what state
2541 * that takes us too.
2542 * Then we discard input tokens until
2543 * we find one that is acceptable.
2546 err_tk = tok_copy(*tk);
2547 p.next.sym = TK_error;
2548 while (shift(&p, err_tk, states) == 0
2550 // discard this state
2551 pop(&p, 1, do_free);
2554 // no state accepted TK_error
2557 while (search(&states[p.next.state], tk->num) < 0 &&
2558 tk->num != TK_eof) {
2560 tk = tok_copy(token_next(tokens));
2561 if (tk->num == TK_in)
2562 p.next.indents += 1;
2563 if (tk->num == TK_out) {
2564 if (p.next.indents == 0)
2566 p.next.indents -= 1;
2569 if (p.tos == 0 && tk->num == TK_eof)
2574 ret = p.asn_stack[0];
2576 pop(&p, p.tos, do_free);
2582 ###### exported functions
2583 void *parser_run(struct token_state *tokens,
2584 const struct state states[],
2585 int (*do_reduce)(int, void**, void*),
2586 void (*do_free)(short, void*),
2587 FILE *trace, const char *non_term[], int knowns);
2591 Being able to visualize the parser in action can be invaluable when
2592 debugging the parser code, or trying to understand how the parse of a
2593 particular grammar progresses. The stack contains all the important
2594 state, so just printing out the stack every time around the parse loop
2595 can make it possible to see exactly what is happening.
2597 This doesn't explicitly show each SHIFT and REDUCE action. However they
2598 are easily deduced from the change between consecutive lines, and the
2599 details of each state can be found by cross referencing the states list
2600 in the stack with the "report" that parsergen can generate.
2602 For terminal symbols, we just dump the token. For non-terminals we
2603 print the name of the symbol. The look ahead token is reported at the
2604 end inside square brackets.
2606 ###### parser functions
2608 static char *reserved_words[] = {
2609 [TK_error] = "ERROR",
2612 [TK_newline] = "NEWLINE",
2615 void parser_trace(FILE *trace, struct parser *p,
2616 struct token *tk, const struct state states[],
2617 const char *non_term[], int knowns)
2620 for (i = 0; i < p->tos; i++) {
2621 int sym = p->stack[i].sym;
2622 fprintf(trace, "(%d) ", p->stack[i].state);
2623 if (sym < TK_reserved &&
2624 reserved_words[sym] != NULL)
2625 fputs(reserved_words[sym], trace);
2626 else if (sym < TK_reserved + knowns) {
2627 struct token *t = p->asn_stack[i];
2628 text_dump(trace, t->txt, 20);
2630 fputs(non_term[sym - TK_reserved - knowns],
2634 fprintf(trace, "(%d) [", p->next.state);
2635 if (tk->num < TK_reserved &&
2636 reserved_words[tk->num] != NULL)
2637 fputs(reserved_words[tk->num], trace);
2639 text_dump(trace, tk->txt, 20);
2640 fputs("]\n", trace);
2645 The obvious example for a parser is a calculator.
2647 As `scanner` provides a number parsing function using `libgmp`, it is not much
2648 work to perform arbitrary rational number calculations.
2650 This calculator takes one expression, or an equality test per line. The
2651 results are printed and if any equality test fails, the program exits with
2654 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2655 better approach, but as the grammar file must have 3 components I need
2656 something like this.
2658 ###### File: parsergen.mk
2659 calc.c : parsergen calc.cgm
2660 ./parsergen -o calc calc.cgm
2661 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2662 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2670 // what do we use for a demo-grammar? A calculator of course.
2684 #include <sys/mman.h>
2689 #include "scanner.h"
2695 static void free_number(struct number *n)
2701 int main(int argc, char *argv[])
2703 int fd = open(argv[1], O_RDONLY);
2704 int len = lseek(fd, 0, 2);
2705 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2706 struct section *s = code_extract(file, file+len, NULL);
2707 struct token_config config = {
2708 .ignored = (1 << TK_line_comment)
2709 | (1 << TK_block_comment)
2712 .number_chars = ".,_+-",
2716 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2724 Session -> Session Line
2727 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2728 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2729 gmp_printf(" or as a decimal: %Fg\n", fl);
2733 | Expression = Expression NEWLINE ${
2734 if (mpq_equal($1.val, $3.val))
2735 gmp_printf("Both equal %Qd\n", $1.val);
2737 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2742 | NEWLINE ${ printf("Blank line\n"); }$
2743 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2746 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2747 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2748 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2750 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2751 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2752 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2754 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2755 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$