1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h`. files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
62 [scanner]: scanner.html
68 ###### parser includes
72 The grammar contains production sets, precedence/associativity
73 declarations, and data type declarations. These are all parsed with
74 _ad hoc_ parsing as we don't have a parser generator yet.
76 The precedence and associativity can be set for each production, but
77 can be inherited from symbols. The data type is potentially defined
78 for each non-terminal and describes what C structure is used to store
79 information about each symbol.
82 enum assoc {Left, Right, Non};
83 char *assoc_names[] = {"Left","Right","Non"};
86 struct text struct_name;
88 unsigned short precedence;
92 unsigned short precedence;
100 The strings reported by `mdcode` and `scanner` are `struct text` which have
101 length rather than being null terminated. To help with printing and
102 comparing we define `text_is` and `prtxt`, which should possibly go in
103 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
104 which might contain control characters.
107 static int text_is(struct text t, char *s)
109 return (strlen(s) == t.len &&
110 strncmp(s, t.txt, t.len) == 0);
112 static void prtxt(struct text t)
114 printf("%.*s", t.len, t.txt);
119 Productions are comprised primarily of symbols - terminal and
120 non-terminal. We do not make a syntactic distinction between the two,
121 though non-terminals must be identifiers. Non-terminal symbols are
122 those which appear in the head of a production, terminal symbols are
123 those which don't. There are also "virtual" symbols used for precedence
124 marking discussed later, and sometimes we won't know what type a symbol
127 ###### forward declarations
128 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
129 char *symtypes = "UVTN";
133 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
134 table of known symbols and the resulting parser will report them as
135 `TK_reserved + N`. A small set of identifiers are reserved for the
136 different token types that `scanner` can report.
139 static char *reserved_words[] = {
140 [TK_error] = "ERROR",
141 [TK_number] = "NUMBER",
142 [TK_ident] = "IDENTIFIER",
144 [TK_string] = "STRING",
145 [TK_multi_string] = "MULTI_STRING",
146 [TK_indent] = "INDENT",
147 [TK_undent] = "UNDENT",
148 [TK_newline] = "NEWLINE",
154 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
155 recognised. The former is automatically expected at the end of the text
156 being parsed. The latter are always ignored by the parser.
158 All of these symbols are stored in a simple symbol table. We use the
159 `struct text` from `mdcode` to store the name and link them together
160 into a sorted list using an insertion sort.
162 We don't have separate `find` and `insert` functions as any symbol we
163 find needs to be remembered. We simply expect `find` to always return a
164 symbol, but its type might be `Unknown`.
173 ###### grammar fields
178 static int text_cmp(struct text a, struct text b)
183 int cmp = strncmp(a.txt, b.txt, len);
187 return a.len - b.len;
190 static struct symbol *sym_find(struct grammar *g, struct text s)
192 struct symbol **l = &g->syms;
197 (cmp = text_cmp((*l)->name, s)) < 0)
201 n = calloc(1, sizeof(*n));
210 static void symbols_init(struct grammar *g)
212 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
214 for (i = 0; i < entries; i++) {
217 t.txt = reserved_words[i];
220 t.len = strlen(t.txt);
227 ### Data types and precedence.
229 Data type specification and precedence specification are both
230 introduced by a dollar sign at the start of the line. If the next
231 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
232 precedence, otherwise it specifies a data type.
234 The data type name is simply stored and applied to the head of all
235 subsequent productions. It must be the name of a structure, so `$expr`
236 maps to `struct expr`.
238 The precedence line must contain a list of symbols - typically
239 terminal symbols, but not necessarily. It can only contain symbols
240 that have not been seen yet, so precedence declaration must precede
241 the productions that they affect.
243 A precedence line may also contain a "virtual" symbol which is an
244 identifier preceded by `$$`. Virtual terminals carry precedence
245 information but are not included in the grammar. A production can
246 declare that it inherits the precedence of a given virtual symbol.
248 This use for `$$` precludes it from being used as a symbol in the
249 described language. Two other symbols: `${` and `}$` are also
252 Each new precedence line introduces a new precedence level and
253 declares how it associates. This level is stored in each symbol
254 listed and may be inherited by any production which uses the symbol. A
255 production inherits from the last symbol which has a precedence.
257 ###### grammar fields
258 struct text current_type;
262 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
263 static const char *known[] = { "$$", "${", "}$" };
266 static char *dollar_line(struct token_state *ts, struct grammar *g)
268 struct token t = token_next(ts);
273 if (t.num != TK_ident) {
274 err = "type or assoc expected after '$'";
277 if (text_is(t.txt, "LEFT"))
279 else if (text_is(t.txt, "RIGHT"))
281 else if (text_is(t.txt, "NON"))
284 g->current_type = t.txt;
286 if (t.num != TK_newline) {
287 err = "Extra tokens after type name";
293 // This is a precedence line, need some symbols.
297 while (t.num != TK_newline) {
298 enum symtype type = Terminal;
300 if (t.num == TK_virtual) {
303 if (t.num != TK_ident) {
304 err = "$$ must be followed by a word";
307 } else if (t.num != TK_ident &&
309 err = "Illegal token in precedence line";
312 s = sym_find(g, t.txt);
313 if (s->type != Unknown) {
314 err = "Symbols in precedence line must not already be known.";
318 s->precedence = g->prec_levels;
323 err = "No symbols given on precedence line";
327 while (t.num != TK_newline && t.num != TK_eof)
334 A production either starts with an identifier which is the head
335 non-terminal, or a vertical bar (`|`) in which case this production
336 uses the same head as the previous one. The identifier must be
337 followed by a `->` mark. All productions for a given non-terminal must
338 be grouped together with the `nonterminal ->` given only once.
340 After this a (possibly empty) sequence of identifiers and marks form
341 the body of the production. A virtual symbol may be given after the
342 body by preceding it with `$$`. If a virtual symbol is given, the
343 precedence of the production is that for the virtual symbol. If none
344 is given, the precedence is inherited from the last symbol in the
345 production which has a precedence specified.
347 After the optional precedence may come the `${` mark. This indicates
348 the start of a code fragment. If present, this must be on the same
349 line as the start of the production.
351 All of the text from the `${` through to the matching `}$` is
352 collected and forms the code-fragment for the production. It must all
353 be in one `code_node` of the literate code. The `}$` must be
354 at the end of a line.
356 Text in the code fragment will undergo substitutions where `$N` for
357 some numeric `N` will be replaced with a variable holding the parse
358 information for the particular symbol in the production. `$0` is the
359 head of the production, `$1` is the first symbol of the body, etc.
360 The type of `$N` for a terminal symbol is `struct token`. For
361 non-terminal, it is whatever has been declared for that symbol.
363 While building productions we will need to add to an array which needs to
367 static void array_add(void *varray, int *cnt, void *new)
369 void ***array = varray;
372 current = ((*cnt-1) | (step-1))+1;
373 if (*cnt == current) {
376 *array = realloc(*array, current * sizeof(void*));
378 (*array)[*cnt] = new;
382 Collecting the code fragment simply involves reading tokens until we
383 hit the end token `}$`, and noting the character position of the start and
387 static struct text collect_code(struct token_state *state,
392 code.txt = start.txt.txt + start.txt.len;
394 t = token_next(state);
395 while (t.node == start.node &&
396 t.num != TK_close && t.num != TK_error &&
398 if (t.num == TK_close && t.node == start.node)
399 code.len = t.txt.txt - code.txt;
405 Now we have all the bits we need to parse a full production.
410 ###### grammar fields
411 struct production **productions;
412 int production_count;
414 ###### production fields
416 struct symbol **body;
421 int first_production;
424 static char *parse_production(struct grammar *g,
426 struct token_state *state)
428 /* Head has already been parsed. */
431 struct production p, *pp;
433 memset(&p, 0, sizeof(p));
435 tk = token_next(state);
436 while (tk.num == TK_ident || tk.num == TK_mark) {
437 struct symbol *bs = sym_find(g, tk.txt);
438 if (bs->type == Unknown)
440 if (bs->type == Virtual) {
441 err = "Virtual symbol not permitted in production";
444 if (bs->precedence) {
445 p.precedence = bs->precedence;
448 array_add(&p.body, &p.body_size, bs);
449 tk = token_next(state);
451 if (tk.num == TK_virtual) {
453 tk = token_next(state);
454 if (tk.num != TK_ident) {
455 err = "word required after $$";
458 vs = sym_find(g, tk.txt);
459 if (vs->type != Virtual) {
460 err = "symbol after $$ must be virtual";
463 p.precedence = vs->precedence;
465 tk = token_next(state);
467 if (tk.num == TK_open) {
468 p.code = collect_code(state, tk);
469 if (p.code.txt == NULL) {
470 err = "code fragment not closed properly";
473 tk = token_next(state);
475 if (tk.num != TK_newline && tk.num != TK_eof) {
476 err = "stray tokens at end of line";
479 pp = malloc(sizeof(*pp));
481 array_add(&g->productions, &g->production_count, pp);
484 while (tk.num != TK_newline && tk.num != TK_eof)
485 tk = token_next(state);
489 With the ability to parse production and dollar-lines, we have nearly all
490 that we need to parse a grammar from a `code_node`.
492 The head of the first production will effectively be the `start` symbol of
493 the grammar. However it won't _actually_ be so. Processing the grammar is
494 greatly simplified if the real start symbol only has a single production,
495 and expects `$eof` as the final terminal. So when we find the first explicit
496 production we insert an extra production as production zero which looks like
498 ###### Example: production 0
501 where `START` is the first non-terminal given.
503 ###### create production zero
504 struct production *p = calloc(1,sizeof(*p));
505 struct text start = {"$start",6};
506 struct text eof = {"$eof",4};
507 p->head = sym_find(g, start);
508 p->head->type = Nonterminal;
509 array_add(&p->body, &p->body_size, head);
510 array_add(&p->body, &p->body_size, sym_find(g, eof));
511 g->start = p->head->num;
512 p->head->first_production = g->production_count;
513 array_add(&g->productions, &g->production_count, p);
515 Now we are ready to read in the grammar.
517 ###### grammar fields
518 int start; // the 'start' symbol.
521 static struct grammar *grammar_read(struct code_node *code)
523 struct token_config conf = {
526 .words_marks = known,
527 .known_count = sizeof(known)/sizeof(known[0]),
529 .ignored = (1 << TK_line_comment)
530 | (1 << TK_block_comment)
533 | (1 << TK_multi_string)
538 struct token_state *state = token_open(code, &conf);
539 struct token tk = token_next(state);
540 struct symbol *head = NULL;
544 g = calloc(1, sizeof(*g));
547 for (tk = token_next(state); tk.num != TK_eof;
548 tk = token_next(state)) {
549 if (tk.num == TK_newline)
551 if (tk.num == TK_ident) {
553 head = sym_find(g, tk.txt);
554 if (head->type == Nonterminal)
555 err = "This non-terminal has already be used.";
556 else if (head->type == Virtual)
557 err = "Virtual symbol not permitted in head of production";
559 head->type = Nonterminal;
560 head->struct_name = g->current_type;
562 ## create production zero
564 head->first_production = g->production_count;
565 tk = token_next(state);
566 if (tk.num == TK_mark &&
567 text_is(tk.txt, "->"))
568 err = parse_production(g, head, state);
570 err = "'->' missing in production";
572 } else if (tk.num == TK_mark
573 && text_is(tk.txt, "|")) {
574 // another production for same non-term
576 err = parse_production(g, head, state);
578 err = "First production must have a head";
579 } else if (tk.num == TK_mark
580 && text_is(tk.txt, "$")) {
581 err = dollar_line(state, g);
583 err = "Unrecognised token at start of line.";
591 fprintf(stderr, "Error at line %d: %s\n",
598 ## Analysing the grammar
600 The central task in analysing the grammar is to determine a set of
601 states to drive the parsing process. This will require finding
602 various sets of symbols and of "items". Some of these sets will need
603 to attach information to each element in the set, so they are more
606 Each "item" may have a 'look-ahead' or `LA` set associated with
607 it. Many of these will be the same as each other. To avoid memory
608 wastage, and to simplify some comparisons of sets, these sets will be
609 stored in a list of unique sets, each assigned a number.
611 Once we have the data structures in hand to manage these sets and
612 lists, we can start setting the 'nullable' flag, build the 'FIRST'
613 sets, and then create the item sets which define the various states.
617 Though we don't only store symbols in these sets, they are the main
618 things we store, so they are called symbol sets or "symsets".
620 A symset has a size, an array of shorts, and an optional array of data
621 values, which are also shorts. If the array of data is not present,
622 we store an impossible pointer, as `NULL` is used to indicate that no
623 memory has been allocated yet.
628 unsigned short *syms, *data;
630 #define NO_DATA ((unsigned short *)1)
631 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
632 const struct symset INIT_DATASET = { 0, NULL, NULL };
634 The arrays of shorts are allocated in blocks of 8 and are kept sorted
635 by using an insertion sort. We don't explicitly record the amount of
636 allocated space as it can be derived directly from the current `cnt` using
637 `((cnt - 1) | 7) + 1`.
640 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
643 int current = ((s->cnt-1) | 7) + 1;
644 if (current == s->cnt) {
646 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
647 if (s->data != NO_DATA)
648 s->data = realloc(s->data, sizeof(*s->data) * current);
651 while (i > 0 && s->syms[i-1] > key) {
652 s->syms[i] = s->syms[i-1];
653 if (s->data != NO_DATA)
654 s->data[i] = s->data[i-1];
658 if (s->data != NO_DATA)
663 Finding a symbol (or item) in a `symset` uses a simple binary search.
664 We return the index where the value was found (so data can be accessed),
665 or `-1` to indicate failure.
667 static int symset_find(struct symset *ss, unsigned short key)
674 while (lo + 1 < hi) {
675 int mid = (lo + hi) / 2;
676 if (ss->syms[mid] <= key)
681 if (ss->syms[lo] == key)
686 We will often want to form the union of two symsets. When we do, we
687 will often want to know if anything changed (as they might mean there
688 is more work to do). So `symset_union` reports whether anything was
689 added to the first set. We use a slow quadratic approach as these
690 sets don't really get very big. If profiling shows this to be a problem it
691 can be optimised later.
693 static int symset_union(struct symset *a, struct symset *b)
697 for (i = 0; i < b->cnt; i++)
698 if (symset_find(a, b->syms[i]) < 0) {
699 unsigned short data = 0;
700 if (b->data != NO_DATA)
702 symset_add(a, b->syms[i], data);
708 And of course we must be able to free a symset.
710 static void symset_free(struct symset ss)
713 if (ss.data != NO_DATA)
719 Some symsets are simply stored somewhere appropriate in the `grammar`
720 data structure, others need a bit of indirection. This applies
721 particularly to "LA" sets. These will be paired with "items" in an
722 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
723 stored in the `data` field, so we need a mapping from a small (short)
724 number to an LA `symset`.
726 As mentioned earlier this involves creating a list of unique symsets.
728 For now, we just use a linear list sorted by insertion. A skiplist
729 would be more efficient and may be added later.
736 struct setlist *next;
739 ###### grammar fields
740 struct setlist *sets;
745 static int ss_cmp(struct symset a, struct symset b)
748 int diff = a.cnt - b.cnt;
751 for (i = 0; i < a.cnt; i++) {
752 diff = (int)a.syms[i] - (int)b.syms[i];
759 static int save_set(struct grammar *g, struct symset ss)
761 struct setlist **sl = &g->sets;
765 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
772 s = malloc(sizeof(*s));
781 Finding a set by number is currently performed by a simple linear search.
782 If this turns out to hurt performance, we can store the sets in a dynamic
783 array like the productions.
785 static struct symset set_find(struct grammar *g, int num)
787 struct setlist *sl = g->sets;
788 while (sl && sl->num != num)
794 ### Setting `nullable`
796 We set `nullable` on the head symbol for any production for which all
797 the body symbols (if any) are nullable. As this is a recursive
798 definition, any change in the `nullable` setting means that we need to
799 re-evaluate where it needs to be set.
801 We simply loop around performing the same calculations until no more
808 static void set_nullable(struct grammar *g)
811 while (check_again) {
814 for (p = 0; p < g->production_count; p++) {
815 struct production *pr = g->productions[p];
818 if (pr->head->nullable)
820 for (s = 0; s < pr->body_size; s++)
821 if (! pr->body[s]->nullable)
823 if (s == pr->body_size) {
824 pr->head->nullable = 1;
831 ### Building the `first` sets
833 When calculating what can follow a particular non-terminal, we will need to
834 know what the "first" terminal in a subsequent non-terminal might be. So
835 we calculate the `first` set for every non-terminal and store them in an
836 array. We don't bother recording the "first" set for terminals as they are
839 As the "first" for one symbol might depend on the "first" of another,
840 we repeat the calculations until no changes happen, just like with
841 `set_nullable`. This makes use of the fact that `symset_union`
842 reports if any change happens.
844 The core of this which finds the "first" of part of a production body
845 will be reused for computing the "follow" sets, so we split it out
846 into a separate function.
848 ###### grammar fields
849 struct symset *first;
853 static int add_first(struct production *pr, int start,
854 struct symset *target, struct grammar *g,
859 for (s = start; s < pr->body_size; s++) {
860 struct symbol *bs = pr->body[s];
861 if (bs->type == Terminal) {
862 if (symset_find(target, bs->num) < 0) {
863 symset_add(target, bs->num, 0);
867 } else if (symset_union(target, &g->first[bs->num]))
873 *to_end = (s == pr->body_size);
877 static void build_first(struct grammar *g)
881 g->first = calloc(g->num_syms, sizeof(g->first[0]));
882 for (s = 0; s < g->num_syms; s++)
883 g->first[s] = INIT_SYMSET;
885 while (check_again) {
888 for (p = 0; p < g->production_count; p++) {
889 struct production *pr = g->productions[p];
890 struct symset *head = &g->first[pr->head->num];
892 if (add_first(pr, 0, head, g, NULL))
898 ### Building the `follow` sets.
900 There are two different situations when we will want to generate "follow"
901 sets. If we are doing an SLR analysis, we want to generate a single
902 "follow" set for each non-terminal in the grammar. That is what is
903 happening here. If we are doing an LALR or LR analysis we will want
904 to generate a separate "LA" set for each item. We do that later
907 There are two parts to generating a "follow" set. Firstly we look at
908 every place that any non-terminal appears in the body of any
909 production, and we find the set of possible "first" symbols after
910 there. This is done using `add_first` above and only needs to be done
911 once as the "first" sets are now stable and will not change.
915 for (p = 0; p < g->production_count; p++) {
916 struct production *pr = g->productions[p];
919 for (b = 0; b < pr->body_size - 1; b++) {
920 struct symbol *bs = pr->body[b];
921 if (bs->type == Terminal)
923 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
927 The second part is to add the "follow" set of the head of a production
928 to the "follow" sets of the final symbol in the production, and any
929 other symbol which is followed only by `nullable` symbols. As this
930 depends on "follow" itself we need to repeatedly perform the process
931 until no further changes happen.
935 for (again = 0, p = 0;
936 p < g->production_count;
937 p < g->production_count-1
938 ? p++ : again ? (again = 0, p = 0)
940 struct production *pr = g->productions[p];
943 for (b = pr->body_size - 1; b >= 0; b--) {
944 struct symbol *bs = pr->body[b];
945 if (bs->type == Terminal)
947 if (symset_union(&g->follow[bs->num],
948 &g->follow[pr->head->num]))
955 We now just need to create and initialise the `follow` list to get a
958 ###### grammar fields
959 struct symset *follow;
962 static void build_follow(struct grammar *g)
965 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
966 for (s = 0; s < g->num_syms; s++)
967 g->follow[s] = INIT_SYMSET;
971 ### Building itemsets and states
973 There are three different levels of detail that can be chosen for
974 building the itemsets and states for the LR grammar. They are:
976 1. LR(0) or SLR(1), where no look-ahead is considered.
977 2. LALR(1) where we build look-ahead sets with each item and merge
978 the LA sets when we find two paths to the same "kernel" set of items.
979 3. LR(1) where different look-ahead for any item in the code means
980 a different state must be created.
982 ###### forward declarations
983 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
985 We need to be able to look through existing states to see if a newly
986 generated state already exists. For now we use a simple sorted linked
989 An item is a pair of numbers: the production number and the position of
990 "DOT", which is an index into the body of the production.
991 As the numbers are not enormous we can combine them into a single "short"
992 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
993 production number (so 4000 productions with maximum of 15 symbols in the
996 Comparing the itemsets will be a little different to comparing symsets
997 as we want to do the lookup after generating the "kernel" of an
998 itemset, so we need to ignore the offset=zero items which are added during
1001 To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
1002 the list in the symset, and then only compare items before the first "0".
1005 static inline unsigned short item_num(int production, int index)
1007 return production | ((31-index) << 11);
1009 static inline int item_prod(unsigned short item)
1011 return item & 0x7ff;
1013 static inline int item_index(unsigned short item)
1015 return (31-(item >> 11)) & 0x1f;
1018 For LR(1) analysis we need to compare not just the itemset in a state
1019 but also the LA sets. As we assign each unique LA set a number, we
1020 can just compare the symset and the data values together.
1023 static int itemset_cmp(struct symset a, struct symset b,
1024 enum grammar_type type)
1030 i < a.cnt && i < b.cnt &&
1031 item_index(a.syms[i]) > 0 &&
1032 item_index(b.syms[i]) > 0;
1034 int diff = a.syms[i] - b.syms[i];
1038 diff = a.data[i] - b.data[i];
1043 if (i == a.cnt || item_index(a.syms[i]) == 0)
1047 if (i == b.cnt || item_index(b.syms[i]) == 0)
1053 if (type < LR1 || av == -1)
1056 a.data[i] - b.data[i];
1059 And now we can build the list of itemsets. The lookup routine returns
1060 both a success flag and a pointer to where in the list an insert
1061 should happen, so we don't need to search a second time.
1065 struct itemset *next;
1067 struct symset items;
1068 struct symset go_to;
1072 ###### grammar fields
1073 struct itemset *items;
1077 static int itemset_find(struct grammar *g, struct itemset ***where,
1078 struct symset kernel, enum grammar_type type)
1080 struct itemset **ip;
1082 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1083 struct itemset *i = *ip;
1085 diff = itemset_cmp(i->items, kernel, type);
1098 Adding an itemset may require merging the LA sets if LALR analysis is
1099 happening. If any new LA set adds symbols that weren't in the old LA set, we
1100 clear the `completed` flag so that the dependants of this itemset will be
1101 recalculated and their LA sets updated.
1103 `add_itemset` must consume the symsets it is passed, either by adding
1104 them to a data structure, or freeing them.
1106 static int add_itemset(struct grammar *g, struct symset ss,
1107 enum grammar_type type)
1109 struct itemset **where, *is;
1111 int found = itemset_find(g, &where, ss, type);
1113 is = calloc(1, sizeof(*is));
1114 is->state = g->states;
1118 is->go_to = INIT_DATASET;
1127 for (i = 0; i < ss.cnt; i++) {
1128 struct symset temp = INIT_SYMSET, s;
1129 if (ss.data[i] == is->items.data[i])
1131 s = set_find(g, is->items.data[i]);
1132 symset_union(&temp, &s);
1133 s = set_find(g, ss.data[i]);
1134 if (symset_union(&temp, &s)) {
1135 is->items.data[i] = save_set(g, temp);
1146 To build all the itemsets, we first insert the initial itemset made from the
1147 start symbol, complete each itemset, and then generate new itemsets from old
1148 until no new ones can be made.
1150 Completing an itemset means finding all the items where "DOT" is followed by
1151 a nonterminal and adding "DOT=0" items for every production from that
1152 non-terminal - providing each item hasn't already been added.
1154 If LA sets are needed, the LA set for each new item is found using
1155 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1156 be supplemented by the LA set for the item which produces the new item.
1158 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1159 is used in the next stage.
1161 NOTE: precedence handling should happen here - I haven't written this yet
1164 ###### complete itemset
1165 for (i = 0; i < is->items.cnt; i++) {
1166 int p = item_prod(is->items.syms[i]);
1167 int bs = item_index(is->items.syms[i]);
1168 struct production *pr = g->productions[p];
1171 struct symset LA = INIT_SYMSET;
1172 unsigned short sn = 0;
1174 if (bs == pr->body_size)
1177 if (symset_find(&done, s->num) < 0)
1178 symset_add(&done, s->num, 0);
1179 if (s->type != Nonterminal)
1185 add_first(pr, bs+1, &LA, g, &to_end);
1187 struct symset ss = set_find(g, is->items.data[i]);
1188 symset_union(&LA, &ss);
1190 sn = save_set(g, LA);
1191 LA = set_find(g, sn);
1193 /* Add productions for this symbol */
1194 for (p2 = s->first_production;
1195 p2 < g->production_count &&
1196 g->productions[p2]->head == s;
1198 int itm = item_num(p2, 0);
1199 int pos = symset_find(&is->items, itm);
1201 symset_add(&is->items, itm, sn);
1202 /* Will have re-ordered, so start
1203 * from beginning again */
1205 } else if (type >= LALR) {
1206 struct symset ss = set_find(g, is->items.data[pos]);
1207 struct symset tmp = INIT_SYMSET;
1209 symset_union(&tmp, &ss);
1210 if (symset_union(&tmp, &LA)) {
1211 is->items.data[pos] = save_set(g, tmp);
1219 For each symbol we found (and placed in `done`) we collect all the items for
1220 which this symbol is next, and create a new itemset with "DOT" advanced over
1221 the symbol. This is then added to the collection of itemsets (or merged
1222 with a pre-existing itemset).
1224 ###### derive itemsets
1225 // Now we have a completed itemset, so we need to
1226 // create all the 'next' itemset and create them
1227 // if they don't exist.
1228 for (i = 0; i < done.cnt; i++) {
1230 unsigned short state;
1231 struct symset newitemset = INIT_SYMSET;
1233 newitemset = INIT_DATASET;
1235 for (j = 0; j < is->items.cnt; j++) {
1236 int itm = is->items.syms[j];
1237 int p = item_prod(itm);
1238 int bp = item_index(itm);
1239 struct production *pr = g->productions[p];
1240 unsigned short la = 0;
1243 if (bp == pr->body_size)
1245 if (pr->body[bp]->num != done.syms[i])
1248 la = is->items.data[j];
1249 pos = symset_find(&newitemset, pr->head->num);
1251 symset_add(&newitemset, item_num(p, bp+1), la);
1252 else if (type >= LALR) {
1253 // Need to merge la set.
1254 int la2 = newitemset.data[pos];
1256 struct symset ss = set_find(g, la2);
1257 struct symset LA = INIT_SYMSET;
1258 symset_union(&LA, &ss);
1259 ss = set_find(g, la);
1260 if (symset_union(&LA, &ss))
1261 newitemset.data[pos] = save_set(g, LA);
1267 state = add_itemset(g, newitemset, type);
1268 if (symset_find(&is->go_to, done.syms[i]) < 0)
1269 symset_add(&is->go_to, done.syms[i], state);
1272 All that is left is to create the initial itemset from production zero, and
1273 with `TK_eof` as the LA set.
1276 static void build_itemsets(struct grammar *g, enum grammar_type type)
1278 struct symset first = INIT_SYMSET;
1281 unsigned short la = 0;
1283 // LA set just has eof
1284 struct symset eof = INIT_SYMSET;
1285 symset_add(&eof, TK_eof, 0);
1286 la = save_set(g, eof);
1287 first = INIT_DATASET;
1289 // production 0, offset 0 (with no data)
1290 symset_add(&first, item_num(0, 0), la);
1291 add_itemset(g, first, type);
1292 for (again = 0, is = g->items;
1294 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1296 struct symset done = INIT_SYMSET;
1307 ### Completing the analysis.
1309 The exact process of analysis depends on which level was requested. For
1310 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1311 `LR1` we need first, but not follow. For `SLR` we need both.
1313 We don't build the "action" tables that you might expect as the parser
1314 doesn't use them. They will be calculated to some extent if needed for
1317 Once we have built everything we allocate arrays for the two lists:
1318 symbols and itemsets. This allows more efficient access during reporting.
1319 The symbols are grouped as terminals and non-terminals and we record the
1320 changeover point in `first_nonterm`.
1322 ###### grammar fields
1323 struct symbol **symtab;
1324 struct itemset **statetab;
1327 ###### grammar_analyse
// Drive the whole analysis: number the symbols (terminals first,
// recording the terminal/non-terminal boundary in first_nonterm),
// build the symtab index, build the itemsets, then index itemsets by
// state number for fast lookup during reporting and generation.
1329 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1333 int snum = TK_reserved;
// First pass: number the as-yet unnumbered terminals.
1334 for (s = g->syms; s; s = s->next)
1335 if (s->num < 0 && s->type == Terminal) {
1339 g->first_nonterm = snum;
// Second pass numbers the remaining symbols (body not visible here).
1340 for (s = g->syms; s; s = s->next)
1346 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1347 for (s = g->syms; s; s = s->next)
1348 g->symtab[s->num] = s;
1357 build_itemsets(g, type);
// Index the itemset list by state number.
1359 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1360 for (is = g->items; is ; is = is->next)
1361 g->statetab[is->state] = is;
1364 ## Reporting on the Grammar
1366 The purpose of the report is to give the grammar developer insight into
1367 how the grammar parser will work. It is basically a structured dump of
1368 all the tables that have been generated, plus a description of any conflicts.
1370 ###### grammar_report
1371 static int grammar_report(struct grammar *g, enum grammar_type type)
1377 return report_conflicts(g, type);
1380 Firstly we have the complete list of symbols, together with the "FIRST"
1381 set if that was generated.
// List every symbol: nullability flag, number, type character,
// precedence/associativity when set, and (when FIRST sets were
// computed) the FIRST set of each non-terminal.
1385 static void report_symbols(struct grammar *g)
1389 printf("SYMBOLS + FIRST:\n");
1391 printf("SYMBOLS:\n");
1393 for (n = 0; n < g->num_syms; n++) {
1394 struct symbol *s = g->symtab[n];
1398 printf(" %c%3d%c: ",
1399 s->nullable ? '*':' ',
1400 s->num, symtypes[s->type]);
1403 printf(" (%d%s)", s->precedence,
1404 assoc_names[s->assoc]);
1406 if (g->first && s->type == Nonterminal) {
1409 for (j = 0; j < g->first[n].cnt; j++) {
1412 prtxt(g->symtab[g->first[n].syms[j]]->name);
1419 Then we have the follow sets, if they were computed.
// Print the FOLLOW set of every symbol that has a non-empty one.
1421 static void report_follow(struct grammar *g)
1424 printf("FOLLOW:\n");
1425 for (n = 0; n < g->num_syms; n++)
1426 if (g->follow[n].cnt) {
1430 prtxt(g->symtab[n]->name);
1431 for (j = 0; j < g->follow[n].cnt; j++) {
1434 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1440 And finally the item sets. These include the GO TO tables and, for
1441 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1442 it up a bit. First the items, with production number and associativity.
// Print one item as "head -> body" with a dot marking position `dot`,
// plus the production's precedence/associativity where present.
1444 static void report_item(struct grammar *g, int itm)
1446 int p = item_prod(itm);
1447 int dot = item_index(itm);
1448 struct production *pr = g->productions[p];
1452 prtxt(pr->head->name);
1454 for (i = 0; i < pr->body_size; i++) {
1455 printf(" %s", (dot == i ? ". ": ""));
1456 prtxt(pr->body[i]->name);
// Dot at the very end of the body: the item is reducible.
1458 if (dot == pr->body_size)
1462 printf(" (%d%s)", pr->precedence,
1463 assoc_names[pr->assoc]);
1467 The LA sets which are (possibly) reported with each item:
// Print the look-ahead set saved under `lanum` as a list of symbol names.
1469 static void report_la(struct grammar *g, int lanum)
1471 struct symset la = set_find(g, lanum);
1475 printf(" LOOK AHEAD(%d)", lanum);
1476 for (i = 0; i < la.cnt; i++) {
1479 prtxt(g->symtab[la.syms[i]]->name);
1484 Then the go to sets:
// Print a GO TO table: each entry maps a symbol to a destination state.
1487 static void report_goto(struct grammar *g, struct symset gt)
1492 for (i = 0; i < gt.cnt; i++) {
1494 prtxt(g->symtab[gt.syms[i]]->name);
1495 printf(" -> %d\n", gt.data[i]);
1499 Now we can report all the item sets complete with items, LA sets, and GO TO.
// Dump every itemset: its items, each item's LA set when one was
// computed (items.data != NO_DATA), and the state's GO TO table.
1501 static void report_itemsets(struct grammar *g)
1504 printf("ITEM SETS(%d)\n", g->states);
1505 for (s = 0; s < g->states; s++) {
1507 struct itemset *is = g->statetab[s];
1508 printf(" Itemset %d:\n", s);
1509 for (j = 0; j < is->items.cnt; j++) {
1510 report_item(g, is->items.syms[j]);
1511 if (is->items.data != NO_DATA)
1512 report_la(g, is->items.data[j]);
1514 report_goto(g, is->go_to);
1518 ### Reporting conflicts
1520 Conflict detection varies a lot among different analysis levels. However
1521 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1522 LR1 are also similar as they have FOLLOW or LA sets.
1526 ## conflict functions
// Report conflicts for the requested level; per the surrounding text
// conflicts_lr0 serves LR0/LR0.5 and conflicts_slr serves SLR/LALR/LR1
// (the dispatch condition lines are not visible in this extract).
1528 static int report_conflicts(struct grammar *g, enum grammar_type type)
1531 printf("Conflicts:\n");
1533 cnt = conflicts_lr0(g, type);
1535 cnt = conflicts_slr(g, type);
1537 printf(" - no conflicts\n");
1541 LR0 conflicts are any state which has both a reducible item and
1544 LR05 conflicts only occur if two possible reductions exist,
1545 as shifts always over-ride reductions.
1547 ###### conflict functions
// LR0/LR0.5 conflict scan. For each state: LR0 reports a state with
// both a shiftable and a reducible item; both levels report a state
// with two (or more) reducible items.
1548 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1552 for (i = 0; i < g->states; i++) {
1553 struct itemset *is = g->statetab[i];
1554 int last_reduce = -1;
1555 int prev_reduce = -1;
1556 int last_shift = -1;
1560 for (j = 0; j < is->items.cnt; j++) {
1561 int itm = is->items.syms[j];
1562 int p = item_prod(itm);
1563 int bp = item_index(itm);
1564 struct production *pr = g->productions[p];
// Dot at end of body => this item is reducible.
1566 if (bp == pr->body_size) {
1567 prev_reduce = last_reduce;
// Next symbol is a terminal => this item can shift.
1571 if (pr->body[bp]->type == Terminal)
1574 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1575 printf(" State %d has both SHIFT and REDUCE:\n", i);
1576 report_item(g, is->items.syms[last_shift]);
1577 report_item(g, is->items.syms[last_reduce]);
// Two reducible items seen => REDUCE/REDUCE conflict at any level.
1580 if (prev_reduce >= 0) {
1581 printf(" State %d has 2 (or more) reducible items\n", i);
1582 report_item(g, is->items.syms[prev_reduce]);
1583 report_item(g, is->items.syms[last_reduce]);
1590 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1591 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1592 in the source of the look ahead set.
1594 We build a dataset mapping terminal to item for possible SHIFTs and then
1595 another for possible REDUCE operations. We report when we get conflicts
// SLR/LALR/LR1 conflict scan. For each state: first map every
// shiftable terminal to its item, then for each reducible item take
// its look-ahead (FOLLOW of the head for SLR, the stored per-item LA
// set for LALR/LR1) and report any look-ahead terminal that is also
// shiftable (SHIFT/REDUCE) or already claimed by another reduction
// (REDUCE/REDUCE).
1598 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1603 for (i = 0; i < g->states; i++) {
1604 struct itemset *is = g->statetab[i];
1605 struct symset shifts = INIT_DATASET;
1606 struct symset reduce = INIT_DATASET;
1610 /* First collect the shifts */
1611 for (j = 0; j < is->items.cnt; j++) {
1612 unsigned short itm = is->items.syms[j];
1613 int p = item_prod(itm);
1614 int bp = item_index(itm);
1615 struct production *pr = g->productions[p];
1617 if (bp < pr->body_size &&
1618 pr->body[bp]->type == Terminal) {
1620 int sym = pr->body[bp]->num;
1621 if (symset_find(&shifts, sym) < 0)
1622 symset_add(&shifts, sym, itm);
1625 /* Now look for reduction and conflicts */
1626 for (j = 0; j < is->items.cnt; j++) {
1627 unsigned short itm = is->items.syms[j];
1628 int p = item_prod(itm);
1629 int bp = item_index(itm);
1630 struct production *pr = g->productions[p];
// Dot not at end: item is not reducible, skip it.
1632 if (bp < pr->body_size)
// SLR uses FOLLOW of the head; LALR/LR1 use the stored LA set.
1637 la = g->follow[pr->head->num];
1639 la = set_find(g, is->items.data[j]);
1641 for (k = 0; k < la.cnt; k++) {
1642 int pos = symset_find(&shifts, la.syms[k]);
1644 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1645 prtxt(g->symtab[la.syms[k]]->name);
1647 report_item(g, shifts.data[pos]);
1648 report_item(g, itm);
// First reduction on a terminal claims it; a second is a conflict.
1651 pos = symset_find(&reduce, la.syms[k]);
1653 symset_add(&reduce, la.syms[k], itm);
1656 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1657 prtxt(g->symtab[la.syms[k]]->name);
1659 report_item(g, itm);
1660 report_item(g, reduce.data[pos]);
1664 symset_free(shifts);
1665 symset_free(reduce);
1671 ## Generating the parser
1673 The exported part of the parser is the `parse_XX` function, where the name
1674 `XX` is based on the name of the parser files.
1676 This takes a `code_node`, a partially initialized `token_config`, and an
1677 optional `FILE` to send tracing to. The `token_config` gets the list of
1678 known words added and then is used with the `code_node` to initialize the
1681 `parse_XX` then calls the library function `parser_run` to actually complete
1682 the parse. This needs the `states` table and functions to call the various
1683 pieces of code provided in the grammar file, so they are generated first.
1685 ###### parser_generate
// Emit the parse_XX() entry point: fill in the token_config (known
// words, ignored token classes), open the token stream, run the
// table-driven engine via parser_run(), and return its result.
1687 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1693 gen_reduce(f, g, file);
1696 fprintf(f, "#line 0 \"gen_parser\"\n");
1697 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1700 fprintf(f, "\tstruct token_state *tokens;\n");
1701 fprintf(f, "\tconfig->words_marks = known;\n");
1702 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1703 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1704 fprintf(f, "\ttokens = token_open(code, config);\n");
1705 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1706 fprintf(f, "\ttoken_close(tokens);\n");
1707 fprintf(f, "\treturn rv;\n");
1708 fprintf(f, "}\n\n");
1711 ### Table words table
1713 The known words are simply an array of terminal symbols.
1714 The table of nonterminals used for tracing is a similar array.
// Emit known[]: the names of the terminal symbols, in symbol-number
// order from TK_reserved up to the first non-terminal.
1718 static void gen_known(FILE *f, struct grammar *g)
1721 fprintf(f, "#line 0 \"gen_known\"\n");
1722 fprintf(f, "static const char *known[] = {\n");
1723 for (i = TK_reserved;
1724 i < g->num_syms && g->symtab[i]->type == Terminal;
1726 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1727 g->symtab[i]->name.txt);
1728 fprintf(f, "};\n\n");
// Emit non_term[]: the names of the non-terminal symbols, used by the
// engine for tracing.
1731 static void gen_non_term(FILE *f, struct grammar *g)
1734 fprintf(f, "#line 0 \"gen_non_term\"\n");
1735 fprintf(f, "static const char *non_term[] = {\n");
1736 for (i = TK_reserved;
1739 if (g->symtab[i]->type == Nonterminal)
1740 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1741 g->symtab[i]->name.txt);
1742 fprintf(f, "};\n\n");
1745 ### States and the goto tables.
1747 For each state we record the goto table and the reducible production if
1749 Some of the details of the reducible production are stored in the
1750 `do_reduce` function to come later. Here we store the production number,
1751 the body size (useful for stack management) and the resulting symbol (useful
1752 for knowing how to free data later).
1754 The go to table is stored in a simple array of `sym` and corresponding
1757 ###### exported types
1765 const struct lookup * go_to;
// Emit one goto_<state>[] lookup table per state: the (symbol, target
// state) pairs taken from the itemset's go_to set.
1774 static void gen_goto(FILE *f, struct grammar *g)
1777 fprintf(f, "#line 0 \"gen_goto\"\n");
1778 for (i = 0; i < g->states; i++) {
1780 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1782 struct symset gt = g->statetab[i]->go_to;
1783 for (j = 0; j < gt.cnt; j++)
1784 fprintf(f, "\t{ %d, %d },\n",
1785 gt.syms[j], gt.data[j]);
// Emit states[]: for each state the goto-table size and reference plus
// the reducible production (if any) with its body size and head symbol
// number; -1 markers are emitted when the state cannot reduce.
1792 static void gen_states(FILE *f, struct grammar *g)
1795 fprintf(f, "#line 0 \"gen_states\"\n");
1796 fprintf(f, "static const struct state states[] = {\n");
1797 for (i = 0; i < g->states; i++) {
1798 struct itemset *is = g->statetab[i];
1800 for (j = 0; j < is->items.cnt; j++) {
1801 int itm = is->items.syms[j];
1802 int p = item_prod(itm);
1803 int bp = item_index(itm);
1804 struct production *pr = g->productions[p];
// Dot not at end of body: this item shifts, not reduces.
1806 if (bp < pr->body_size)
1808 /* This is what we reduce */
1813 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
1814 i, is->go_to.cnt, i, prod,
1815 prod < 0 ? -1 : g->productions[prod]->body_size,
1816 prod < 0 ? -1 : g->productions[prod]->head->num);
1818 fprintf(f, "};\n\n");
1821 ### The `do_reduce` function and the code
1823 When the parser engine decides to reduce a production, it calls `do_reduce`.
1824 This has two functions.
1826 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1827 production being reduced. Secondly it runs the code that was included with
1828 the production if any.
1830 This code needs to be able to store data somewhere. Rather than requiring
1831 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1832 `do_reduce` return the size to be saved.
1834 The code fragment requires translation when written out. Any `$N` needs to
1835 be converted to a reference either to that buffer (if `$0`) or to the
1836 structure returned by a previous reduction. These pointers need to be cast
1837 to the appropriate type for each access. All this is handled in
// Copy a production's code fragment to the output, translating each
// `$N`: $0 becomes a cast of `ret` to the head's struct; $1..$body_size
// become casts of the matching body[] slot (struct token for terminals,
// the declared struct for typed non-terminals); out-of-range or untyped
// references are emitted literally as "$N".
1843 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1846 fprintf(f, "\t\t\t");
1847 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
// Character after '$' is not a digit (handling not visible here).
1856 if (*c < '0' || *c > '9') {
// Accumulate a multi-digit reference number.
1861 while (c[1] >= '0' && c[1] <= '9') {
1863 n = n * 10 + *c - '0';
1866 fprintf(f, "(*(struct %.*s*)ret)",
1867 p->head->struct_name.len,
1868 p->head->struct_name.txt);
1869 else if (n > p->body_size)
1870 fprintf(f, "$%d", n);
1871 else if (p->body[n-1]->type == Terminal)
1872 fprintf(f, "(*(struct token *)body[%d])",
1874 else if (p->body[n-1]->struct_name.txt == NULL)
1875 fprintf(f, "$%d", n);
1877 fprintf(f, "(*(struct %.*s*)body[%d])",
1878 p->body[n-1]->struct_name.len,
1879 p->body[n-1]->struct_name.txt, n-1);
// Emit do_reduce(): a switch over production numbers that runs each
// production's translated code fragment and returns the size of the
// result structure to save (0 when the head has no declared type).
1886 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1889 fprintf(f, "#line 0 \"gen_reduce\"\n");
1890 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
1891 fprintf(f, " void *ret)\n");
1893 fprintf(f, "\tint ret_size = 0;\n");
1895 fprintf(f, "\tswitch(prod) {\n");
1896 for (i = 0; i < g->production_count; i++) {
1897 struct production *p = g->productions[i];
1898 fprintf(f, "\tcase %d:\n", i);
// Typed head: report sizeof its struct so the engine saves the result.
1903 if (p->head->struct_name.txt)
1904 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1905 p->head->struct_name.len,
1906 p->head->struct_name.txt);
1908 fprintf(f, "\t\tbreak;\n");
1910 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1915 As each non-terminal can potentially cause a different type of data
1916 structure to be allocated and filled in, we need to be able to free it when
1919 It is particularly important to have fine control over freeing during error
1920 recovery where individual stack frames might need to be freed.
1922 For this, the grammar author is required to define a `free_XX` function for
1923 each structure that is used by a non-terminal. `do_free` will call whichever
1924 is appropriate given a symbol number, and will call `free` (as is
1925 appropriate for tokens) on any terminal symbol.
// Emit do_free(): terminals (sym < first_nonterm) are plain tokens and
// just free()d; typed non-terminals dispatch to their free_<struct>()
// via a switch on symbol number; NULL asn is a no-op.
1929 static void gen_free(FILE *f, struct grammar *g)
1933 fprintf(f, "#line 0 \"gen_free\"\n");
1934 fprintf(f, "static void do_free(short sym, void *asn)\n");
1936 fprintf(f, "\tif (!asn) return;\n");
1937 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
1938 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
1939 fprintf(f, "\tswitch(sym) {\n");
1941 for (i = 0; i < g->num_syms; i++) {
1942 struct symbol *s = g->symtab[i];
// Only typed non-terminals get a case (the full skip condition is
// partially missing from this extract).
1944 s->type != Nonterminal ||
1945 s->struct_name.txt == NULL)
1948 fprintf(f, "\tcase %d:\n", s->num);
1949 fprintf(f, "\t\tfree_%.*s(asn);\n",
1951 s->struct_name.txt);
1952 fprintf(f, "\t\tbreak;\n");
1954 fprintf(f, "\t}\n}\n\n");
1957 ## The main routine.
1959 There are three key parts to the "main" function of parsergen: processing
1960 the arguments, loading the grammar file, and dealing with the grammar.
1962 ### Argument processing.
1964 Command line options allow the selection of analysis mode, name of output
1965 file, and whether or not a report should be generated. By default we create
1966 a report only if no code output was requested.
1968 The `parse_XX` function name uses the basename of the output file which
1969 should not have a suffix (`.c`). `.c` is added to the given name for the
1970 code, and `.h` is added for the header (if header text is specified in the
// getopt_long table: one flag per analysis level plus report/output;
// `options` holds the matching short forms ('o' takes an argument).
1977 static const struct option long_options[] = {
1978 { "LR0", 0, NULL, '0' },
1979 { "LR05", 0, NULL, '5' },
1980 { "SLR", 0, NULL, 'S' },
1981 { "LALR", 0, NULL, 'L' },
1982 { "LR1", 0, NULL, '1' },
1983 { "report", 0, NULL, 'R' },
1984 { "output", 1, NULL, 'o' },
1985 { NULL, 0, NULL, 0 }
1987 const char *options = "05SL1Ro:";
1989 ###### process arguments
// Argument processing (literate fragment; several option-case lines
// are not visible in this extract).
1991 char *outfile = NULL;
1995 enum grammar_type type = LR05;
1996 while ((opt = getopt_long(argc, argv, options,
1997 long_options, NULL)) != -1) {
2012 outfile = optarg; break;
2014 fprintf(stderr, "Usage: parsergen ...\n");
2019 infile = argv[optind++];
2021 fprintf(stderr, "No input file given\n");
// Per the text above: report defaults on only when no output requested.
2024 if (outfile && report == 1)
// parse_XX is named after the basename of the output file.
2027 if (name && strchr(name, '/'))
2028 name = strrchr(name, '/')+1;
2030 if (optind < argc) {
2031 fprintf(stderr, "Excess command line arguments\n");
2035 ### Loading the grammar file
2037 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2040 Once we have extracted the code (with `mdcode`) we expect to find three
2041 sections: header, code, and grammar. Anything else is an error.
2043 "header" and "code" are optional, though it is hard to build a working
2044 parser with neither. "grammar" must be provided.
2048 #include <sys/mman.h>
// Error callback for code_extract(): report the message on stderr
// (any error counting is not visible in this extract).
2053 static void pr_err(char *msg)
2056 fprintf(stderr, "%s\n", msg);
// Load and section the input: open + mmap the whole grammar file,
// extract its literate-code sections, and sort them into the three
// expected top-level sections: header, code, grammar.
2060 struct section *table;
2064 fd = open(infile, O_RDONLY);
2066 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2067 infile, strerror(errno));
// lseek to the end yields the length; map the file read-only.
2070 len = lseek(fd, 0, 2);
2071 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2072 table = code_extract(file, file+len, pr_err);
2074 struct code_node *hdr = NULL;
2075 struct code_node *code = NULL;
2076 struct code_node *gram = NULL;
2077 for (s = table; s; s = s->next) {
2078 if (text_is(s->section, "header"))
2080 else if (text_is(s->section, "code"))
2082 else if (text_is(s->section, "grammar"))
// Anything other than the three known sections is an error.
2085 fprintf(stderr, "Unknown content section: %.*s\n",
2086 s->section.len, s->section.txt);
2091 ### Processing the input
2093 As we need to append an extension to a filename and then open it for
2094 writing, and we need to do this twice, it helps to have a separate function.
// Open "<base><ext>" for writing; the fopen/return lines are not
// visible in this extract. Caller owns the returned FILE.
2098 static FILE *open_ext(char *base, char *ext)
2100 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2102 strcat(strcpy(fn, base), ext);
2108 If we can read the grammar, then we analyse and optionally report on it. If
2109 the report finds conflicts we will exit with an error status.
2111 ###### process input
2112 struct grammar *g = NULL;
2114 fprintf(stderr, "No grammar section provided\n");
2117 g = grammar_read(gram);
2119 fprintf(stderr, "Failure to parse grammar\n");
2124 grammar_analyse(g, type);
2126 if (grammar_report(g, type))
2130 If a headers section is defined, we write it out and include a declaration
2131 for the `parse_XX` function so it can be used from separate code.
2133 if (rv == 0 && hdr && outfile) {
2134 FILE *f = open_ext(outfile, ".h");
2136 code_node_print(f, hdr, infile);
2137 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2141 fprintf(stderr, "Cannot create %s.h\n",
2147 And if all goes well and an output file was provided, we create the `.c`
2148 file with the code section (if any) and the parser tables and function.
2150 if (rv == 0 && outfile) {
2151 FILE *f = open_ext(outfile, ".c");
2154 code_node_print(f, code, infile);
2155 gen_parser(f, g, infile, name);
2158 fprintf(stderr, "Cannot create %s.c\n",
2164 And that about wraps it up. We need to set the locale so that UTF-8 is
2165 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2167 ###### File: parsergen.mk
2168 parsergen : parsergen.o libscanner.o libmdcode.o
2169 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2176 int main(int argc, char *argv[])
2181 setlocale(LC_ALL,"");
2183 ## process arguments
2190 ## The SHIFT/REDUCE parser
2192 Having analysed the grammar and generated all the tables, we only need the
2193 shift/reduce engine to bring it all together.
2195 ### Goto table lookup
2197 The parser generator has nicely provided us with goto tables sorted by
2198 symbol number. We need a binary search function to find a symbol in the
2201 ###### parser functions
// Binary search a state's sorted goto table for `sym`; returns the
// target state on a hit (the miss-return line is not visible in this
// extract, but the surrounding text says lookup failure is detectable).
2203 static int search(const struct state *l, int sym)
2206 int hi = l->go_to_cnt;
2210 while (lo + 1 < hi) {
2211 int mid = (lo + hi) / 2;
2212 if (l->go_to[mid].sym <= sym)
2217 if (l->go_to[lo].sym == sym)
2218 return l->go_to[lo].state;
2223 ### The state stack.
2225 The core data structure for the parser is the stack. This tracks all the
2226 symbols that have been recognised or partially recognised.
2228 The stack usually won't grow very large - maybe a few tens of entries. So
2229 we dynamically resize the array as required but never bother to shrink it
2232 We keep the stack as two separate allocations. One, `asn_stack` stores the
2233 "abstract syntax nodes" which are created by each reduction. When we call
2234 `do_reduce` we need to pass an array of the `asn`s of the body of the
2235 production, and by keeping a separate `asn` stack, we can just pass a
2236 pointer into this stack.
2238 The other allocation stores all other stack fields of which there are two.
2239 The `state` is the most important one and guides the parsing process. The
2240 `sym` is nearly unnecessary. However when we want to free entries from the
2241 `asn_stack`, it helps to know what type they are so we can call the right
2242 freeing function. The symbol leads us to the right free function through
2245 ###### parser functions
2260 The operations are needed on the stack - shift (which is like push) and pop.
2262 Shift applies not only to terminals but also to non-terminals. When we
2263 reduce a production we will pop off entries corresponding to the body
2264 symbols, then push on an item for the head of the production. This last is
2265 exactly the same process as shifting in a terminal so we use the same
2268 To simplify other code we arrange for `shift` to fail if there is no `goto`
2269 state for the symbol. This is useful in basic parsing due to our design
2270 that we shift when we can, and reduce when we cannot. So the `shift`
2271 function reports if it could.
2273 So `shift` finds the next state. If that succeeds it extends the allocations
2274 if needed and pushes all the information onto the stacks.
2276 ###### parser functions
// Push (state, sym, asn) onto the parser stacks and move to the goto
// state for `sym`. Per the text above, it fails without pushing when
// the current state has no goto entry (that early return is not
// visible in this extract).
2278 static int shift(struct parser *p,
2280 const struct state states[])
2282 // Push an entry onto the stack
2283 int newstate = search(&states[p->state], sym);
// Grow both stacks in lock-step; they share p->stack_size.
2286 if (p->tos >= p->stack_size) {
2287 p->stack_size += 10;
2288 p->stack = realloc(p->stack, p->stack_size
2289 * sizeof(p->stack[0]));
2290 p->asn_stack = realloc(p->asn_stack, p->stack_size
2291 * sizeof(p->asn_stack[0]));
2293 p->stack[p->tos].state = p->state;
2294 p->stack[p->tos].sym = sym;
2295 p->asn_stack[p->tos] = asn;
2297 p->state = newstate;
2301 `pop` simply moves the top of stack (`tos`) back down the required amount
2302 and frees any `asn` entries that need to be freed. It is called _after_ we
2303 reduce a production, just before we `shift` the nonterminal in.
2305 ###### parser functions
// Drop `num` frames: free each popped asn via do_free, then restore
// the state saved in the new top-of-stack frame.
2307 static void pop(struct parser *p, int num,
2308 void(*do_free)(short sym, void *asn))
2312 for (i = 0; i < num; i++)
2313 do_free(p->stack[p->tos+i].sym,
2314 p->asn_stack[p->tos+i]);
2316 p->state = p->stack[p->tos].state;
2319 ### Memory allocation
2321 The `scanner` returns tokens in a local variable - we want them in allocated
2322 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2323 a reduce is in a large buffer. Both of these require some allocation and
2324 copying, hence `memdup` and `tokcopy`.
2326 ###### parser includes
2329 ###### parser functions
// Copy `len` bytes of `m` into fresh heap memory; the allocation and
// return lines are not visible in this extract.
2331 void *memdup(void *m, int len)
2337 memcpy(ret, m, len);
// Heap-copy a scanner token so it can live on the asn_stack.
2341 static struct token *tok_copy(struct token tk)
2343 struct token *new = malloc(sizeof(*new));
2348 ### The heart of the parser.
2350 Now we have the parser. If we can shift, we do. If not and we can reduce
2351 we do. If the production we reduced was production zero, then we have
2352 accepted the input and can finish.
2354 If we can neither shift nor reduce we have an error to handle. We pop
2355 single entries off the stack until we can shift the `TK_error` symbol, then
2356 drop input tokens until we find one we can shift into the new error state.
2358 We return whatever `asn` was returned by reducing production zero.
2360 ###### parser includes
// The shift/reduce engine: shift the look-ahead when possible,
// otherwise reduce the state's production (result built in `buf` by
// do_reduce, then pop + shift the head symbol), otherwise run error
// recovery. Returns the asn left by reducing production zero.
2363 void *parser_run(struct token_state *tokens,
2364 const struct state states[],
2365 int (*do_reduce)(int, int, void**, void*),
2366 void (*do_free)(short, void*),
2367 FILE *trace, const char *non_term[], int knowns)
2369 struct parser p = { 0 };
2374 tk = tok_copy(token_next(tokens));
2377 parser_trace(trace, &p, tk, states, non_term, knowns);
// Shift consumed the look-ahead: fetch the next token.
2379 if (shift(&p, tk->num, tk, states)) {
2380 tk = tok_copy(token_next(tokens));
2383 if (states[p.state].reduce_prod >= 0) {
2385 int prod = states[p.state].reduce_prod;
2386 int size = states[p.state].reduce_size;
2387 int sym = states[p.state].reduce_sym;
// Fixed scratch buffer for the reduction result; do_reduce reports
// how many bytes of it to keep.
2389 static char buf[16*1024];
2391 body = p.asn_stack +
2392 (p.tos - states[p.state].reduce_size);
2394 bufsize = do_reduce(prod, p.tos, body,
2397 pop(&p, size, do_free);
2398 shift(&p, sym, memdup(buf, bufsize), states);
2403 /* Error. we walk up the stack until we
2404 * find a state which will accept TK_error.
2405 * We then shift in TK_error and see what state
2406 * that takes us to.
2407 * Then we discard input tokens until
2408 * we find one that is acceptable.
2410 while (shift(&p, TK_error, tk, states) == 0
2412 // discard this state
2413 pop(&p, 1, do_free);
2415 while (search(&states[p.state], tk->num) < 0 &&
2416 tk->num != TK_eof) {
2418 tk = tok_copy(token_next(tokens));
// Recovery reached EOF with an empty stack: give up.
2420 if (p.tos == 0 && tk->num == TK_eof)
2425 ret = p.asn_stack[0];
2427 pop(&p, p.tos, do_free);
2433 ###### exported functions
2434 void *parser_run(struct token_state *tokens,
2435 const struct state states[],
2436 int (*do_reduce)(int, int, void**, void*),
2437 void (*do_free)(short, void*),
2438 FILE *trace, const char *non_term[], int knowns);
2442 Being able to visualize the parser in action can be invaluable when
2443 debugging the parser code, or trying to understand how the parse of a
2444 particular grammar progresses. The stack contains all the important
2445 state, so just printing out the stack every time around the parse loop
2446 can make it possible to see exactly what is happening.
2448 This doesn't explicitly show each SHIFT and REDUCE action. However they
2449 are easily deduced from the change between consecutive lines, and the
2450 details of each state can be found by cross referencing the states list
2451 in the stack with the "report" that parsergen can generate.
2453 For terminal symbols, we just dump the token. For non-terminals we
2454 print the name of the symbol. The look ahead token is reported at the
2455 end inside square brackets.
2457 ###### parser functions
// Dump the parse stack to `trace`: each frame as "(state) symbol" —
// terminals show their token text, non-terminals their name — then the
// current state and the look-ahead token in square brackets.
2459 void parser_trace(FILE *trace, struct parser *p,
2460 struct token *tk, const struct state states[],
2461 const char *non_term[], int knowns)
2464 for (i = 0; i < p->tos; i++) {
2465 int sym = p->stack[i].sym;
2466 fprintf(trace, "(%d) ", p->stack[i].state);
// Terminal (reserved token or known word): dump the token text...
2467 if (sym < TK_reserved + knowns) {
2468 struct token *t = p->asn_stack[i];
2469 text_dump(trace, t->txt, 20);
// ...otherwise print the non-terminal's name from non_term[].
2471 fputs(non_term[sym - TK_reserved - knowns],
2475 fprintf(trace, "(%d) [", p->state);
2476 text_dump(trace, tk->txt, 20);
2477 fputs("]\n", trace);
2482 The obvious example for a parser is a calculator.
2484 As `scanner` provides a number parsing function using `libgmp`, it is not
2485 much work to perform arbitrary rational number calculations.
2487 This calculator takes one expression, or an equality test, per line. The
2488 results are printed and if any equality test fails, the program exits with
2491 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2492 better approach, but as the grammar file must have 3 components I need
2493 something like this.
2495 ###### File: parsergen.mk
2496 calc.c : parsergen calc.cgm
2497 ./parsergen -o calc calc.cgm
2498 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2499 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2507 // what do we use for a demo-grammar? A calculator of course.
2521 #include <sys/mman.h>
2526 #include "scanner.h"
2532 static void free_number(struct number *n)
// Demo driver: mmap the file named by argv[1], extract its literate
// code sections, configure the scanner (ignoring comments), and run
// the generated calculator parser, tracing to stderr when a second
// argument is given.
2538 int main(int argc, char *argv[])
2540 int fd = open(argv[1], O_RDONLY);
2541 int len = lseek(fd, 0, 2);
2542 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2543 struct section *s = code_extract(file, file+len, NULL);
2544 struct token_config config = {
2545 .ignored = (1 << TK_line_comment)
2546 | (1 << TK_block_comment)
2549 .number_chars = ".,_+-",
2553 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2561 Session -> Session Line
2564 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2565 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2566 gmp_printf(" or as a decimal: %Fg\n", fl);
2570 | Expression = Expression NEWLINE ${
2571 if (mpq_equal($1.val, $3.val))
2572 gmp_printf("Both equal %Qd\n", $1.val);
2574 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2579 | NEWLINE ${ printf("Blank line\n"); }$
2580 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2583 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2584 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2585 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2587 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2588 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2589 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2591 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2592 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$