1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
46 ###### File: parsergen.mk
49 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
52 ## Reading the grammar
54 The grammar must be stored in a markdown literate code file as parsed
55 by "[mdcode][]". It must have three top level (i.e. unreferenced)
56 sections: `header`, `code`, and `grammar`. The first two will be
57 literally copied into the generated `.c` and `.h` files. The last
58 contains the grammar. This is tokenised with "[scanner][]".
61 [scanner]: scanner.html
67 ###### parser includes
71 The grammar contains production sets, precedence/associativity
72 declarations, and data type declarations. These are all parsed with
73 _ad hoc_ parsing as we don't have a parser generator yet.
75 The precedence and associativity can be set for each production, but
76 can be inherited from symbols. The data type is potentially defined
77 for each non-terminal and describes what C structure is used to store
78 information about each symbol.
81 enum assoc {Left, Right, Non};
82 char *assoc_names[] = {"Left","Right","Non"};
85 struct text struct_name;
87 unsigned short precedence;
91 unsigned short precedence;
99 The strings reported by `mdcode` and `scanner` are `struct text` which have
100 length rather than being null terminated. To help with printing and
101 comparing we define `text_is` and `prtxt`, which should possibly go in
102 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
103 which might contain control characters.
106 static int text_is(struct text t, char *s)
108 return (strlen(s) == t.len &&
109 strncmp(s, t.txt, t.len) == 0);
111 static void prtxt(struct text t)
113 printf("%.*s", t.len, t.txt);
118 Productions are comprised primarily of symbols - terminal and
119 non-terminal. We do not make a syntactic distinction between the two,
120 though non-terminals must be identifiers. Non-terminal symbols are
121 those which appear in the head of a production, terminal symbols are
122 those which don't. There are also "virtual" symbols used for precedence
123 marking discussed later, and sometimes we won't know what type a symbol
126 ###### forward declarations
127 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
128 char *symtypes = "UVTN";
132 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
133 table of known symbols and the resulting parser will report them as
134 `TK_reserved + N`. A small set of identifiers are reserved for the
135 different token types that `scanner` can report.
138 static char *reserved_words[] = {
139 [TK_error] = "ERROR",
140 [TK_number] = "NUMBER",
141 [TK_ident] = "IDENTIFIER",
143 [TK_string] = "STRING",
144 [TK_multi_string] = "MULTI_STRING",
145 [TK_indent] = "INDENT",
146 [TK_undent] = "UNDENT",
147 [TK_newline] = "NEWLINE",
153 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
154 recognised. The former is automatically expected at the end of the text
155 being parsed. The latter are always ignored by the parser.
157 All of these symbols are stored in a simple symbol table. We use the
158 `struct text` from `mdcode` to store the name and link them together
159 into a sorted list using an insertion sort.
161 We don't have separate `find` and `insert` functions as any symbol we
162 find needs to be remembered. We simply expect `find` to always return a
163 symbol, but its type might be `Unknown`.
172 ###### grammar fields
177 static int text_cmp(struct text a, struct text b)
182 int cmp = strncmp(a.txt, b.txt, len);
186 return a.len - b.len;
189 static struct symbol *sym_find(struct grammar *g, struct text s)
191 struct symbol **l = &g->syms;
196 (cmp = text_cmp((*l)->name, s)) < 0)
200 n = calloc(1, sizeof(*n));
209 static void symbols_init(struct grammar *g)
211 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
213 for (i = 0; i < entries; i++) {
216 t.txt = reserved_words[i];
219 t.len = strlen(t.txt);
226 ### Data types and precedence.
228 Data type specification and precedence specification are both
229 introduced by a dollar sign at the start of the line. If the next
230 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
231 precedence, otherwise it specifies a data type.
233 The data type name is simply stored and applied to the head of all
234 subsequent productions. It must be the name of a structure, so `$expr`
235 maps to `struct expr`.
237 The precedence line must contain a list of symbols - typically
238 terminal symbols, but not necessarily. It can only contain symbols
239 that have not been seen yet, so precedence declarations must precede
240 the productions that they affect.
242 A precedence line may also contain a "virtual" symbol which is an
243 identifier preceded by `$$`. Virtual terminals carry precedence
244 information but are not included in the grammar. A production can
245 declare that it inherits the precedence of a given virtual symbol.
247 This use for `$$` precludes it from being used as a symbol in the
248 described language. Two other symbols: `${` and `}$` are also
251 Each new precedence line introduces a new precedence level and
252 declares how it associates. This level is stored in each symbol
253 listed and may be inherited by any production which uses the symbol. A
254 production inherits from the last symbol which has a precedence.
256 ###### grammar fields
257 struct text current_type;
261 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
262 static const char *known[] = { "$$", "${", "}$" };
265 static char *dollar_line(struct token_state *ts, struct grammar *g)
267 struct token t = token_next(ts);
272 if (t.num != TK_ident) {
273 err = "type or assoc expected after '$'";
276 if (text_is(t.txt, "LEFT"))
278 else if (text_is(t.txt, "RIGHT"))
280 else if (text_is(t.txt, "NON"))
283 g->current_type = t.txt;
285 if (t.num != TK_newline) {
286 err = "Extra tokens after type name";
292 // This is a precedence line, need some symbols.
296 while (t.num != TK_newline) {
297 enum symtype type = Terminal;
299 if (t.num == TK_virtual) {
302 if (t.num != TK_ident) {
303 err = "$$ must be followed by a word";
306 } else if (t.num != TK_ident &&
308 err = "Illegal token in precedence line";
311 s = sym_find(g, t.txt);
312 if (s->type != Unknown) {
313 err = "Symbols in precedence line must not already be known.";
317 s->precedence = g->prec_levels;
322 err = "No symbols given on precedence line";
326 while (t.num != TK_newline && t.num != TK_eof)
333 A production either starts with an identifier which is the head
334 non-terminal, or a vertical bar (`|`) in which case this production
335 uses the same head as the previous one. The identifier must be
336 followed by a `->` mark. All productions for a given non-terminal must
337 be grouped together with the `nonterminal ->` given only once.
339 After this a (possibly empty) sequence of identifiers and marks form
340 the body of the production. A virtual symbol may be given after the
341 body by preceding it with `$$`. If a virtual symbol is given, the
342 precedence of the production is that for the virtual symbol. If none
343 is given, the precedence is inherited from the last symbol in the
344 production which has a precedence specified.
346 After the optional precedence may come the `${` mark. This indicates
347 the start of a code fragment. If present, this must be on the same
348 line as the start of the production.
350 All of the text from the `${` through to the matching `}$` is
351 collected and forms the code-fragment for the production. It must all
352 be in one `code_node` of the literate code. The `}$` must be
353 at the end of a line.
355 Text in the code fragment will undergo substitutions where `$N` for
356 some numeric `N` will be replaced with a variable holding the parse
357 information for the particular symbol in the production. `$0` is the
358 head of the production, `$1` is the first symbol of the body, etc.
359 The type of `$N` for a terminal symbol is `struct token`. For
360 non-terminal, it is whatever has been declared for that symbol.
362 While building productions we will need to add to an array which needs to
366 static void array_add(void *varray, int *cnt, void *new)
368 void ***array = varray;
371 current = ((*cnt-1) | (step-1))+1;
372 if (*cnt == current) {
375 *array = realloc(*array, current * sizeof(void*));
377 (*array)[*cnt] = new;
381 Collecting the code fragment simply involves reading tokens until we
382 hit the end token `}$`, and noting the character position of the start and
386 static struct text collect_code(struct token_state *state,
391 code.txt = start.txt.txt + start.txt.len;
393 t = token_next(state);
394 while (t.node == start.node &&
395 t.num != TK_close && t.num != TK_error &&
397 if (t.num == TK_close && t.node == start.node)
398 code.len = t.txt.txt - code.txt;
404 Now we have all the bits we need to parse a full production.
409 ###### grammar fields
410 struct production **productions;
411 int production_count;
413 ###### production fields
415 struct symbol **body;
420 int first_production;
423 static char *parse_production(struct grammar *g,
425 struct token_state *state)
427 /* Head has already been parsed. */
430 struct production p, *pp;
432 memset(&p, 0, sizeof(p));
434 tk = token_next(state);
435 while (tk.num == TK_ident || tk.num == TK_mark) {
436 struct symbol *bs = sym_find(g, tk.txt);
437 if (bs->type == Unknown)
439 if (bs->type == Virtual) {
440 err = "Virtual symbol not permitted in production";
443 if (bs->precedence) {
444 p.precedence = bs->precedence;
447 array_add(&p.body, &p.body_size, bs);
448 tk = token_next(state);
450 if (tk.num == TK_virtual) {
452 tk = token_next(state);
453 if (tk.num != TK_ident) {
454 err = "word required after $$";
457 vs = sym_find(g, tk.txt);
458 if (vs->type != Virtual) {
459 err = "symbol after $$ must be virtual";
462 p.precedence = vs->precedence;
464 tk = token_next(state);
466 if (tk.num == TK_open) {
467 p.code = collect_code(state, tk);
468 if (p.code.txt == NULL) {
469 err = "code fragment not closed properly";
472 tk = token_next(state);
474 if (tk.num != TK_newline && tk.num != TK_eof) {
475 err = "stray tokens at end of line";
478 pp = malloc(sizeof(*pp));
480 array_add(&g->productions, &g->production_count, pp);
483 while (tk.num != TK_newline && tk.num != TK_eof)
484 tk = token_next(state);
488 With the ability to parse production and dollar-lines, we have nearly all
489 that we need to parse a grammar from a `code_node`.
491 The head of the first production will effectively be the `start` symbol of
492 the grammar. However it won't _actually_ be so. Processing the grammar is
493 greatly simplified if the real start symbol only has a single production,
494 and expects `$eof` as the final terminal. So when we find the first explicit
495 production we insert an extra production as production zero which looks like
497 ###### Example: production 0
500 where `START` is the first non-terminal given.
502 ###### create production zero
503 struct production *p = calloc(1,sizeof(*p));
504 struct text start = {"$start",6};
505 struct text eof = {"$eof",4};
506 p->head = sym_find(g, start);
507 p->head->type = Nonterminal;
508 array_add(&p->body, &p->body_size, head);
509 array_add(&p->body, &p->body_size, sym_find(g, eof));
510 g->start = p->head->num;
511 p->head->first_production = g->production_count;
512 array_add(&g->productions, &g->production_count, p);
514 Now we are ready to read in the grammar.
516 ###### grammar fields
517 int start; // the 'start' symbol.
520 static struct grammar *grammar_read(struct code_node *code)
522 struct token_config conf = {
525 .words_marks = known,
526 .known_count = sizeof(known)/sizeof(known[0]),
528 .ignored = (1 << TK_line_comment)
529 | (1 << TK_block_comment)
532 | (1 << TK_multi_string)
537 struct token_state *state = token_open(code, &conf);
538 struct token tk = token_next(state);
539 struct symbol *head = NULL;
543 g = calloc(1, sizeof(*g));
546 for (tk = token_next(state); tk.num != TK_eof;
547 tk = token_next(state)) {
548 if (tk.num == TK_newline)
550 if (tk.num == TK_ident) {
552 head = sym_find(g, tk.txt);
553 if (head->type == Nonterminal)
554 err = "This non-terminal has already be used.";
555 else if (head->type == Virtual)
556 err = "Virtual symbol not permitted in head of production";
558 head->type = Nonterminal;
559 head->struct_name = g->current_type;
561 ## create production zero
563 head->first_production = g->production_count;
564 tk = token_next(state);
565 if (tk.num == TK_mark &&
566 text_is(tk.txt, "->"))
567 err = parse_production(g, head, state);
569 err = "'->' missing in production";
571 } else if (tk.num == TK_mark
572 && text_is(tk.txt, "|")) {
573 // another production for same non-term
575 err = parse_production(g, head, state);
577 err = "First production must have a head";
578 } else if (tk.num == TK_mark
579 && text_is(tk.txt, "$")) {
580 err = dollar_line(state, g);
582 err = "Unrecognised token at start of line.";
590 fprintf(stderr, "Error at line %d: %s\n",
597 ## Analysing the grammar
599 The central task in analysing the grammar is to determine a set of
600 states to drive the parsing process. This will require finding
601 various sets of symbols and of "items". Some of these sets will need
602 to attach information to each element in the set, so they are more
605 Each "item" may have a 'look-ahead' or `LA` set associated with
606 it. Many of these will be the same as each other. To avoid memory
607 wastage, and to simplify some comparisons of sets, these sets will be
608 stored in a list of unique sets, each assigned a number.
610 Once we have the data structures in hand to manage these sets and
611 lists, we can start setting the 'nullable' flag, build the 'FIRST'
612 sets, and then create the item sets which define the various states.
616 Though we don't only store symbols in these sets, they are the main
617 things we store, so they are called symbol sets or "symsets".
619 A symset has a size, an array of shorts, and an optional array of data
620 values, which are also shorts. If the array of data is not present,
621 we store an impossible pointer, as `NULL` is used to indicate that no
622 memory has been allocated yet.
627 unsigned short *syms, *data;
629 #define NO_DATA ((unsigned short *)1)
630 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
631 const struct symset INIT_DATASET = { 0, NULL, NULL };
633 The arrays of shorts are allocated in blocks of 8 and are kept sorted
634 by using an insertion sort. We don't explicitly record the amount of
635 allocated space as it can be derived directly from the current `cnt` using
636 `((cnt - 1) | 7) + 1`.
639 static void symset_add(struct symset *s, int key, int val)
642 int current = ((s->cnt-1) | 7) + 1;
643 if (current == s->cnt) {
645 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
646 if (s->data != NO_DATA)
647 s->data = realloc(s->data, sizeof(*s->data) * current);
650 while (i > 0 && s->syms[i-1] > key) {
651 s->syms[i] = s->syms[i-1];
652 if (s->data != NO_DATA)
653 s->data[i] = s->data[i-1];
657 if (s->data != NO_DATA)
662 Finding a symbol (or item) in a `symset` uses a simple binary search.
663 We return the index where the value was found (so data can be accessed),
664 or `-1` to indicate failure.
666 static int symset_find(struct symset *ss, int key)
673 while (lo + 1 < hi) {
674 int mid = (lo + hi) / 2;
675 if (ss->syms[mid] <= key)
680 if (ss->syms[lo] == key)
685 We will often want to form the union of two symsets. When we do, we
686 will often want to know if anything changed (as they might mean there
687 is more work to do). So `symset_union` reports whether anything was
688 added to the first set. We use a slow quadratic approach as these
689 sets don't really get very big. If profiling shows this to be a problem it
690 can be optimised later.
692 static int symset_union(struct symset *a, struct symset *b)
696 for (i = 0; i < b->cnt; i++)
697 if (symset_find(a, b->syms[i]) < 0) {
699 if (b->data != NO_DATA)
701 symset_add(a, b->syms[i], data);
707 And of course we must be able to free a symset.
709 static void symset_free(struct symset ss)
712 if (ss.data != NO_DATA)
718 Some symsets are simply stored somewhere appropriate in the `grammar`
719 data structure, others need a bit of indirection. This applies
720 particularly to "LA" sets. These will be paired with "items" in an
721 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
722 stored in the `data` field, so we need a mapping from a small (short)
723 number to an LA `symset`.
725 As mentioned earlier this involves creating a list of unique symsets.
727 For now, we just use a linear list sorted by insertion. A skiplist
728 would be more efficient and may be added later.
735 struct setlist *next;
738 ###### grammar fields
739 struct setlist *sets;
744 static int ss_cmp(struct symset a, struct symset b)
747 int diff = a.cnt - b.cnt;
750 for (i = 0; i < a.cnt; i++) {
751 diff = (int)a.syms[i] - (int)b.syms[i];
758 static int save_set(struct grammar *g, struct symset ss)
760 struct setlist **sl = &g->sets;
764 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
771 s = malloc(sizeof(*s));
780 Finding a set by number is currently performed by a simple linear search.
781 If this turns out to hurt performance, we can store the sets in a dynamic
782 array like the productions.
784 static struct symset set_find(struct grammar *g, int num)
786 struct setlist *sl = g->sets;
787 while (sl && sl->num != num)
793 ### Setting `nullable`
795 We set `nullable` on the head symbol for any production for which all
796 the body symbols (if any) are nullable. As this is a recursive
797 definition, any change in the `nullable` setting means that we need to
798 re-evaluate where it needs to be set.
800 We simply loop around performing the same calculations until no more
807 static void set_nullable(struct grammar *g)
810 while (check_again) {
813 for (p = 0; p < g->production_count; p++) {
814 struct production *pr = g->productions[p];
817 if (pr->head->nullable)
819 for (s = 0; s < pr->body_size; s++)
820 if (! pr->body[s]->nullable)
822 if (s == pr->body_size) {
823 pr->head->nullable = 1;
830 ### Building the `first` sets
832 When calculating what can follow a particular non-terminal, we will need to
833 know what the "first" terminal in a subsequent non-terminal might be. So
834 we calculate the `first` set for every non-terminal and store them in an
835 array. We don't bother recording the "first" set for terminals as they are
838 As the "first" for one symbol might depend on the "first" of another,
839 we repeat the calculations until no changes happen, just like with
840 `set_nullable`. This makes use of the fact that `symset_union`
841 reports if any change happens.
843 The core of this which finds the "first" of part of a production body
844 will be reused for computing the "follow" sets, so we split it out
845 into a separate function.
847 ###### grammar fields
848 struct symset *first;
852 static int add_first(struct production *pr, int start,
853 struct symset *target, struct grammar *g,
858 for (s = start; s < pr->body_size; s++) {
859 struct symbol *bs = pr->body[s];
860 if (bs->type == Terminal) {
861 if (symset_find(target, bs->num) < 0) {
862 symset_add(target, bs->num, 0);
866 } else if (symset_union(target, &g->first[bs->num]))
872 *to_end = (s == pr->body_size);
876 static void build_first(struct grammar *g)
880 g->first = calloc(g->num_syms, sizeof(g->first[0]));
881 for (s = 0; s < g->num_syms; s++)
882 g->first[s] = INIT_SYMSET;
884 while (check_again) {
887 for (p = 0; p < g->production_count; p++) {
888 struct production *pr = g->productions[p];
889 struct symset *head = &g->first[pr->head->num];
891 if (add_first(pr, 0, head, g, NULL))
897 ### Building the `follow` sets.
899 There are two different situations when we will want to generate "follow"
900 sets. If we are doing an SLR analysis, we want to generate a single
901 "follow" set for each non-terminal in the grammar. That is what is
902 happening here. If we are doing an LALR or LR analysis we will want
903 to generate a separate "LA" set for each item. We do that later
906 There are two parts to generating a "follow" set. Firstly we look at
907 every place that any non-terminal appears in the body of any
908 production, and we find the set of possible "first" symbols after
909 there. This is done using `add_first` above and only needs to be done
910 once as the "first" sets are now stable and will not change.
914 for (p = 0; p < g->production_count; p++) {
915 struct production *pr = g->productions[p];
918 for (b = 0; b < pr->body_size - 1; b++) {
919 struct symbol *bs = pr->body[b];
920 if (bs->type == Terminal)
922 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
926 The second part is to add the "follow" set of the head of a production
927 to the "follow" sets of the final symbol in the production, and any
928 other symbol which is followed only by `nullable` symbols. As this
929 depends on "follow" itself we need to repeatedly perform the process
930 until no further changes happen.
934 for (again = 0, p = 0;
935 p < g->production_count;
936 p < g->production_count-1
937 ? p++ : again ? (again = 0, p = 0)
939 struct production *pr = g->productions[p];
942 for (b = pr->body_size - 1; b >= 0; b--) {
943 struct symbol *bs = pr->body[b];
944 if (bs->type == Terminal)
946 if (symset_union(&g->follow[bs->num],
947 &g->follow[pr->head->num]))
954 We now just need to create and initialise the `follow` list to get a
957 ###### grammar fields
958 struct symset *follow;
961 static void build_follow(struct grammar *g)
964 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
965 for (s = 0; s < g->num_syms; s++)
966 g->follow[s] = INIT_SYMSET;
970 ### Building itemsets and states
972 There are three different levels of detail that can be chosen for
973 building the itemsets and states for the LR grammar. They are:
975 1. LR(0) or SLR(1), where no look-ahead is considered.
976 2. LALR(1) where we build look-ahead sets with each item and merge
977 the LA sets when we find two paths to the same "kernel" set of items.
978 3. LR(1) where different look-ahead for any item in the code means
979 a different state must be created.
981 ###### forward declarations
982 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
984 We need to be able to look through existing states to see if a newly
985 generated state already exists. For now we use a simple sorted linked
988 An item is a pair of numbers: the production number and the position of
989 "DOT", which is an index into the body of the production.
990 As the numbers are not enormous we can combine them into a single "short"
991 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
992 production number (so 4000 productions with maximum of 15 symbols in the
995 Comparing the itemsets will be a little different to comparing symsets
996 as we want to do the lookup after generating the "kernel" of an
997 itemset, so we need to ignore the offset=zero items which are added during
1000 To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
1001 the list in the symset, and then only compare items before the first "0".
1004 static inline unsigned short item_num(int production, int index)
1006 return production | (((index-1)&0x1f) << 11);
1008 static inline int item_prod(unsigned short item)
1010 return item & 0x7ff;
1012 static inline int item_index(unsigned short item)
1014 return ((item >> 11)+1) & 0x1f;
1017 For LR(1) analysis we need to compare not just the itemset in a state
1018 but also the LA sets. As we assign each unique LA set a number, we
1019 can just compare the symset and the data values together.
1022 static int itemset_cmp(struct symset a, struct symset b,
1023 enum grammar_type type)
1029 i < a.cnt && i < b.cnt &&
1030 item_index(a.syms[i]) > 0 &&
1031 item_index(b.syms[i]) > 0;
1033 int diff = a.syms[i] - b.syms[i];
1037 diff = a.data[i] - b.data[i];
1042 if (i == a.cnt || item_index(a.syms[i]) == 0)
1046 if (i == b.cnt || item_index(b.syms[i]) == 0)
1052 if (type < LR1 || av == -1)
1055 a.data[i] - b.data[i];
1058 And now we can build the list of itemsets. The lookup routine returns
1059 both a success flag and a pointer to where in the list an insert
1060 should happen, so we don't need to search a second time.
1064 struct itemset *next;
1066 struct symset items;
1067 struct symset go_to;
1071 ###### grammar fields
1072 struct itemset *items;
1076 static int itemset_find(struct grammar *g, struct itemset ***where,
1077 struct symset kernel, enum grammar_type type)
1079 struct itemset **ip;
1081 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1082 struct itemset *i = *ip;
1084 diff = itemset_cmp(i->items, kernel, type);
1097 Adding an itemset may require merging the LA sets if LALR analysis is
1098 happening. If any new LA set adds symbols that weren't in the old LA set, we
1099 clear the `completed` flag so that the dependants of this itemset will be
1100 recalculated and their LA sets updated.
1102 `add_itemset` must consume the symsets it is passed, either by adding
1103 them to a data structure, or freeing them.
1105 static int add_itemset(struct grammar *g, struct symset ss,
1106 enum grammar_type type)
1108 struct itemset **where, *is;
1110 int found = itemset_find(g, &where, ss, type);
1112 is = calloc(1, sizeof(*is));
1113 is->state = g->states;
1117 is->go_to = INIT_DATASET;
1126 for (i = 0; i < ss.cnt; i++) {
1127 struct symset temp = INIT_SYMSET, s;
1128 if (ss.data[i] == is->items.data[i])
1130 s = set_find(g, is->items.data[i]);
1131 symset_union(&temp, &s);
1132 s = set_find(g, ss.data[i]);
1133 if (symset_union(&temp, &s)) {
1134 is->items.data[i] = save_set(g, temp);
1145 To build all the itemsets, we first insert the initial itemset made from the
1146 start symbol, complete each itemset, and then generate new itemsets from old
1147 until no new ones can be made.
1149 Completing an itemset means finding all the items where "DOT" is followed by
1150 a nonterminal and adding "DOT=0" items for every production from that
1151 non-terminal - providing each item hasn't already been added.
1153 If LA sets are needed, the LA set for each new item is found using
1154 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1155 be supplemented by the LA set for the item which produces the new item.
1157 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1158 is used in the next stage.
1160 NOTE: precedence handling should happen here - I haven't written this yet
1163 ###### complete itemset
1164 for (i = 0; i < is->items.cnt; i++) {
1165 int p = item_prod(is->items.syms[i]);
1166 int bs = item_index(is->items.syms[i]);
1167 struct production *pr = g->productions[p];
1170 struct symset LA = INIT_SYMSET;
1173 if (bs == pr->body_size)
1176 if (symset_find(&done, s->num) < 0)
1177 symset_add(&done, s->num, 0);
1178 if (s->type != Nonterminal)
1184 add_first(pr, bs+1, &LA, g, &to_end);
1186 struct symset ss = set_find(g, is->items.data[i]);
1187 symset_union(&LA, &ss);
1189 sn = save_set(g, LA);
1190 LA = set_find(g, sn);
1192 /* Add productions for this symbol */
1193 for (p2 = s->first_production;
1194 p2 < g->production_count &&
1195 g->productions[p2]->head == s;
1197 int itm = item_num(p2, 0);
1198 int pos = symset_find(&is->items, itm);
1200 symset_add(&is->items, itm, sn);
1201 /* Will have re-ordered, so start
1202 * from beginning again */
1204 } else if (type >= LALR) {
1205 struct symset ss = set_find(g, is->items.data[pos]);
1206 struct symset tmp = INIT_SYMSET;
1208 symset_union(&tmp, &ss);
1209 if (symset_union(&tmp, &LA)) {
1210 is->items.data[pos] = save_set(g, tmp);
1218 For each symbol we found (and placed in `done`) we collect all the items for
1219 which this symbol is next, and create a new itemset with "DOT" advanced over
1220 the symbol. This is then added to the collection of itemsets (or merged
1221 with a pre-existing itemset).
1223 ###### derive itemsets
1224 // Now we have a completed itemset, so we need to
1225 // create all the 'next' itemset and create them
1226 // if they don't exist.
1227 for (i = 0; i < done.cnt; i++) {
1230 struct symset newitemset = INIT_SYMSET;
1232 newitemset = INIT_DATASET;
1234 for (j = 0; j < is->items.cnt; j++) {
1235 int itm = is->items.syms[j];
1236 int p = item_prod(itm);
1237 int bp = item_index(itm);
1238 struct production *pr = g->productions[p];
1242 if (bp == pr->body_size)
1244 if (pr->body[bp]->num != done.syms[i])
1247 la = is->items.data[j];
1248 pos = symset_find(&newitemset, pr->head->num);
1250 symset_add(&newitemset, item_num(p, bp+1), la);
1251 else if (type >= LALR) {
1252 // Need to merge la set.
1253 int la2 = newitemset.data[pos];
1255 struct symset ss = set_find(g, la2);
1256 struct symset LA = INIT_SYMSET;
1257 symset_union(&LA, &ss);
1258 ss = set_find(g, la);
1259 if (symset_union(&LA, &ss))
1260 newitemset.data[pos] = save_set(g, LA);
1266 state = add_itemset(g, newitemset, type);
1267 if (symset_find(&is->go_to, done.syms[i]) < 0)
1268 symset_add(&is->go_to, done.syms[i], state);
1271 All that is left is to create the initial itemset from production zero, and
1272 with `TK_eof` as the LA set.
1275 static void build_itemsets(struct grammar *g, enum grammar_type type)
1277 struct symset first = INIT_SYMSET;
1282 // LA set just has eof
1283 struct symset eof = INIT_SYMSET;
1284 symset_add(&eof, TK_eof, 0);
1285 la = save_set(g, eof);
1286 first = INIT_DATASET;
1288 // production 0, offset 0 (with no data)
1289 symset_add(&first, item_num(0, 0), la);
1290 add_itemset(g, first, type);
1291 for (again = 0, is = g->items;
1293 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1295 struct symset done = INIT_SYMSET;
1305 ### Completing the analysis.
1307 The exact process of analysis depends on which level was requested. For
1308 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1309 `LR1` we need first, but not follow. For `SLR` we need both.
1311 We don't build the "action" tables that you might expect as the parser
1312 doesn't use them. They will be calculated to some extent if needed for
1315 Once we have built everything we allocate arrays for the two lists:
1316 symbols and itemsets. This allows more efficient access during reporting.
1317 The symbols are grouped as terminals and non-terminals and we record the
1318 changeover point in `first_nonterm`.
1320 ###### grammar fields
1321 struct symbol **symtab;
1322 struct itemset **statetab;
1325 ###### grammar_analyse
1327 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1331 int snum = TK_reserved;
1332 for (s = g->syms; s; s = s->next)
1333 if (s->num < 0 && s->type == Terminal) {
1337 g->first_nonterm = snum;
1338 for (s = g->syms; s; s = s->next)
1344 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1345 for (s = g->syms; s; s = s->next)
1346 g->symtab[s->num] = s;
1355 build_itemsets(g, type);
1357 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1358 for (is = g->items; is ; is = is->next)
1359 g->statetab[is->state] = is;
1362 ## Reporting on the Grammar
1364 The purpose of the report is to give the grammar developer insight into
1365 how the grammar parser will work. It is basically a structured dump of
1366 all the tables that have been generated, plus a description of any conflicts.
1368 ###### grammar_report
1369 static int grammar_report(struct grammar *g, enum grammar_type type)
1375 return report_conflicts(g, type);
1378 Firstly we have the complete list of symbols, together with the "FIRST"
1379 set if that was generated.
1383 static void report_symbols(struct grammar *g)
1387 printf("SYMBOLS + FIRST:\n");
1389 printf("SYMBOLS:\n");
1391 for (n = 0; n < g->num_syms; n++) {
1392 struct symbol *s = g->symtab[n];
1396 printf(" %c%3d%c: ",
1397 s->nullable ? '*':' ',
1398 s->num, symtypes[s->type]);
1401 printf(" (%d%s)", s->precedence,
1402 assoc_names[s->assoc]);
1404 if (g->first && s->type == Nonterminal) {
1407 for (j = 0; j < g->first[n].cnt; j++) {
1410 prtxt(g->symtab[g->first[n].syms[j]]->name);
1417 Then we have the follow sets if they were computed.
1419 static void report_follow(struct grammar *g)
1422 printf("FOLLOW:\n");
1423 for (n = 0; n < g->num_syms; n++)
1424 if (g->follow[n].cnt) {
1428 prtxt(g->symtab[n]->name);
1429 for (j = 0; j < g->follow[n].cnt; j++) {
1432 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1438 And finally the item sets. These include the GO TO tables and, for
1439 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1440 it up a bit. First the items, with production number and associativity.
1442 static void report_item(struct grammar *g, int itm)
1444 int p = item_prod(itm);
1445 int dot = item_index(itm);
1446 struct production *pr = g->productions[p];
1450 prtxt(pr->head->name);
1452 for (i = 0; i < pr->body_size; i++) {
1453 printf(" %s", (dot == i ? ". ": ""));
1454 prtxt(pr->body[i]->name);
1456 if (dot == pr->body_size)
1460 printf(" (%d%s)", pr->precedence,
1461 assoc_names[pr->assoc]);
1465 The LA sets which are (possibly) reported with each item:
1467 static void report_la(struct grammar *g, int lanum)
1469 struct symset la = set_find(g, lanum);
1473 printf(" LOOK AHEAD(%d)", lanum);
1474 for (i = 0; i < la.cnt; i++) {
1477 prtxt(g->symtab[la.syms[i]]->name);
1482 Then the go to sets:
1485 static void report_goto(struct grammar *g, struct symset gt)
1490 for (i = 0; i < gt.cnt; i++) {
1492 prtxt(g->symtab[gt.syms[i]]->name);
1493 printf(" -> %d\n", gt.data[i]);
1497 Now we can report all the item sets complete with items, LA sets, and GO TO.
1499 static void report_itemsets(struct grammar *g)
1502 printf("ITEM SETS(%d)\n", g->states);
1503 for (s = 0; s < g->states; s++) {
1505 struct itemset *is = g->statetab[s];
1506 printf(" Itemset %d:\n", s);
1507 for (j = 0; j < is->items.cnt; j++) {
1508 report_item(g, is->items.syms[j]);
1509 if (is->items.data != NO_DATA)
1510 report_la(g, is->items.data[j]);
1512 report_goto(g, is->go_to);
1516 ### Reporting conflicts
1518 Conflict detection varies a lot among different analysis levels. However
1519 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1520 LR1 are also similar as they have FOLLOW or LA sets.
1524 ## conflict functions
1526 static int report_conflicts(struct grammar *g, enum grammar_type type)
1529 printf("Conflicts:\n");
1531 cnt = conflicts_lr0(g, type);
1533 cnt = conflicts_slr(g, type);
1535 printf(" - no conflicts\n");
1539 LR0 conflicts are any state which has both a reducible item and
1542 LR05 conflicts only occur if two possible reductions exist,
1543 as shifts always over-ride reductions.
1545 ###### conflict functions
1546 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1550 for (i = 0; i < g->states; i++) {
1551 struct itemset *is = g->statetab[i];
1552 int last_reduce = -1;
1553 int prev_reduce = -1;
1554 int last_shift = -1;
1558 for (j = 0; j < is->items.cnt; j++) {
1559 int itm = is->items.syms[j];
1560 int p = item_prod(itm);
1561 int bp = item_index(itm);
1562 struct production *pr = g->productions[p];
1564 if (bp == pr->body_size) {
1565 prev_reduce = last_reduce;
1569 if (pr->body[bp]->type == Terminal)
1572 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1573 printf(" State %d has both SHIFT and REDUCE:\n", i);
1574 report_item(g, is->items.syms[last_shift]);
1575 report_item(g, is->items.syms[last_reduce]);
1578 if (prev_reduce >= 0) {
1579 printf(" State %d has 2 (or more) reducible items\n", i);
1580 report_item(g, is->items.syms[prev_reduce]);
1581 report_item(g, is->items.syms[last_reduce]);
1588 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1589 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1590 in the source of the look ahead set.
1592 We build a dataset mapping terminal to item for possible SHIFTs and then
1593 another for possible REDUCE operations. We report when we get conflicts
1596 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1601 for (i = 0; i < g->states; i++) {
1602 struct itemset *is = g->statetab[i];
1603 struct symset shifts = INIT_DATASET;
1604 struct symset reduce = INIT_DATASET;
1608 /* First collect the shifts */
1609 for (j = 0; j < is->items.cnt; j++) {
1610 int itm = is->items.syms[j];
1611 int p = item_prod(itm);
1612 int bp = item_index(itm);
1613 struct production *pr = g->productions[p];
1615 if (bp < pr->body_size &&
1616 pr->body[bp]->type == Terminal) {
1618 int sym = pr->body[bp]->num;
1619 if (symset_find(&shifts, sym) < 0)
1620 symset_add(&shifts, sym, itm);
1623 /* Now look for reduction and conflicts */
1624 for (j = 0; j < is->items.cnt; j++) {
1625 int itm = is->items.syms[j];
1626 int p = item_prod(itm);
1627 int bp = item_index(itm);
1628 struct production *pr = g->productions[p];
1630 if (bp < pr->body_size)
1635 la = g->follow[pr->head->num];
1637 la = set_find(g, is->items.data[j]);
1639 for (k = 0; k < la.cnt; k++) {
1640 int pos = symset_find(&shifts, la.syms[k]);
1642 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1643 prtxt(g->symtab[la.syms[k]]->name);
1645 report_item(g, shifts.data[pos]);
1646 report_item(g, itm);
1649 pos = symset_find(&reduce, la.syms[k]);
1651 symset_add(&reduce, la.syms[k], itm);
1654 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1655 prtxt(g->symtab[la.syms[k]]->name);
1657 report_item(g, itm);
1658 report_item(g, reduce.data[pos]);
1662 symset_free(shifts);
1663 symset_free(reduce);
1669 ## Generating the parser
1671 The exported part of the parser is the `parse_XX` function, where the name
1672 `XX` is based on the name of the parser files.
1674 This takes a `code_node`, a partially initialized `token_config`, and an
1675 optional `FILE` to send tracing to. The `token_config` gets the list of
1676 known words added and then is used with the `code_node` to initialize the
1679 `parse_XX` then calls the library function `parser_run` to actually complete
1680 the parse. This needs the `states` table and function to call the various
1681 pieces of code provided in the grammar file, so they are generated first.
1683 ###### parser_generate
1685 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1690 gen_reduce(f, g, file);
1693 fprintf(f, "#line 0 \"gen_parser\"\n");
1694 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1697 fprintf(f, "\tstruct token_state *tokens;\n");
1698 fprintf(f, "\tconfig->words_marks = known;\n");
1699 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1700 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1701 fprintf(f, "\ttokens = token_open(code, config);\n");
1702 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
1703 fprintf(f, "\ttoken_close(tokens);\n");
1704 fprintf(f, "\treturn rv;\n");
1705 fprintf(f, "}\n\n");
1708 ### Table words table
1710 The known words table is simply an array of terminal symbols.
1714 static void gen_known(FILE *f, struct grammar *g)
1717 fprintf(f, "#line 0 \"gen_known\"\n");
1718 fprintf(f, "static const char *known[] = {\n");
1719 for (i = TK_reserved;
1720 i < g->num_syms && g->symtab[i]->type == Terminal;
1722 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1723 g->symtab[i]->name.txt);
1724 fprintf(f, "};\n\n");
1727 ### States and the goto tables.
1729 For each state we record the goto table and the reducible production if
1731 Some of the details of the reducible production are stored in the
1732 `do_reduce` function to come later. Here we store the production number,
1733 the body size (useful for stack management) and the resulting symbol (useful
1734 for knowing how to free data later).
1736 The go to table is stored in a simple array of `sym` and corresponding
1739 ###### exported types
1747 const struct lookup * go_to;
1756 static void gen_goto(FILE *f, struct grammar *g)
1759 fprintf(f, "#line 0 \"gen_goto\"\n");
1760 for (i = 0; i < g->states; i++) {
1762 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1764 struct symset gt = g->statetab[i]->go_to;
1765 for (j = 0; j < gt.cnt; j++)
1766 fprintf(f, "\t{ %d, %d },\n",
1767 gt.syms[j], gt.data[j]);
1774 static void gen_states(FILE *f, struct grammar *g)
1777 fprintf(f, "#line 0 \"gen_states\"\n");
1778 fprintf(f, "static const struct state states[] = {\n");
1779 for (i = 0; i < g->states; i++) {
1780 struct itemset *is = g->statetab[i];
1782 for (j = 0; j < is->items.cnt; j++) {
1783 int itm = is->items.syms[j];
1784 int p = item_prod(itm);
1785 int bp = item_index(itm);
1786 struct production *pr = g->productions[p];
1788 if (bp < pr->body_size)
1790 /* This is what we reduce */
1795 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
1796 i, is->go_to.cnt, i, prod,
1797 prod < 0 ? -1 : g->productions[prod]->body_size,
1798 prod < 0 ? -1 : g->productions[prod]->head->num);
1800 fprintf(f, "};\n\n");
1803 ### The `do_reduce` function and the code
1805 When the parser engine decides to reduce a production, it calls `do_reduce`.
1806 This has two functions.
1808 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1809 production being reduced. Secondly it runs the code that was included with
1810 the production if any.
1812 This code needs to be able to store data somewhere. Rather than requiring
1813 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1814 `do_reduce` return the size to be saved.
1816 The code fragment requires translation when written out. Any `$N` needs to
1817 be converted to a reference either to that buffer (if `$0`) or to the
1818 structure returned by a previous reduction. These pointers need to be cast
1819 to the appropriate type for each access. All this is handled in
1825 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1828 fprintf(f, "\t\t\t");
1829 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1838 if (*c < '0' || *c > '9') {
1843 while (c[1] >= '0' && c[1] <= '9') {
1845 n = n * 10 + *c - '0';
1848 fprintf(f, "(*(struct %.*s*)ret)",
1849 p->head->struct_name.len,
1850 p->head->struct_name.txt);
1851 else if (n > p->body_size)
1852 fprintf(f, "$%d", n);
1853 else if (p->body[n-1]->type == Terminal)
1854 fprintf(f, "(*(struct token *)body[%d])",
1856 else if (p->body[n-1]->struct_name.txt == NULL)
1857 fprintf(f, "$%d", n);
1859 fprintf(f, "(*(struct %.*s*)body[%d])",
1860 p->body[n-1]->struct_name.len,
1861 p->body[n-1]->struct_name.txt, n-1);
1868 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1871 fprintf(f, "#line 0 \"gen_reduce\"\n");
1872 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
1873 fprintf(f, " void *ret, FILE *trace)\n");
1875 fprintf(f, "\tint ret_size = 0;\n");
1877 fprintf(f, "\tswitch(prod) {\n");
1878 for (i = 0; i < g->production_count; i++) {
1879 struct production *p = g->productions[i];
1880 fprintf(f, "\tcase %d:\n", i);
1885 fprintf(f, "\t\tif (trace) {\n");
1886 fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
1887 p->head->name.len, p->head->name.txt);
1888 for (j = 0; j < p->body_size; j++) {
1889 if (p->body[j]->type == Terminal) {
1890 fputs("\t\t\tfputs(\" \", trace);\n", f);
1891 fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
1893 fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
1894 p->body[j]->name.len,
1895 p->body[j]->name.txt);
1898 fprintf(f, "\t\t}\n");
1900 if (p->head->struct_name.txt)
1901 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1902 p->head->struct_name.len,
1903 p->head->struct_name.txt);
1905 fprintf(f, "\t\tbreak;\n");
1907 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1912 As each non-terminal can potentially cause a different type of data
1913 structure to be allocated and filled in, we need to be able to free it when
1916 It is particularly important to have fine control over freeing during error
1917 recovery where individual stack frames might need to be freed.
1919 For this, the grammar author is required to define a `free_XX` function for
1920 each structure that is used by a non-terminal. `do_free` will call whichever
1921 is appropriate given a symbol number, and will call `free` (as is
1922 appropriate for tokens) on any terminal symbol.
1926 static void gen_free(FILE *f, struct grammar *g)
1930 fprintf(f, "#line 0 \"gen_free\"\n");
1931 fprintf(f, "static void do_free(short sym, void *asn)\n");
1933 fprintf(f, "\tif (!asn) return;\n");
1934 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
1935 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
1936 fprintf(f, "\tswitch(sym) {\n");
1938 for (i = 0; i < g->num_syms; i++) {
1939 struct symbol *s = g->symtab[i];
1941 s->type != Nonterminal ||
1942 s->struct_name.txt == NULL)
1945 fprintf(f, "\tcase %d:\n", s->num);
1946 fprintf(f, "\t\tfree_%.*s(asn);\n",
1948 s->struct_name.txt);
1949 fprintf(f, "\t\tbreak;\n");
1951 fprintf(f, "\t}\n}\n\n");
1954 ## The main routine.
1956 There are three key parts to the "main" function of parsergen: processing
1957 the arguments, loading the grammar file, and dealing with the grammar.
1959 ### Argument processing.
1961 Command line options allow the selection of analysis mode, name of output
1962 file, and whether or not a report should be generated. By default we create
1963 a report only if no code output was requested.
1965 The `parse_XX` function name uses the basename of the output file which
1966 should not have a suffix (`.c`). `.c` is added to the given name for the
1967 code, and `.h` is added for the header (if header text is specified in the
1974 static const struct option long_options[] = {
1975 { "LR0", 0, NULL, '0' },
1976 { "LR05", 0, NULL, '5' },
1977 { "SLR", 0, NULL, 'S' },
1978 { "LALR", 0, NULL, 'L' },
1979 { "LR1", 0, NULL, '1' },
1980 { "report", 0, NULL, 'R' },
1981 { "output", 1, NULL, 'o' },
1982 { NULL, 0, NULL, 0 }
1984 const char *options = "05SL1Ro:";
1986 ###### process arguments
1988 char *outfile = NULL;
1992 enum grammar_type type = LR05;
1993 while ((opt = getopt_long(argc, argv, options,
1994 long_options, NULL)) != -1) {
2009 outfile = optarg; break;
2011 fprintf(stderr, "Usage: parsergen ...\n");
2016 infile = argv[optind++];
2018 fprintf(stderr, "No input file given\n");
2021 if (outfile && report == 1)
2024 if (name && strchr(name, '/'))
2025 name = strrchr(name, '/')+1;
2027 if (optind < argc) {
2028 fprintf(stderr, "Excess command line arguments\n");
2032 ### Loading the grammar file
2034 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2037 Once we have extracted the code (with `mdcode`) we expect to find three
2038 sections: header, code, and grammar. Anything else is an error.
2040 "header" and "code" are optional, though it is hard to build a working
2041 parser with neither. "grammar" must be provided.
2045 #include <sys/mman.h>
2050 static void pr_err(char *msg)
2053 fprintf(stderr, "%s\n", msg);
2057 struct section *table;
2061 fd = open(infile, O_RDONLY);
2063 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2064 infile, strerror(errno));
2067 len = lseek(fd, 0, 2);
2068 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2069 table = code_extract(file, file+len, pr_err);
2071 struct code_node *hdr = NULL;
2072 struct code_node *code = NULL;
2073 struct code_node *gram = NULL;
2074 for (s = table; s; s = s->next) {
2075 if (text_is(s->section, "header"))
2077 else if (text_is(s->section, "code"))
2079 else if (text_is(s->section, "grammar"))
2082 fprintf(stderr, "Unknown content section: %.*s\n",
2083 s->section.len, s->section.txt);
2088 ### Processing the input
2090 As we need to append an extension to a filename and then open it for
2091 writing, and we need to do this twice, it helps to have a separate function.
2095 static FILE *open_ext(char *base, char *ext)
2097 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2099 strcat(strcpy(fn, base), ext);
2105 If we can read the grammar, then we analyse and optionally report on it. If
2106 the report finds conflicts we will exit with an error status.
2108 ###### process input
2109 struct grammar *g = NULL;
2111 fprintf(stderr, "No grammar section provided\n");
2114 g = grammar_read(gram);
2116 fprintf(stderr, "Failure to parse grammar\n");
2121 grammar_analyse(g, type);
2123 if (grammar_report(g, type))
2127 If a header section is defined, we write it out and include a declaration
2128 for the `parse_XX` function so it can be used from separate code.
2130 if (rv == 0 && hdr && outfile) {
2131 FILE *f = open_ext(outfile, ".h");
2133 code_node_print(f, hdr, infile);
2134 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2138 fprintf(stderr, "Cannot create %s.h\n",
2144 And if all goes well and an output file was provided, we create the `.c`
2145 file with the code section (if any) and the parser tables and function.
2147 if (rv == 0 && outfile) {
2148 FILE *f = open_ext(outfile, ".c");
2151 code_node_print(f, code, infile);
2152 gen_parser(f, g, infile, name);
2155 fprintf(stderr, "Cannot create %s.c\n",
2161 And that about wraps it up. We need to set the locale so that UTF-8 is
2162 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2164 ###### File: parsergen.mk
2165 parsergen : parsergen.o libscanner.o libmdcode.o
2166 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2173 int main(int argc, char *argv[])
2178 setlocale(LC_ALL,"");
2180 ## process arguments
2187 ## The SHIFT/REDUCE parser
2189 Having analysed the grammar and generated all the tables, we only need the
2190 shift/reduce engine to bring it all together.
2192 ### Goto table lookup
2194 The parser generator has nicely provided us with goto tables sorted by
2195 symbol number. We need a binary search function to find a symbol in the
2200 static int search(const struct state *l, int sym)
2203 int hi = l->go_to_cnt;
2207 while (lo + 1 < hi) {
2208 int mid = (lo + hi) / 2;
2209 if (l->go_to[mid].sym <= sym)
2214 if (l->go_to[lo].sym == sym)
2215 return l->go_to[lo].state;
2220 ### The state stack.
2222 The core data structure for the parser is the stack. This tracks all the
2223 symbols that have been recognised or partially recognised.
2225 The stack usually won't grow very large - maybe a few tens of entries. So
2226 we dynamically resize an array as required but never bother to shrink it
2229 We keep the stack as two separate allocations. One, `asn_stack` stores the
2230 "abstract syntax nodes" which are created by each reduction. When we call
2231 `do_reduce` we need to pass an array of the `asn`s of the body of the
2232 production, and by keeping a separate `asn` stack, we can just pass a
2233 pointer into this stack.
2235 The other allocation stores all other stack fields of which there are two.
2236 The `state` is the most important one and guides the parsing process. The
2237 `sym` is nearly unnecessary. However when we want to free entries from the
2238 `asn_stack`, it helps to know what type they are so we can call the right
2239 freeing function. The symbol leads us to the right free function through
2257 The operations needed on the stack are shift (which is like push) and pop.
2259 Shift applies not only to terminals but also to non-terminals. When we
2260 reduce a production we will pop off entries corresponding to the body
2261 symbols, then push on an item for the head of the production. This last is
2262 exactly the same process as shifting in a terminal so we use the same
2265 To simplify other code we arrange for `shift` to fail if there is no `goto`
2266 state for the symbol. This is useful in basic parsing due to our design
2267 that we shift when we can, and reduce when we cannot. So the `shift`
2268 function reports if it could.
2270 So `shift` finds the next state. If that succeeds it extends the allocations
2271 if needed and pushes all the information onto the stacks.
2275 static int shift(struct parser *p,
2277 const struct state states[])
2279 // Push an entry onto the stack
2280 int newstate = search(&states[p->state], sym);
2283 if (p->tos >= p->stack_size) {
2284 p->stack_size += 10;
2285 p->stack = realloc(p->stack, p->stack_size
2286 * sizeof(p->stack[0]));
2287 p->asn_stack = realloc(p->asn_stack, p->stack_size
2288 * sizeof(p->asn_stack[0]));
2290 p->stack[p->tos].state = p->state;
2291 p->stack[p->tos].sym = sym;
2292 p->asn_stack[p->tos] = asn;
2294 p->state = newstate;
2298 `pop` simply moves the top of stack (`tos`) back down the required amount
2299 and frees any `asn` entries that need to be freed. It is called _after_ we
2300 reduce a production, just before we `shift` the nonterminal in.
2304 static void pop(struct parser *p, int num,
2305 void(*do_free)(short sym, void *asn))
2309 for (i = 0; i < num; i++)
2310 do_free(p->stack[p->tos+i].sym,
2311 p->asn_stack[p->tos+i]);
2313 p->state = p->stack[p->tos].state;
2316 ### Memory allocation
2318 The `scanner` returns tokens in a local variable - we want them in allocated
2319 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2320 a reduce is in a large buffer. Both of these require some allocation and
2321 copying, hence `memdup` and `tokcopy`.
2323 ###### parser includes
2328 void *memdup(void *m, int len)
2334 memcpy(ret, m, len);
2338 static struct token *tok_copy(struct token tk)
2340 struct token *new = malloc(sizeof(*new));
2345 ### The heart of the parser.
2347 Now we have the parser. If we can shift, we do. If not and we can reduce
2348 we do. If the production we reduced was production zero, then we have
2349 accepted the input and can finish.
2351 If we can neither shift nor reduce we have an error to handle. We pop
2352 single entries off the stack until we can shift the `TK_error` symbol, then
2353 drop input tokens until we find one we can shift into the new error state.
2355 We return whatever `asn` was returned by reducing production zero.
2357 ###### parser includes
2360 void *parser_run(struct token_state *tokens,
2361 const struct state states[],
2362 int (*do_reduce)(int, int, void**, void*, FILE*),
2363 void (*do_free)(short, void*),
2366 struct parser p = { 0 };
2371 tk = tok_copy(token_next(tokens));
2373 if (shift(&p, tk->num, tk, states)) {
2375 fputs("Shift ", trace);
2376 text_dump(trace, tk->txt, 20);
2379 tk = tok_copy(token_next(tokens));
2382 if (states[p.state].reduce_prod >= 0) {
2384 int prod = states[p.state].reduce_prod;
2385 int size = states[p.state].reduce_size;
2386 int sym = states[p.state].reduce_sym;
2388 static char buf[16*1024];
2390 body = p.asn_stack +
2391 (p.tos - states[p.state].reduce_size);
2393 bufsize = do_reduce(prod, p.tos, body,
2398 pop(&p, size, do_free);
2399 shift(&p, sym, memdup(buf, bufsize), states);
2404 /* Error. we walk up the stack until we
2405 * find a state which will accept TK_error.
2406 * We then shift in TK_error and see what state
2407 * that takes us too.
2408 * Then we discard input tokens until
2409 * we find one that is acceptable.
2411 while (shift(&p, TK_error, tk, states) < 0
2413 // discard this state
2414 pop(&p, 1, do_free);
2416 while (search(&states[p.state], tk->num) < 0 &&
2417 tk->num != TK_eof) {
2419 tk = tok_copy(token_next(tokens));
2421 if (p.tos == 0 && tk->num == TK_eof)
2426 ret = p.asn_stack[0];
2428 pop(&p, p.tos, do_free);
2434 ###### exported functions
2435 void *parser_run(struct token_state *tokens,
2436 const struct state states[],
2437 int (*do_reduce)(int, int, void**, void*, FILE*),
2438 void (*do_free)(short, void*),
2444 The obvious example for a parser is a calculator.
2446 As `scanner` provides a number parsing function using `libgmp` it is not much
2447 work to perform arbitrary rational number calculations.
2449 This calculator takes one expression, or an equality test per line. The
2450 results are printed and if any equality test fails, the program exits with
2453 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2454 better approach, but as the grammar file must have 3 components I need
2455 something like this.
2457 ###### File: parsergen.mk
2458 calc.c : parsergen calc.cgm
2459 ./parsergen -o calc calc.cgm
2460 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2461 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2469 // what do we use for a demo-grammar? A calculator of course.
2483 #include <sys/mman.h>
2488 #include "scanner.h"
2494 static void free_number(struct number *n)
2500 int main(int argc, char *argv[])
2502 int fd = open(argv[1], O_RDONLY);
2503 int len = lseek(fd, 0, 2);
2504 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2505 struct section *s = code_extract(file, file+len, NULL);
2506 struct token_config config = {
2507 .ignored = (1 << TK_line_comment)
2508 | (1 << TK_block_comment)
2511 .number_chars = ".,_+-",
2515 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2523 Session -> Session Line
2526 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2527 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2528 gmp_printf(" or as a decimal: %Fg\n", fl);
2532 | Expression = Expression NEWLINE ${
2533 if (mpq_equal($1.val, $3.val))
2534 gmp_printf("Both equal %Qd\n", $1.val);
2536 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2541 | NEWLINE ${ printf("Blank line\n"); }$
2542 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2545 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2546 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2547 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2549 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2550 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2551 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2553 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2554 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$