1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
46 ###### File: parsergen.mk
49 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
52 ## Reading the grammar
54 The grammar must be stored in a markdown literate code file as parsed
55 by "[mdcode][]". It must have three top level (i.e. unreferenced)
56 sections: `header`, `code`, and `grammar`. The first two will be
57 literally copied into the generated `.c` and `.h` files. The last
58 contains the grammar. This is tokenised with "[scanner][]".
61 [scanner]: scanner.html
67 ###### parser includes
71 The grammar contains production sets, precedence/associativity
72 declarations, and data type declarations. These are all parsed with
73 _ad hoc_ parsing as we don't have a parser generator yet.
75 The precedence and associativity can be set for each production, but
76 can be inherited from symbols. The data type is potentially defined
77 for each non-terminal and describes what C structure is used to store
78 information about each symbol.
81 enum assoc {Left, Right, Non};
82 char *assoc_names[] = {"Left","Right","Non"};
85 struct text struct_name;
87 unsigned short precedence;
91 unsigned short precedence;
99 The strings reported by `mdcode` and `scanner` are `struct text` which have
100 length rather than being null terminated. To help with printing and
101 comparing we define `text_is` and `prtxt`, which should possibly go in
102 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
103 which might contain control characters.
106 static int text_is(struct text t, char *s)
108 return (strlen(s) == t.len &&
109 strncmp(s, t.txt, t.len) == 0);
111 static void prtxt(struct text t)
113 printf("%.*s", t.len, t.txt);
118 Productions are comprised primarily of symbols - terminal and
119 non-terminal. We do not make a syntactic distinction between the two,
120 though non-terminals must be identifiers. Non-terminal symbols are
121 those which appear in the head of a production, terminal symbols are
122 those which don't. There are also "virtual" symbols used for precedence
123 marking discussed later, and sometimes we won't know what type a symbol
126 ###### forward declarations
127 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
128 char *symtypes = "UVTN";
132 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
133 table of known symbols and the resulting parser will report them as
134 `TK_reserved + N`. A small set of identifiers are reserved for the
135 different token types that `scanner` can report.
138 static char *reserved_words[] = {
139 [TK_error] = "ERROR",
140 [TK_number] = "NUMBER",
141 [TK_ident] = "IDENTIFIER",
143 [TK_string] = "STRING",
144 [TK_multi_string] = "MULTI_STRING",
145 [TK_indent] = "INDENT",
146 [TK_undent] = "UNDENT",
147 [TK_newline] = "NEWLINE",
153 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
154 recognised. The former is automatically expected at the end of the text
155 being parsed. The latter are always ignored by the parser.
157 All of these symbols are stored in a simple symbol table. We use the
158 `struct text` from `mdcode` to store the name and link them together
159 into a sorted list using an insertion sort.
161 We don't have separate `find` and `insert` functions as any symbol we
162 find needs to be remembered. We simply expect `find` to always return a
163 symbol, but its type might be `Unknown`.
172 ###### grammar fields
177 static int text_cmp(struct text a, struct text b)
182 int cmp = strncmp(a.txt, b.txt, len);
186 return a.len - b.len;
189 static struct symbol *sym_find(struct grammar *g, struct text s)
191 struct symbol **l = &g->syms;
196 (cmp = text_cmp((*l)->name, s)) < 0)
200 n = calloc(1, sizeof(*n));
209 static void symbols_init(struct grammar *g)
211 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
213 for (i = 0; i < entries; i++) {
216 t.txt = reserved_words[i];
219 t.len = strlen(t.txt);
226 ### Data types and precedence.
228 Data type specification and precedence specification are both
229 introduced by a dollar sign at the start of the line. If the next
230 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
231 precedence, otherwise it specifies a data type.
233 The data type name is simply stored and applied to the head of all
234 subsequent productions. It must be the name of a structure, so `$expr`
235 maps to `struct expr`.
237 The precedence line must contain a list of symbols - typically
238 terminal symbols, but not necessarily. It can only contain symbols
239 that have not been seen yet, so precedence declarations must precede
240 the productions that they affect.
242 A precedence line may also contain a "virtual" symbol which is an
243 identifier preceded by `$$`. Virtual terminals carry precedence
244 information but are not included in the grammar. A production can
245 declare that it inherits the precedence of a given virtual symbol.
247 This use for `$$` precludes it from being used as a symbol in the
248 described language. Two other symbols: `${` and `}$` are also
251 Each new precedence line introduces a new precedence level and
252 declares how it associates. This level is stored in each symbol
253 listed and may be inherited by any production which uses the symbol. A
254 production inherits from the last symbol which has a precedence.
256 ###### grammar fields
257 struct text current_type;
261 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
262 static const char *known[] = { "$$", "${", "}$" };
265 static char *dollar_line(struct token_state *ts, struct grammar *g)
267 struct token t = token_next(ts);
272 if (t.num != TK_ident) {
273 err = "type or assoc expected after '$'";
276 if (text_is(t.txt, "LEFT"))
278 else if (text_is(t.txt, "RIGHT"))
280 else if (text_is(t.txt, "NON"))
283 g->current_type = t.txt;
285 if (t.num != TK_newline) {
286 err = "Extra tokens after type name";
292 // This is a precedence line, need some symbols.
296 while (t.num != TK_newline) {
297 enum symtype type = Terminal;
299 if (t.num == TK_virtual) {
302 if (t.num != TK_ident) {
303 err = "$$ must be followed by a word";
306 } else if (t.num != TK_ident &&
308 err = "Illegal token in precedence line";
311 s = sym_find(g, t.txt);
312 if (s->type != Unknown) {
313 err = "Symbols in precedence line must not already be known.";
317 s->precedence = g->prec_levels;
322 err = "No symbols given on precedence line";
326 while (t.num != TK_newline && t.num != TK_eof)
333 A production either starts with an identifier which is the head
334 non-terminal, or a vertical bar (`|`) in which case this production
335 uses the same head as the previous one. The identifier must be
336 followed by a `->` mark. All productions for a given non-terminal must
337 be grouped together with the `nonterminal ->` given only once.
339 After this a (possibly empty) sequence of identifiers and marks form
340 the body of the production. A virtual symbol may be given after the
341 body by preceding it with `$$`. If a virtual symbol is given, the
342 precedence of the production is that for the virtual symbol. If none
343 is given, the precedence is inherited from the last symbol in the
344 production which has a precedence specified.
346 After the optional precedence may come the `${` mark. This indicates
347 the start of a code fragment. If present, this must be on the same
348 line as the start of the production.
350 All of the text from the `${` through to the matching `}$` is
351 collected and forms the code-fragment for the production. It must all
352 be in one `code_node` of the literate code. The `}$` must be
353 at the end of a line.
355 Text in the code fragment will undergo substitutions where `$N` for
356 some numeric `N` will be replaced with a variable holding the parse
357 information for the particular symbol in the production. `$0` is the
358 head of the production, `$1` is the first symbol of the body, etc.
359 The type of `$N` for a terminal symbol is `struct token`. For
360 non-terminal, it is whatever has been declared for that symbol.
362 While building productions we will need to add to an array which needs to
366 static void array_add(void *varray, int *cnt, void *new)
368 void ***array = varray;
371 current = ((*cnt-1) | (step-1))+1;
372 if (*cnt == current) {
375 *array = realloc(*array, current * sizeof(void*));
377 (*array)[*cnt] = new;
381 Collecting the code fragment simply involves reading tokens until we
382 hit the end token `}$`, and noting the character position of the start and
386 static struct text collect_code(struct token_state *state,
391 code.txt = start.txt.txt + start.txt.len;
393 t = token_next(state);
394 while (t.node == start.node &&
395 t.num != TK_close && t.num != TK_error &&
397 if (t.num == TK_close && t.node == start.node)
398 code.len = t.txt.txt - code.txt;
404 Now we have all the bits we need to parse a full production.
409 ###### grammar fields
410 struct production **productions;
411 int production_count;
413 ###### production fields
415 struct symbol **body;
420 int first_production;
423 static char *parse_production(struct grammar *g,
425 struct token_state *state)
427 /* Head has already been parsed. */
430 struct production p, *pp;
432 memset(&p, 0, sizeof(p));
434 tk = token_next(state);
435 while (tk.num == TK_ident || tk.num == TK_mark) {
436 struct symbol *bs = sym_find(g, tk.txt);
437 if (bs->type == Unknown)
439 if (bs->type == Virtual) {
440 err = "Virtual symbol not permitted in production";
443 if (bs->precedence) {
444 p.precedence = bs->precedence;
447 array_add(&p.body, &p.body_size, bs);
448 tk = token_next(state);
450 if (tk.num == TK_virtual) {
452 tk = token_next(state);
453 if (tk.num != TK_ident) {
454 err = "word required after $$";
457 vs = sym_find(g, tk.txt);
458 if (vs->type != Virtual) {
459 err = "symbol after $$ must be virtual";
462 p.precedence = vs->precedence;
464 tk = token_next(state);
466 if (tk.num == TK_open) {
467 p.code = collect_code(state, tk);
468 if (p.code.txt == NULL) {
469 err = "code fragment not closed properly";
472 tk = token_next(state);
474 if (tk.num != TK_newline && tk.num != TK_eof) {
475 err = "stray tokens at end of line";
478 pp = malloc(sizeof(*pp));
480 array_add(&g->productions, &g->production_count, pp);
483 while (tk.num != TK_newline && tk.num != TK_eof)
484 tk = token_next(state);
488 With the ability to parse production and dollar-lines, we have nearly all
489 that we need to parse a grammar from a `code_node`.
491 The head of the first production will effectively be the `start` symbol of
492 the grammar. However it won't _actually_ be so. Processing the grammar is
493 greatly simplified if the real start symbol only has a single production,
494 and expects `$eof` as the final terminal. So when we find the first explicit
495 production we insert an extra production as production zero which looks like
497 ###### Example: production 0
500 where `START` is the first non-terminal given.
502 ###### create production zero
503 struct production *p = calloc(1,sizeof(*p));
504 struct text start = {"$start",6};
505 struct text eof = {"$eof",4};
506 p->head = sym_find(g, start);
507 p->head->type = Nonterminal;
508 array_add(&p->body, &p->body_size, head);
509 array_add(&p->body, &p->body_size, sym_find(g, eof));
510 g->start = p->head->num;
511 p->head->first_production = g->production_count;
512 array_add(&g->productions, &g->production_count, p);
514 Now we are ready to read in the grammar.
516 ###### grammar fields
517 int start; // the 'start' symbol.
520 static struct grammar *grammar_read(struct code_node *code)
522 struct token_config conf = {
525 .words_marks = known,
526 .known_count = sizeof(known)/sizeof(known[0]),
528 .ignored = (1 << TK_line_comment)
529 | (1 << TK_block_comment)
532 | (1 << TK_multi_string)
537 struct token_state *state = token_open(code, &conf);
538 struct token tk = token_next(state);
539 struct symbol *head = NULL;
543 g = calloc(1, sizeof(*g));
546 for (tk = token_next(state); tk.num != TK_eof;
547 tk = token_next(state)) {
548 if (tk.num == TK_newline)
550 if (tk.num == TK_ident) {
552 head = sym_find(g, tk.txt);
553 if (head->type == Nonterminal)
554 err = "This non-terminal has already be used.";
555 else if (head->type == Virtual)
556 err = "Virtual symbol not permitted in head of production";
558 head->type = Nonterminal;
559 head->struct_name = g->current_type;
561 ## create production zero
563 head->first_production = g->production_count;
564 tk = token_next(state);
565 if (tk.num == TK_mark &&
566 text_is(tk.txt, "->"))
567 err = parse_production(g, head, state);
569 err = "'->' missing in production";
571 } else if (tk.num == TK_mark
572 && text_is(tk.txt, "|")) {
573 // another production for same non-term
575 err = parse_production(g, head, state);
577 err = "First production must have a head";
578 } else if (tk.num == TK_mark
579 && text_is(tk.txt, "$")) {
580 err = dollar_line(state, g);
582 err = "Unrecognised token at start of line.";
590 fprintf(stderr, "Error at line %d: %s\n",
597 ## Analysing the grammar
599 The central task in analysing the grammar is to determine a set of
600 states to drive the parsing process. This will require finding
601 various sets of symbols and of "items". Some of these sets will need
602 to attach information to each element in the set, so they are more
605 Each "item" may have a 'look-ahead' or `LA` set associated with
606 it. Many of these will be the same as each other. To avoid memory
607 wastage, and to simplify some comparisons of sets, these sets will be
608 stored in a list of unique sets, each assigned a number.
610 Once we have the data structures in hand to manage these sets and
611 lists, we can start setting the 'nullable' flag, build the 'FIRST'
612 sets, and then create the item sets which define the various states.
616 Though we don't only store symbols in these sets, they are the main
617 things we store, so they are called symbol sets or "symsets".
619 A symset has a size, an array of shorts, and an optional array of data
620 values, which are also shorts. If the array of data is not present,
621 we store an impossible pointer, as `NULL` is used to indicate that no
622 memory has been allocated yet.
627 unsigned short *syms, *data;
629 #define NO_DATA ((unsigned short *)1)
630 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
631 const struct symset INIT_DATASET = { 0, NULL, NULL };
633 The arrays of shorts are allocated in blocks of 8 and are kept sorted
634 by using an insertion sort. We don't explicitly record the amount of
635 allocated space as it can be derived directly from the current `cnt` using
636 `((cnt - 1) | 7) + 1`.
639 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
642 int current = ((s->cnt-1) | 7) + 1;
643 if (current == s->cnt) {
645 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
646 if (s->data != NO_DATA)
647 s->data = realloc(s->data, sizeof(*s->data) * current);
650 while (i > 0 && s->syms[i-1] > key) {
651 s->syms[i] = s->syms[i-1];
652 if (s->data != NO_DATA)
653 s->data[i] = s->data[i-1];
657 if (s->data != NO_DATA)
662 Finding a symbol (or item) in a `symset` uses a simple binary search.
663 We return the index where the value was found (so data can be accessed),
664 or `-1` to indicate failure.
666 static int symset_find(struct symset *ss, unsigned short key)
673 while (lo + 1 < hi) {
674 int mid = (lo + hi) / 2;
675 if (ss->syms[mid] <= key)
680 if (ss->syms[lo] == key)
685 We will often want to form the union of two symsets. When we do, we
686 will often want to know if anything changed (as that might mean there
687 is more work to do). So `symset_union` reports whether anything was
688 added to the first set. We use a slow quadratic approach as these
689 sets don't really get very big. If profiling shows this to be a problem it
690 can be optimised later.
692 static int symset_union(struct symset *a, struct symset *b)
696 for (i = 0; i < b->cnt; i++)
697 if (symset_find(a, b->syms[i]) < 0) {
698 unsigned short data = 0;
699 if (b->data != NO_DATA)
701 symset_add(a, b->syms[i], data);
707 And of course we must be able to free a symset.
709 static void symset_free(struct symset ss)
712 if (ss.data != NO_DATA)
718 Some symsets are simply stored somewhere appropriate in the `grammar`
719 data structure, others need a bit of indirection. This applies
720 particularly to "LA" sets. These will be paired with "items" in an
721 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
722 stored in the `data` field, so we need a mapping from a small (short)
723 number to an LA `symset`.
725 As mentioned earlier this involves creating a list of unique symsets.
727 For now, we just use a linear list sorted by insertion. A skiplist
728 would be more efficient and may be added later.
735 struct setlist *next;
738 ###### grammar fields
739 struct setlist *sets;
744 static int ss_cmp(struct symset a, struct symset b)
747 int diff = a.cnt - b.cnt;
750 for (i = 0; i < a.cnt; i++) {
751 diff = (int)a.syms[i] - (int)b.syms[i];
758 static int save_set(struct grammar *g, struct symset ss)
760 struct setlist **sl = &g->sets;
764 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
771 s = malloc(sizeof(*s));
780 Finding a set by number is currently performed by a simple linear search.
781 If this turns out to hurt performance, we can store the sets in a dynamic
782 array like the productions.
784 static struct symset set_find(struct grammar *g, int num)
786 struct setlist *sl = g->sets;
787 while (sl && sl->num != num)
793 ### Setting `nullable`
795 We set `nullable` on the head symbol for any production for which all
796 the body symbols (if any) are nullable. As this is a recursive
797 definition, any change in the `nullable` setting means that we need to
798 re-evaluate where it needs to be set.
800 We simply loop around performing the same calculations until no more
807 static void set_nullable(struct grammar *g)
810 while (check_again) {
813 for (p = 0; p < g->production_count; p++) {
814 struct production *pr = g->productions[p];
817 if (pr->head->nullable)
819 for (s = 0; s < pr->body_size; s++)
820 if (! pr->body[s]->nullable)
822 if (s == pr->body_size) {
823 pr->head->nullable = 1;
830 ### Building the `first` sets
832 When calculating what can follow a particular non-terminal, we will need to
833 know what the "first" terminal in a subsequent non-terminal might be. So
834 we calculate the `first` set for every non-terminal and store them in an
835 array. We don't bother recording the "first" set for terminals as they are
838 As the "first" for one symbol might depend on the "first" of another,
839 we repeat the calculations until no changes happen, just like with
840 `set_nullable`. This makes use of the fact that `symset_union`
841 reports if any change happens.
843 The core of this which finds the "first" of part of a production body
844 will be reused for computing the "follow" sets, so we split it out
845 into a separate function.
847 ###### grammar fields
848 struct symset *first;
852 static int add_first(struct production *pr, int start,
853 struct symset *target, struct grammar *g,
858 for (s = start; s < pr->body_size; s++) {
859 struct symbol *bs = pr->body[s];
860 if (bs->type == Terminal) {
861 if (symset_find(target, bs->num) < 0) {
862 symset_add(target, bs->num, 0);
866 } else if (symset_union(target, &g->first[bs->num]))
872 *to_end = (s == pr->body_size);
876 static void build_first(struct grammar *g)
880 g->first = calloc(g->num_syms, sizeof(g->first[0]));
881 for (s = 0; s < g->num_syms; s++)
882 g->first[s] = INIT_SYMSET;
884 while (check_again) {
887 for (p = 0; p < g->production_count; p++) {
888 struct production *pr = g->productions[p];
889 struct symset *head = &g->first[pr->head->num];
891 if (add_first(pr, 0, head, g, NULL))
897 ### Building the `follow` sets.
899 There are two different situations when we will want to generate "follow"
900 sets. If we are doing an SLR analysis, we want to generate a single
901 "follow" set for each non-terminal in the grammar. That is what is
902 happening here. If we are doing an LALR or LR analysis we will want
903 to generate a separate "LA" set for each item. We do that later
906 There are two parts to generating a "follow" set. Firstly we look at
907 every place that any non-terminal appears in the body of any
908 production, and we find the set of possible "first" symbols after
909 there. This is done using `add_first` above and only needs to be done
910 once as the "first" sets are now stable and will not change.
914 for (p = 0; p < g->production_count; p++) {
915 struct production *pr = g->productions[p];
918 for (b = 0; b < pr->body_size - 1; b++) {
919 struct symbol *bs = pr->body[b];
920 if (bs->type == Terminal)
922 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
926 The second part is to add the "follow" set of the head of a production
927 to the "follow" sets of the final symbol in the production, and any
928 other symbol which is followed only by `nullable` symbols. As this
929 depends on "follow" itself we need to repeatedly perform the process
930 until no further changes happen.
934 for (again = 0, p = 0;
935 p < g->production_count;
936 p < g->production_count-1
937 ? p++ : again ? (again = 0, p = 0)
939 struct production *pr = g->productions[p];
942 for (b = pr->body_size - 1; b >= 0; b--) {
943 struct symbol *bs = pr->body[b];
944 if (bs->type == Terminal)
946 if (symset_union(&g->follow[bs->num],
947 &g->follow[pr->head->num]))
954 We now just need to create and initialise the `follow` list to get a
957 ###### grammar fields
958 struct symset *follow;
961 static void build_follow(struct grammar *g)
964 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
965 for (s = 0; s < g->num_syms; s++)
966 g->follow[s] = INIT_SYMSET;
970 ### Building itemsets and states
972 There are three different levels of detail that can be chosen for
973 building the itemsets and states for the LR grammar. They are:
975 1. LR(0) or SLR(1), where no look-ahead is considered.
976 2. LALR(1) where we build look-ahead sets with each item and merge
977 the LA sets when we find two paths to the same "kernel" set of items.
978 3. LR(1) where different look-ahead for any item in the code means
979 a different state must be created.
981 ###### forward declarations
982 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
984 We need to be able to look through existing states to see if a newly
985 generated state already exists. For now we use a simple sorted linked
988 An item is a pair of numbers: the production number and the position of
989 "DOT", which is an index into the body of the production.
990 As the numbers are not enormous we can combine them into a single "short"
991 and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
992 production number (so 2000 productions with a maximum of 31 symbols in the
995 Comparing the itemsets will be a little different to comparing symsets
996 as we want to do the lookup after generating the "kernel" of an
997 itemset, so we need to ignore the offset=zero items which are added during
1000 To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
1001 the list in the symset, and then only compare items before the first "0".
1004 static inline unsigned short item_num(int production, int index)
1006 return production | ((31-index) << 11);
1008 static inline int item_prod(unsigned short item)
1010 return item & 0x7ff;
1012 static inline int item_index(unsigned short item)
1014 return (31-(item >> 11)) & 0x1f;
1017 For LR(1) analysis we need to compare not just the itemset in a state
1018 but also the LA sets. As we assign each unique LA set a number, we
1019 can just compare the symset and the data values together.
1022 static int itemset_cmp(struct symset a, struct symset b,
1023 enum grammar_type type)
1029 i < a.cnt && i < b.cnt &&
1030 item_index(a.syms[i]) > 0 &&
1031 item_index(b.syms[i]) > 0;
1033 int diff = a.syms[i] - b.syms[i];
1037 diff = a.data[i] - b.data[i];
1042 if (i == a.cnt || item_index(a.syms[i]) == 0)
1046 if (i == b.cnt || item_index(b.syms[i]) == 0)
1052 if (type < LR1 || av == -1)
1055 a.data[i] - b.data[i];
1058 And now we can build the list of itemsets. The lookup routine returns
1059 both a success flag and a pointer to where in the list an insert
1060 should happen, so we don't need to search a second time.
1064 struct itemset *next;
1066 struct symset items;
1067 struct symset go_to;
1071 ###### grammar fields
1072 struct itemset *items;
1076 static int itemset_find(struct grammar *g, struct itemset ***where,
1077 struct symset kernel, enum grammar_type type)
1079 struct itemset **ip;
1081 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1082 struct itemset *i = *ip;
1084 diff = itemset_cmp(i->items, kernel, type);
1097 Adding an itemset may require merging the LA sets if LALR analysis is
1098 happening. If any new LA set adds symbols that weren't in the old LA set, we
1099 clear the `completed` flag so that the dependants of this itemset will be
1100 recalculated and their LA sets updated.
1102 `add_itemset` must consume the symsets it is passed, either by adding
1103 them to a data structure, or freeing them.
1105 static int add_itemset(struct grammar *g, struct symset ss,
1106 enum grammar_type type)
1108 struct itemset **where, *is;
1110 int found = itemset_find(g, &where, ss, type);
1112 is = calloc(1, sizeof(*is));
1113 is->state = g->states;
1117 is->go_to = INIT_DATASET;
1126 for (i = 0; i < ss.cnt; i++) {
1127 struct symset temp = INIT_SYMSET, s;
1128 if (ss.data[i] == is->items.data[i])
1130 s = set_find(g, is->items.data[i]);
1131 symset_union(&temp, &s);
1132 s = set_find(g, ss.data[i]);
1133 if (symset_union(&temp, &s)) {
1134 is->items.data[i] = save_set(g, temp);
1145 To build all the itemsets, we first insert the initial itemset made from the
1146 start symbol, complete each itemset, and then generate new itemsets from old
1147 until no new ones can be made.
1149 Completing an itemset means finding all the items where "DOT" is followed by
1150 a nonterminal and adding "DOT=0" items for every production from that
1151 non-terminal - providing each item hasn't already been added.
1153 If LA sets are needed, the LA set for each new item is found using
1154 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1155 be supplemented by the LA set for the item which produces the new item.
1157 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1158 is used in the next stage.
1160 NOTE: precedence handling should happen here - I haven't written this yet
1163 ###### complete itemset
1164 for (i = 0; i < is->items.cnt; i++) {
1165 int p = item_prod(is->items.syms[i]);
1166 int bs = item_index(is->items.syms[i]);
1167 struct production *pr = g->productions[p];
1170 struct symset LA = INIT_SYMSET;
1171 unsigned short sn = 0;
1173 if (bs == pr->body_size)
1176 if (symset_find(&done, s->num) < 0)
1177 symset_add(&done, s->num, 0);
1178 if (s->type != Nonterminal)
1184 add_first(pr, bs+1, &LA, g, &to_end);
1186 struct symset ss = set_find(g, is->items.data[i]);
1187 symset_union(&LA, &ss);
1189 sn = save_set(g, LA);
1190 LA = set_find(g, sn);
1192 /* Add productions for this symbol */
1193 for (p2 = s->first_production;
1194 p2 < g->production_count &&
1195 g->productions[p2]->head == s;
1197 int itm = item_num(p2, 0);
1198 int pos = symset_find(&is->items, itm);
1200 symset_add(&is->items, itm, sn);
1201 /* Will have re-ordered, so start
1202 * from beginning again */
1204 } else if (type >= LALR) {
1205 struct symset ss = set_find(g, is->items.data[pos]);
1206 struct symset tmp = INIT_SYMSET;
1208 symset_union(&tmp, &ss);
1209 if (symset_union(&tmp, &LA)) {
1210 is->items.data[pos] = save_set(g, tmp);
1218 For each symbol we found (and placed in `done`) we collect all the items for
1219 which this symbol is next, and create a new itemset with "DOT" advanced over
1220 the symbol. This is then added to the collection of itemsets (or merged
1221 with a pre-existing itemset).
1223 ###### derive itemsets
1224 // Now we have a completed itemset, so we need to
1225 // create all the 'next' itemset and create them
1226 // if they don't exist.
1227 for (i = 0; i < done.cnt; i++) {
1229 unsigned short state;
1230 struct symset newitemset = INIT_SYMSET;
1232 newitemset = INIT_DATASET;
1234 for (j = 0; j < is->items.cnt; j++) {
1235 int itm = is->items.syms[j];
1236 int p = item_prod(itm);
1237 int bp = item_index(itm);
1238 struct production *pr = g->productions[p];
1239 unsigned short la = 0;
1242 if (bp == pr->body_size)
1244 if (pr->body[bp]->num != done.syms[i])
1247 la = is->items.data[j];
1248 pos = symset_find(&newitemset, pr->head->num);
1250 symset_add(&newitemset, item_num(p, bp+1), la);
1251 else if (type >= LALR) {
1252 // Need to merge la set.
1253 int la2 = newitemset.data[pos];
1255 struct symset ss = set_find(g, la2);
1256 struct symset LA = INIT_SYMSET;
1257 symset_union(&LA, &ss);
1258 ss = set_find(g, la);
1259 if (symset_union(&LA, &ss))
1260 newitemset.data[pos] = save_set(g, LA);
1266 state = add_itemset(g, newitemset, type);
1267 if (symset_find(&is->go_to, done.syms[i]) < 0)
1268 symset_add(&is->go_to, done.syms[i], state);
1271 All that is left is to create the initial itemset from production zero, and
1272 with `TK_eof` as the LA set.
1275 static void build_itemsets(struct grammar *g, enum grammar_type type)
1277 struct symset first = INIT_SYMSET;
1280 unsigned short la = 0;
1282 // LA set just has eof
1283 struct symset eof = INIT_SYMSET;
1284 symset_add(&eof, TK_eof, 0);
1285 la = save_set(g, eof);
1286 first = INIT_DATASET;
1288 // production 0, offset 0 (with no data)
1289 symset_add(&first, item_num(0, 0), la);
1290 add_itemset(g, first, type);
1291 for (again = 0, is = g->items;
1293 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1295 struct symset done = INIT_SYMSET;
1306 ### Completing the analysis.
1308 The exact process of analysis depends on which level was requested. For
1309 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1310 `LR1` we need first, but not follow. For `SLR` we need both.
1312 We don't build the "action" tables that you might expect as the parser
1313 doesn't use them. They will be calculated to some extent if needed for
1316 Once we have built everything we allocate arrays for the two lists:
1317 symbols and itemsets. This allows more efficient access during reporting.
1318 The symbols are grouped as terminals and non-terminals and we record the
1319 changeover point in `first_nonterm`.
1321 ###### grammar fields
1322 struct symbol **symtab;
1323 struct itemset **statetab;
1326 ###### grammar_analyse
1328 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1332 int snum = TK_reserved;
1333 for (s = g->syms; s; s = s->next)
1334 if (s->num < 0 && s->type == Terminal) {
1338 g->first_nonterm = snum;
1339 for (s = g->syms; s; s = s->next)
1345 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1346 for (s = g->syms; s; s = s->next)
1347 g->symtab[s->num] = s;
1356 build_itemsets(g, type);
1358 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1359 for (is = g->items; is ; is = is->next)
1360 g->statetab[is->state] = is;
1363 ## Reporting on the Grammar
1365 The purpose of the report is to give the grammar developer insight into
1366 how the grammar parser will work. It is basically a structured dump of
1367 all the tables that have been generated, plus a description of any conflicts.
1369 ###### grammar_report
1370 static int grammar_report(struct grammar *g, enum grammar_type type)
1376 return report_conflicts(g, type);
1379 Firstly we have the complete list of symbols, together with the "FIRST"
1380 set if that was generated.
1384 static void report_symbols(struct grammar *g)
1388 printf("SYMBOLS + FIRST:\n");
1390 printf("SYMBOLS:\n");
1392 for (n = 0; n < g->num_syms; n++) {
1393 struct symbol *s = g->symtab[n];
1397 printf(" %c%3d%c: ",
1398 s->nullable ? '*':' ',
1399 s->num, symtypes[s->type]);
1402 printf(" (%d%s)", s->precedence,
1403 assoc_names[s->assoc]);
1405 if (g->first && s->type == Nonterminal) {
1408 for (j = 0; j < g->first[n].cnt; j++) {
1411 prtxt(g->symtab[g->first[n].syms[j]]->name);
1418 Then we have the follow sets if they were computed.
1420 static void report_follow(struct grammar *g)
1423 printf("FOLLOW:\n");
1424 for (n = 0; n < g->num_syms; n++)
1425 if (g->follow[n].cnt) {
1429 prtxt(g->symtab[n]->name);
1430 for (j = 0; j < g->follow[n].cnt; j++) {
1433 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1439 And finally the item sets. These include the GO TO tables and, for
1440 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1441 it up a bit. First the items, with production number and associativity.
1443 static void report_item(struct grammar *g, int itm)
1445 int p = item_prod(itm);
1446 int dot = item_index(itm);
1447 struct production *pr = g->productions[p];
1451 prtxt(pr->head->name);
1453 for (i = 0; i < pr->body_size; i++) {
1454 printf(" %s", (dot == i ? ". ": ""));
1455 prtxt(pr->body[i]->name);
1457 if (dot == pr->body_size)
1461 printf(" (%d%s)", pr->precedence,
1462 assoc_names[pr->assoc]);
1466 The LA sets which are (possibly) reported with each item:
1468 static void report_la(struct grammar *g, int lanum)
1470 struct symset la = set_find(g, lanum);
1474 printf(" LOOK AHEAD(%d)", lanum);
1475 for (i = 0; i < la.cnt; i++) {
1478 prtxt(g->symtab[la.syms[i]]->name);
1483 Then the go to sets:
1486 static void report_goto(struct grammar *g, struct symset gt)
1491 for (i = 0; i < gt.cnt; i++) {
1493 prtxt(g->symtab[gt.syms[i]]->name);
1494 printf(" -> %d\n", gt.data[i]);
1498 Now we can report all the item sets complete with items, LA sets, and GO TO.
1500 static void report_itemsets(struct grammar *g)
1503 printf("ITEM SETS(%d)\n", g->states);
1504 for (s = 0; s < g->states; s++) {
1506 struct itemset *is = g->statetab[s];
1507 printf(" Itemset %d:\n", s);
1508 for (j = 0; j < is->items.cnt; j++) {
1509 report_item(g, is->items.syms[j]);
1510 if (is->items.data != NO_DATA)
1511 report_la(g, is->items.data[j]);
1513 report_goto(g, is->go_to);
1517 ### Reporting conflicts
1519 Conflict detection varies a lot among different analysis levels. However
1520 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1521 LR1 are also similar as they have FOLLOW or LA sets.
1525 ## conflict functions
1527 static int report_conflicts(struct grammar *g, enum grammar_type type)
1530 printf("Conflicts:\n");
1532 cnt = conflicts_lr0(g, type);
1534 cnt = conflicts_slr(g, type);
1536 printf(" - no conflicts\n");
1540 LR0 conflicts are any state which has both a reducible item and
1543 LR05 conflicts only occur if two possible reductions exist,
1544 as shifts always over-ride reductions.
1546 ###### conflict functions
1547 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1551 for (i = 0; i < g->states; i++) {
1552 struct itemset *is = g->statetab[i];
1553 int last_reduce = -1;
1554 int prev_reduce = -1;
1555 int last_shift = -1;
1559 for (j = 0; j < is->items.cnt; j++) {
1560 int itm = is->items.syms[j];
1561 int p = item_prod(itm);
1562 int bp = item_index(itm);
1563 struct production *pr = g->productions[p];
1565 if (bp == pr->body_size) {
1566 prev_reduce = last_reduce;
1570 if (pr->body[bp]->type == Terminal)
1573 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1574 printf(" State %d has both SHIFT and REDUCE:\n", i);
1575 report_item(g, is->items.syms[last_shift]);
1576 report_item(g, is->items.syms[last_reduce]);
1579 if (prev_reduce >= 0) {
1580 printf(" State %d has 2 (or more) reducible items\n", i);
1581 report_item(g, is->items.syms[prev_reduce]);
1582 report_item(g, is->items.syms[last_reduce]);
1589 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1590 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1591 in the source of the look ahead set.
1593 We build a dataset mapping terminal to item for possible SHIFTs and then
1594 another for possible REDUCE operations. We report when we get conflicts
1597 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1602 for (i = 0; i < g->states; i++) {
1603 struct itemset *is = g->statetab[i];
1604 struct symset shifts = INIT_DATASET;
1605 struct symset reduce = INIT_DATASET;
1609 /* First collect the shifts */
1610 for (j = 0; j < is->items.cnt; j++) {
1611 unsigned short itm = is->items.syms[j];
1612 int p = item_prod(itm);
1613 int bp = item_index(itm);
1614 struct production *pr = g->productions[p];
1616 if (bp < pr->body_size &&
1617 pr->body[bp]->type == Terminal) {
1619 int sym = pr->body[bp]->num;
1620 if (symset_find(&shifts, sym) < 0)
1621 symset_add(&shifts, sym, itm);
1624 /* Now look for reduction and conflicts */
1625 for (j = 0; j < is->items.cnt; j++) {
1626 unsigned short itm = is->items.syms[j];
1627 int p = item_prod(itm);
1628 int bp = item_index(itm);
1629 struct production *pr = g->productions[p];
1631 if (bp < pr->body_size)
1636 la = g->follow[pr->head->num];
1638 la = set_find(g, is->items.data[j]);
1640 for (k = 0; k < la.cnt; k++) {
1641 int pos = symset_find(&shifts, la.syms[k]);
1643 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1644 prtxt(g->symtab[la.syms[k]]->name);
1646 report_item(g, shifts.data[pos]);
1647 report_item(g, itm);
1650 pos = symset_find(&reduce, la.syms[k]);
1652 symset_add(&reduce, la.syms[k], itm);
1655 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1656 prtxt(g->symtab[la.syms[k]]->name);
1658 report_item(g, itm);
1659 report_item(g, reduce.data[pos]);
1663 symset_free(shifts);
1664 symset_free(reduce);
1670 ## Generating the parser
1672 The exported part of the parser is the `parse_XX` function, where the name
1673 `XX` is based on the name of the parser files.
1675 This takes a `code_node`, a partially initialized `token_config`, and an
1676 optional `FILE` to send tracing to. The `token_config` gets the list of
1677 known words added and then is used with the `code_node` to initialize the
1680 `parse_XX` then calls the library function `parser_run` to actually complete
1681 the parse. This needs the `states` table and functions to call the various
1682 pieces of code provided in the grammar file, so they are generated first.
1684 ###### parser_generate
1686 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1691 gen_reduce(f, g, file);
1694 fprintf(f, "#line 0 \"gen_parser\"\n");
1695 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1698 fprintf(f, "\tstruct token_state *tokens;\n");
1699 fprintf(f, "\tconfig->words_marks = known;\n");
1700 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1701 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1702 fprintf(f, "\ttokens = token_open(code, config);\n");
1703 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
1704 fprintf(f, "\ttoken_close(tokens);\n");
1705 fprintf(f, "\treturn rv;\n");
1706 fprintf(f, "}\n\n");
1709 ### Table words table
1711 The known words table is simply an array of terminal symbols.
1715 static void gen_known(FILE *f, struct grammar *g)
1718 fprintf(f, "#line 0 \"gen_known\"\n");
1719 fprintf(f, "static const char *known[] = {\n");
1720 for (i = TK_reserved;
1721 i < g->num_syms && g->symtab[i]->type == Terminal;
1723 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1724 g->symtab[i]->name.txt);
1725 fprintf(f, "};\n\n");
1728 ### States and the goto tables.
1730 For each state we record the goto table and the reducible production if
1732 Some of the details of the reducible production are stored in the
1733 `do_reduce` function to come later. Here we store the production number,
1734 the body size (useful for stack management) and the resulting symbol (useful
1735 for knowing how to free data later).
1737 The go to table is stored in a simple array of `sym` and corresponding
1740 ###### exported types
1748 const struct lookup * go_to;
1757 static void gen_goto(FILE *f, struct grammar *g)
1760 fprintf(f, "#line 0 \"gen_goto\"\n");
1761 for (i = 0; i < g->states; i++) {
1763 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1765 struct symset gt = g->statetab[i]->go_to;
1766 for (j = 0; j < gt.cnt; j++)
1767 fprintf(f, "\t{ %d, %d },\n",
1768 gt.syms[j], gt.data[j]);
1775 static void gen_states(FILE *f, struct grammar *g)
1778 fprintf(f, "#line 0 \"gen_states\"\n");
1779 fprintf(f, "static const struct state states[] = {\n");
1780 for (i = 0; i < g->states; i++) {
1781 struct itemset *is = g->statetab[i];
1783 for (j = 0; j < is->items.cnt; j++) {
1784 int itm = is->items.syms[j];
1785 int p = item_prod(itm);
1786 int bp = item_index(itm);
1787 struct production *pr = g->productions[p];
1789 if (bp < pr->body_size)
1791 /* This is what we reduce */
1796 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
1797 i, is->go_to.cnt, i, prod,
1798 prod < 0 ? -1 : g->productions[prod]->body_size,
1799 prod < 0 ? -1 : g->productions[prod]->head->num);
1801 fprintf(f, "};\n\n");
1804 ### The `do_reduce` function and the code
1806 When the parser engine decides to reduce a production, it calls `do_reduce`.
1807 This has two functions.
1809 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1810 production being reduced. Secondly it runs the code that was included with
1811 the production if any.
1813 This code needs to be able to store data somewhere. Rather than requiring
1814 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1815 `do_reduce` return the size to be saved.
1817 The code fragment requires translation when written out. Any `$N` needs to
1818 be converted to a reference either to that buffer (if `$0`) or to the
1819 structure returned by a previous reduction. These pointers need to be cast
1820 to the appropriate type for each access. All this is handled in
1826 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1829 fprintf(f, "\t\t\t");
1830 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1839 if (*c < '0' || *c > '9') {
1844 while (c[1] >= '0' && c[1] <= '9') {
1846 n = n * 10 + *c - '0';
1849 fprintf(f, "(*(struct %.*s*)ret)",
1850 p->head->struct_name.len,
1851 p->head->struct_name.txt);
1852 else if (n > p->body_size)
1853 fprintf(f, "$%d", n);
1854 else if (p->body[n-1]->type == Terminal)
1855 fprintf(f, "(*(struct token *)body[%d])",
1857 else if (p->body[n-1]->struct_name.txt == NULL)
1858 fprintf(f, "$%d", n);
1860 fprintf(f, "(*(struct %.*s*)body[%d])",
1861 p->body[n-1]->struct_name.len,
1862 p->body[n-1]->struct_name.txt, n-1);
1869 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1872 fprintf(f, "#line 0 \"gen_reduce\"\n");
1873 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
1874 fprintf(f, " void *ret, FILE *trace)\n");
1876 fprintf(f, "\tint ret_size = 0;\n");
1878 fprintf(f, "\tswitch(prod) {\n");
1879 for (i = 0; i < g->production_count; i++) {
1880 struct production *p = g->productions[i];
1881 fprintf(f, "\tcase %d:\n", i);
1886 fprintf(f, "\t\tif (trace) {\n");
1887 fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
1888 p->head->name.len, p->head->name.txt);
1889 for (j = 0; j < p->body_size; j++) {
1890 if (p->body[j]->type == Terminal) {
1891 fputs("\t\t\tfputs(\" \", trace);\n", f);
1892 fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
1894 fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
1895 p->body[j]->name.len,
1896 p->body[j]->name.txt);
1899 fprintf(f, "\t\t}\n");
1901 if (p->head->struct_name.txt)
1902 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1903 p->head->struct_name.len,
1904 p->head->struct_name.txt);
1906 fprintf(f, "\t\tbreak;\n");
1908 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1913 As each non-terminal can potentially cause a different type of data
1914 structure to be allocated and filled in, we need to be able to free it when
1917 It is particularly important to have fine control over freeing during error
1918 recovery where individual stack frames might need to be freed.
1920 For this, the grammar author is required to define a `free_XX` function for
1921 each structure that is used by a non-terminal. `do_free` will call whichever
1922 is appropriate given a symbol number, and will call `free` (as is
1923 appropriate for tokens) on any terminal symbol.
1927 static void gen_free(FILE *f, struct grammar *g)
1931 fprintf(f, "#line 0 \"gen_free\"\n");
1932 fprintf(f, "static void do_free(short sym, void *asn)\n");
1934 fprintf(f, "\tif (!asn) return;\n");
1935 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
1936 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
1937 fprintf(f, "\tswitch(sym) {\n");
1939 for (i = 0; i < g->num_syms; i++) {
1940 struct symbol *s = g->symtab[i];
1942 s->type != Nonterminal ||
1943 s->struct_name.txt == NULL)
1946 fprintf(f, "\tcase %d:\n", s->num);
1947 fprintf(f, "\t\tfree_%.*s(asn);\n",
1949 s->struct_name.txt);
1950 fprintf(f, "\t\tbreak;\n");
1952 fprintf(f, "\t}\n}\n\n");
1955 ## The main routine.
1957 There are three key parts to the "main" function of parsergen: processing
1958 the arguments, loading the grammar file, and dealing with the grammar.
1960 ### Argument processing.
1962 Command line options allow the selection of analysis mode, name of output
1963 file, and whether or not a report should be generated. By default we create
1964 a report only if no code output was requested.
1966 The `parse_XX` function name uses the basename of the output file which
1967 should not have a suffix (`.c`). `.c` is added to the given name for the
1968 code, and `.h` is added for the header (if header text is specified in the
1975 static const struct option long_options[] = {
1976 { "LR0", 0, NULL, '0' },
1977 { "LR05", 0, NULL, '5' },
1978 { "SLR", 0, NULL, 'S' },
1979 { "LALR", 0, NULL, 'L' },
1980 { "LR1", 0, NULL, '1' },
1981 { "report", 0, NULL, 'R' },
1982 { "output", 1, NULL, 'o' },
1983 { NULL, 0, NULL, 0 }
1985 const char *options = "05SL1Ro:";
1987 ###### process arguments
1989 char *outfile = NULL;
1993 enum grammar_type type = LR05;
1994 while ((opt = getopt_long(argc, argv, options,
1995 long_options, NULL)) != -1) {
2010 outfile = optarg; break;
2012 fprintf(stderr, "Usage: parsergen ...\n");
2017 infile = argv[optind++];
2019 fprintf(stderr, "No input file given\n");
2022 if (outfile && report == 1)
2025 if (name && strchr(name, '/'))
2026 name = strrchr(name, '/')+1;
2028 if (optind < argc) {
2029 fprintf(stderr, "Excess command line arguments\n");
2033 ### Loading the grammar file
2035 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2038 Once we have extracted the code (with `mdcode`) we expect to find three
2039 sections: header, code, and grammar. Anything else is an error.
2041 "header" and "code" are optional, though it is hard to build a working
2042 parser with neither. "grammar" must be provided.
2046 #include <sys/mman.h>
2051 static void pr_err(char *msg)
2054 fprintf(stderr, "%s\n", msg);
2058 struct section *table;
2062 fd = open(infile, O_RDONLY);
2064 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2065 infile, strerror(errno));
2068 len = lseek(fd, 0, 2);
2069 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2070 table = code_extract(file, file+len, pr_err);
2072 struct code_node *hdr = NULL;
2073 struct code_node *code = NULL;
2074 struct code_node *gram = NULL;
2075 for (s = table; s; s = s->next) {
2076 if (text_is(s->section, "header"))
2078 else if (text_is(s->section, "code"))
2080 else if (text_is(s->section, "grammar"))
2083 fprintf(stderr, "Unknown content section: %.*s\n",
2084 s->section.len, s->section.txt);
2089 ### Processing the input
2091 As we need to append an extension to a filename and then open it for
2092 writing, and we need to do this twice, it helps to have a separate function.
2096 static FILE *open_ext(char *base, char *ext)
2098 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2100 strcat(strcpy(fn, base), ext);
2106 If we can read the grammar, then we analyse and optionally report on it. If
2107 the report finds conflicts we will exit with an error status.
2109 ###### process input
2110 struct grammar *g = NULL;
2112 fprintf(stderr, "No grammar section provided\n");
2115 g = grammar_read(gram);
2117 fprintf(stderr, "Failure to parse grammar\n");
2122 grammar_analyse(g, type);
2124 if (grammar_report(g, type))
2128 If a header section is defined, we write it out and include a declaration
2129 for the `parse_XX` function so it can be used from separate code.
2131 if (rv == 0 && hdr && outfile) {
2132 FILE *f = open_ext(outfile, ".h");
2134 code_node_print(f, hdr, infile);
2135 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2139 fprintf(stderr, "Cannot create %s.h\n",
2145 And if all goes well and an output file was provided, we create the `.c`
2146 file with the code section (if any) and the parser tables and function.
2148 if (rv == 0 && outfile) {
2149 FILE *f = open_ext(outfile, ".c");
2152 code_node_print(f, code, infile);
2153 gen_parser(f, g, infile, name);
2156 fprintf(stderr, "Cannot create %s.c\n",
2162 And that about wraps it up. We need to set the locale so that UTF-8 is
2163 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2165 ###### File: parsergen.mk
2166 parsergen : parsergen.o libscanner.o libmdcode.o
2167 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2174 int main(int argc, char *argv[])
2179 setlocale(LC_ALL,"");
2181 ## process arguments
2188 ## The SHIFT/REDUCE parser
2190 Having analysed the grammar and generated all the tables, we only need the
2191 shift/reduce engine to bring it all together.
2193 ### Goto table lookup
2195 The parser generator has nicely provided us with goto tables sorted by
2196 symbol number. We need a binary search function to find a symbol in the
2201 static int search(const struct state *l, int sym)
2204 int hi = l->go_to_cnt;
2208 while (lo + 1 < hi) {
2209 int mid = (lo + hi) / 2;
2210 if (l->go_to[mid].sym <= sym)
2215 if (l->go_to[lo].sym == sym)
2216 return l->go_to[lo].state;
2221 ### The state stack.
2223 The core data structure for the parser is the stack. This tracks all the
2224 symbols that have been recognised or partially recognised.
2226 The stack usually won't grow very large - maybe a few tens of entries. So
2227 we dynamically resize an array as required but never bother to shrink it
2230 We keep the stack as two separate allocations. One, `asn_stack` stores the
2231 "abstract syntax nodes" which are created by each reduction. When we call
2232 `do_reduce` we need to pass an array of the `asn`s of the body of the
2233 production, and by keeping a separate `asn` stack, we can just pass a
2234 pointer into this stack.
2236 The other allocation stores all other stack fields of which there are two.
2237 The `state` is the most important one and guides the parsing process. The
2238 `sym` is nearly unnecessary. However when we want to free entries from the
2239 `asn_stack`, it helps to know what type they are so we can call the right
2240 freeing function. The symbol leads us to the right free function through
2258 The operations are needed on the stack - shift (which is like push) and pop.
2260 Shift applies not only to terminals but also to non-terminals. When we
2261 reduce a production we will pop off entries corresponding to the body
2262 symbols, then push on an item for the head of the production. This last is
2263 exactly the same process as shifting in a terminal so we use the same
2266 To simplify other code we arrange for `shift` to fail if there is no `goto`
2267 state for the symbol. This is useful in basic parsing due to our design
2268 that we shift when we can, and reduce when we cannot. So the `shift`
2269 function reports if it could.
2271 So `shift` finds the next state. If that succeeds it extends the allocations
2272 if needed and pushes all the information onto the stacks.
2276 static int shift(struct parser *p,
2278 const struct state states[])
2280 // Push an entry onto the stack
2281 int newstate = search(&states[p->state], sym);
2284 if (p->tos >= p->stack_size) {
2285 p->stack_size += 10;
2286 p->stack = realloc(p->stack, p->stack_size
2287 * sizeof(p->stack[0]));
2288 p->asn_stack = realloc(p->asn_stack, p->stack_size
2289 * sizeof(p->asn_stack[0]));
2291 p->stack[p->tos].state = p->state;
2292 p->stack[p->tos].sym = sym;
2293 p->asn_stack[p->tos] = asn;
2295 p->state = newstate;
2299 `pop` simply moves the top of stack (`tos`) back down the required amount
2300 and frees any `asn` entries that need to be freed. It is called _after_ we
2301 reduce a production, just before we `shift` the nonterminal in.
2305 static void pop(struct parser *p, int num,
2306 void(*do_free)(short sym, void *asn))
2310 for (i = 0; i < num; i++)
2311 do_free(p->stack[p->tos+i].sym,
2312 p->asn_stack[p->tos+i]);
2314 p->state = p->stack[p->tos].state;
2317 ### Memory allocation
2319 The `scanner` returns tokens in a local variable - we want them in allocated
2320 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2321 a reduce is in a large buffer. Both of these require some allocation and
2322 copying, hence `memdup` and `tokcopy`.
2324 ###### parser includes
2329 void *memdup(void *m, int len)
2335 memcpy(ret, m, len);
2339 static struct token *tok_copy(struct token tk)
2341 struct token *new = malloc(sizeof(*new));
2346 ### The heart of the parser.
2348 Now we have the parser. If we can shift, we do. If not and we can reduce
2349 we do. If the production we reduced was production zero, then we have
2350 accepted the input and can finish.
2352 If we can neither shift nor reduce we have an error to handle. We pop
2353 single entries off the stack until we can shift the `TK_error` symbol, then
2354 drop input tokens until we find one we can shift into the new error state.
2356 We return whatever `asn` was returned by reducing production zero.
2358 ###### parser includes
2361 void *parser_run(struct token_state *tokens,
2362 const struct state states[],
2363 int (*do_reduce)(int, int, void**, void*, FILE*),
2364 void (*do_free)(short, void*),
2367 struct parser p = { 0 };
2372 tk = tok_copy(token_next(tokens));
2374 if (shift(&p, tk->num, tk, states)) {
2376 fputs("Shift ", trace);
2377 text_dump(trace, tk->txt, 20);
2380 tk = tok_copy(token_next(tokens));
2383 if (states[p.state].reduce_prod >= 0) {
2385 int prod = states[p.state].reduce_prod;
2386 int size = states[p.state].reduce_size;
2387 int sym = states[p.state].reduce_sym;
2389 static char buf[16*1024];
2391 body = p.asn_stack +
2392 (p.tos - states[p.state].reduce_size);
2394 bufsize = do_reduce(prod, p.tos, body,
2399 pop(&p, size, do_free);
2400 shift(&p, sym, memdup(buf, bufsize), states);
2405 /* Error. we walk up the stack until we
2406 * find a state which will accept TK_error.
2407 * We then shift in TK_error and see what state
2408 * that takes us too.
2409 * Then we discard input tokens until
2410 * we find one that is acceptable.
2412 while (shift(&p, TK_error, tk, states) == 0
2414 // discard this state
2415 pop(&p, 1, do_free);
2417 while (search(&states[p.state], tk->num) < 0 &&
2418 tk->num != TK_eof) {
2420 tk = tok_copy(token_next(tokens));
2422 if (p.tos == 0 && tk->num == TK_eof)
2427 ret = p.asn_stack[0];
2429 pop(&p, p.tos, do_free);
2435 ###### exported functions
2436 void *parser_run(struct token_state *tokens,
2437 const struct state states[],
2438 int (*do_reduce)(int, int, void**, void*, FILE*),
2439 void (*do_free)(short, void*),
2445 The obvious example for a parser is a calculator.
2447 As `scanner` provides a number parsing function using `libgmp`, it is not much
2448 work to perform arbitrary rational number calculations.
2450 This calculator takes one expression, or an equality test per line. The
2451 results are printed and if any equality test fails, the program exits with
2454 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2455 better approach, but as the grammar file must have 3 components I need
2456 something like this.
2458 ###### File: parsergen.mk
2459 calc.c : parsergen calc.cgm
2460 ./parsergen -o calc calc.cgm
2461 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2462 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2470 // what do we use for a demo-grammar? A calculator of course.
2484 #include <sys/mman.h>
2489 #include "scanner.h"
2495 static void free_number(struct number *n)
2501 int main(int argc, char *argv[])
2503 int fd = open(argv[1], O_RDONLY);
2504 int len = lseek(fd, 0, 2);
2505 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2506 struct section *s = code_extract(file, file+len, NULL);
2507 struct token_config config = {
2508 .ignored = (1 << TK_line_comment)
2509 | (1 << TK_block_comment)
2512 .number_chars = ".,_+-",
2516 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2524 Session -> Session Line
2527 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2528 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2529 gmp_printf(" or as a decimal: %Fg\n", fl);
2533 | Expression = Expression NEWLINE ${
2534 if (mpq_equal($1.val, $3.val))
2535 gmp_printf("Both equal %Qd\n", $1.val);
2537 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2542 | NEWLINE ${ printf("Blank line\n"); }$
2543 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2546 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2547 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2548 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2550 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2551 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2552 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2554 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2555 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$