1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h`. files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
62 [scanner]: scanner.html
68 ###### parser includes
72 The grammar contains production sets, precedence/associativity
73 declarations, and data type declarations. These are all parsed with
74 _ad hoc_ parsing as we don't have a parser generator yet.
76 The precedence and associativity can be set for each production, but
77 can be inherited from symbols. The data type is potentially defined
78 for each non-terminal and describes what C structure is used to store
79 information about each symbol.
82 enum assoc {Left, Right, Non};
83 char *assoc_names[] = {"Left","Right","Non"};
86 struct text struct_name;
88 unsigned short precedence;
92 unsigned short precedence;
100 The strings reported by `mdcode` and `scanner` are `struct text` which have
101 length rather than being null terminated. To help with printing and
102 comparing we define `text_is` and `prtxt`, which should possibly go in
103 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
104 which might contain control characters.
107 static int text_is(struct text t, char *s)
109 return (strlen(s) == t.len &&
110 strncmp(s, t.txt, t.len) == 0);
112 static void prtxt(struct text t)
114 printf("%.*s", t.len, t.txt);
119 Productions are comprised primarily of symbols - terminal and
120 non-terminal. We do not make a syntactic distinction between the two,
121 though non-terminals must be identifiers. Non-terminal symbols are
122 those which appear in the head of a production, terminal symbols are
123 those which don't. There are also "virtual" symbols used for precedence
124 marking discussed later, and sometimes we won't know what type a symbol
127 ###### forward declarations
128 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
129 char *symtypes = "UVTN";
133 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
134 table of known symbols and the resulting parser will report them as
135 `TK_reserved + N`. A small set of identifiers are reserved for the
136 different token types that `scanner` can report.
139 static char *reserved_words[] = {
140 [TK_error] = "ERROR",
141 [TK_number] = "NUMBER",
142 [TK_ident] = "IDENTIFIER",
144 [TK_string] = "STRING",
145 [TK_multi_string] = "MULTI_STRING",
146 [TK_indent] = "INDENT",
147 [TK_undent] = "UNDENT",
148 [TK_newline] = "NEWLINE",
154 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
155 recognised. The former is automatically expected at the end of the text
156 being parsed. The latter are always ignored by the parser.
158 All of these symbols are stored in a simple symbol table. We use the
159 `struct text` from `mdcode` to store the name and link them together
160 into a sorted list using an insertion sort.
162 We don't have separate `find` and `insert` functions as any symbol we
163 find needs to be remembered. We simply expect `find` to always return a
164 symbol, but its type might be `Unknown`.
173 ###### grammar fields
178 static int text_cmp(struct text a, struct text b)
183 int cmp = strncmp(a.txt, b.txt, len);
187 return a.len - b.len;
190 static struct symbol *sym_find(struct grammar *g, struct text s)
192 struct symbol **l = &g->syms;
197 (cmp = text_cmp((*l)->name, s)) < 0)
201 n = calloc(1, sizeof(*n));
210 static void symbols_init(struct grammar *g)
212 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
214 for (i = 0; i < entries; i++) {
217 t.txt = reserved_words[i];
220 t.len = strlen(t.txt);
227 ### Data types and precedence.
229 Data type specification and precedence specification are both
230 introduced by a dollar sign at the start of the line. If the next
231 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
232 precedence, otherwise it specifies a data type.
234 The data type name is simply stored and applied to the head of all
235 subsequent productions. It must be the name of a structure, so `$expr`
236 maps to `struct expr`.
238 Any productions given before the first data type will have no data type
239 and can carry no information. In order to allow other non-terminals to
240 have no type, the data type `$void` can be given. This does *not* mean
241 that `struct void` will be used, but rather that no type will be
242 associated with future non-terminals.
244 The precedence line must contain a list of symbols - typically
245 terminal symbols, but not necessarily. It can only contain symbols
246 that have not been seen yet, so precedence declaration must precede
247 the productions that they affect.
249 A precedence line may also contain a "virtual" symbol which is an
250 identifier preceded by `$$`. Virtual terminals carry precedence
251 information but are not included in the grammar. A production can
252 declare that it inherits the precedence of a given virtual symbol.
254 This use for `$$` precludes it from being used as a symbol in the
255 described language. Two other symbols: `${` and `}$` are also
258 Each new precedence line introduces a new precedence level and
259 declares how it associates. This level is stored in each symbol
260 listed and may be inherited by any production which uses the symbol. A
261 production inherits from the last symbol which has a precedence.
263 ###### grammar fields
264 struct text current_type;
268 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
269 static const char *known[] = { "$$", "${", "}$" };
272 static char *dollar_line(struct token_state *ts, struct grammar *g)
274 struct token t = token_next(ts);
279 if (t.num != TK_ident) {
280 err = "type or assoc expected after '$'";
283 if (text_is(t.txt, "LEFT"))
285 else if (text_is(t.txt, "RIGHT"))
287 else if (text_is(t.txt, "NON"))
290 g->current_type = t.txt;
291 if (text_is(t.txt, "void"))
292 g->current_type.txt = NULL;
294 if (t.num != TK_newline) {
295 err = "Extra tokens after type name";
301 // This is a precedence line, need some symbols.
305 while (t.num != TK_newline) {
306 enum symtype type = Terminal;
308 if (t.num == TK_virtual) {
311 if (t.num != TK_ident) {
312 err = "$$ must be followed by a word";
315 } else if (t.num != TK_ident &&
317 err = "Illegal token in precedence line";
320 s = sym_find(g, t.txt);
321 if (s->type != Unknown) {
322 err = "Symbols in precedence line must not already be known.";
326 s->precedence = g->prec_levels;
331 err = "No symbols given on precedence line";
335 while (t.num != TK_newline && t.num != TK_eof)
342 A production either starts with an identifier which is the head
343 non-terminal, or a vertical bar (`|`) in which case this production
344 uses the same head as the previous one. The identifier must be
345 followed by a `->` mark. All productions for a given non-terminal must
346 be grouped together with the `nonterminal ->` given only once.
348 After this a (possibly empty) sequence of identifiers and marks form
349 the body of the production. A virtual symbol may be given after the
350 body by preceding it with `$$`. If a virtual symbol is given, the
351 precedence of the production is that for the virtual symbol. If none
352 is given, the precedence is inherited from the last symbol in the
353 production which has a precedence specified.
355 After the optional precedence may come the `${` mark. This indicates
356 the start of a code fragment. If present, this must be on the same
357 line as the start of the production.
359 All of the text from the `${` through to the matching `}$` is
360 collected and forms the code-fragment for the production. It must all
361 be in one `code_node` of the literate code. The `}$` must be
362 at the end of a line.
364 Text in the code fragment will undergo substitutions where `$N` for
365 some numeric `N` will be replaced with a variable holding the parse
366 information for the particular symbol in the production. `$0` is the
367 head of the production, `$1` is the first symbol of the body, etc.
368 The type of `$N` for a terminal symbol is `struct token`. For
369 non-terminal, it is whatever has been declared for that symbol.
371 While building productions we will need to add to an array which needs to
375 static void array_add(void *varray, int *cnt, void *new)
377 void ***array = varray;
380 current = ((*cnt-1) | (step-1))+1;
381 if (*cnt == current) {
384 *array = realloc(*array, current * sizeof(void*));
386 (*array)[*cnt] = new;
390 Collecting the code fragment simply involves reading tokens until we
391 hit the end token `}$`, and noting the character position of the start and
395 static struct text collect_code(struct token_state *state,
400 code.txt = start.txt.txt + start.txt.len;
402 t = token_next(state);
403 while (t.node == start.node &&
404 t.num != TK_close && t.num != TK_error &&
406 if (t.num == TK_close && t.node == start.node)
407 code.len = t.txt.txt - code.txt;
413 Now we have all the bits we need to parse a full production.
418 ###### grammar fields
419 struct production **productions;
420 int production_count;
422 ###### production fields
424 struct symbol **body;
429 int first_production;
432 static char *parse_production(struct grammar *g,
434 struct token_state *state)
436 /* Head has already been parsed. */
439 struct production p, *pp;
441 memset(&p, 0, sizeof(p));
443 tk = token_next(state);
444 while (tk.num == TK_ident || tk.num == TK_mark) {
445 struct symbol *bs = sym_find(g, tk.txt);
446 if (bs->type == Unknown)
448 if (bs->type == Virtual) {
449 err = "Virtual symbol not permitted in production";
452 if (bs->precedence) {
453 p.precedence = bs->precedence;
456 array_add(&p.body, &p.body_size, bs);
457 tk = token_next(state);
459 if (tk.num == TK_virtual) {
461 tk = token_next(state);
462 if (tk.num != TK_ident) {
463 err = "word required after $$";
466 vs = sym_find(g, tk.txt);
467 if (vs->type != Virtual) {
468 err = "symbol after $$ must be virtual";
471 p.precedence = vs->precedence;
473 tk = token_next(state);
475 if (tk.num == TK_open) {
476 p.code = collect_code(state, tk);
477 if (p.code.txt == NULL) {
478 err = "code fragment not closed properly";
481 tk = token_next(state);
483 if (tk.num != TK_newline && tk.num != TK_eof) {
484 err = "stray tokens at end of line";
487 pp = malloc(sizeof(*pp));
489 array_add(&g->productions, &g->production_count, pp);
492 while (tk.num != TK_newline && tk.num != TK_eof)
493 tk = token_next(state);
497 With the ability to parse production and dollar-lines, we have nearly all
498 that we need to parse a grammar from a `code_node`.
500 The head of the first production will effectively be the `start` symbol of
501 the grammar. However it won't _actually_ be so. Processing the grammar is
502 greatly simplified if the real start symbol only has a single production,
503 and expects `$eof` as the final terminal. So when we find the first explicit
504 production we insert an extra production as production zero which looks like
506 ###### Example: production 0
509 where `START` is the first non-terminal given.
511 ###### create production zero
512 struct production *p = calloc(1,sizeof(*p));
513 struct text start = {"$start",6};
514 struct text eof = {"$eof",4};
515 p->head = sym_find(g, start);
516 p->head->type = Nonterminal;
517 array_add(&p->body, &p->body_size, head);
518 array_add(&p->body, &p->body_size, sym_find(g, eof));
519 g->start = p->head->num;
520 p->head->first_production = g->production_count;
521 array_add(&g->productions, &g->production_count, p);
523 Now we are ready to read in the grammar.
525 ###### grammar fields
526 int start; // the 'start' symbol.
529 static struct grammar *grammar_read(struct code_node *code)
531 struct token_config conf = {
534 .words_marks = known,
535 .known_count = sizeof(known)/sizeof(known[0]),
537 .ignored = (1 << TK_line_comment)
538 | (1 << TK_block_comment)
541 | (1 << TK_multi_string)
546 struct token_state *state = token_open(code, &conf);
547 struct token tk = token_next(state);
548 struct symbol *head = NULL;
552 g = calloc(1, sizeof(*g));
555 for (tk = token_next(state); tk.num != TK_eof;
556 tk = token_next(state)) {
557 if (tk.num == TK_newline)
559 if (tk.num == TK_ident) {
561 head = sym_find(g, tk.txt);
562 if (head->type == Nonterminal)
563 err = "This non-terminal has already be used.";
564 else if (head->type == Virtual)
565 err = "Virtual symbol not permitted in head of production";
567 head->type = Nonterminal;
568 head->struct_name = g->current_type;
570 ## create production zero
572 head->first_production = g->production_count;
573 tk = token_next(state);
574 if (tk.num == TK_mark &&
575 text_is(tk.txt, "->"))
576 err = parse_production(g, head, state);
578 err = "'->' missing in production";
580 } else if (tk.num == TK_mark
581 && text_is(tk.txt, "|")) {
582 // another production for same non-term
584 err = parse_production(g, head, state);
586 err = "First production must have a head";
587 } else if (tk.num == TK_mark
588 && text_is(tk.txt, "$")) {
589 err = dollar_line(state, g);
591 err = "Unrecognised token at start of line.";
599 fprintf(stderr, "Error at line %d: %s\n",
606 ## Analysing the grammar
608 The central task in analysing the grammar is to determine a set of
609 states to drive the parsing process. This will require finding
610 various sets of symbols and of "items". Some of these sets will need
611 to attach information to each element in the set, so they are more
614 Each "item" may have a 'look-ahead' or `LA` set associated with
615 it. Many of these will be the same as each other. To avoid memory
616 wastage, and to simplify some comparisons of sets, these sets will be
617 stored in a list of unique sets, each assigned a number.
619 Once we have the data structures in hand to manage these sets and
620 lists, we can start setting the 'nullable' flag, build the 'FIRST'
621 sets, and then create the item sets which define the various states.
625 Though we don't only store symbols in these sets, they are the main
626 things we store, so they are called symbol sets or "symsets".
628 A symset has a size, an array of shorts, and an optional array of data
629 values, which are also shorts. If the array of data is not present,
630 we store an impossible pointer, as `NULL` is used to indicate that no
631 memory has been allocated yet.
636 unsigned short *syms, *data;
638 #define NO_DATA ((unsigned short *)1)
639 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
640 const struct symset INIT_DATASET = { 0, NULL, NULL };
642 The arrays of shorts are allocated in blocks of 8 and are kept sorted
643 by using an insertion sort. We don't explicitly record the amount of
644 allocated space as it can be derived directly from the current `cnt` using
645 `((cnt - 1) | 7) + 1`.
648 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
651 int current = ((s->cnt-1) | 7) + 1;
652 if (current == s->cnt) {
654 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
655 if (s->data != NO_DATA)
656 s->data = realloc(s->data, sizeof(*s->data) * current);
659 while (i > 0 && s->syms[i-1] > key) {
660 s->syms[i] = s->syms[i-1];
661 if (s->data != NO_DATA)
662 s->data[i] = s->data[i-1];
666 if (s->data != NO_DATA)
671 Finding a symbol (or item) in a `symset` uses a simple binary search.
672 We return the index where the value was found (so data can be accessed),
673 or `-1` to indicate failure.
675 static int symset_find(struct symset *ss, unsigned short key)
682 while (lo + 1 < hi) {
683 int mid = (lo + hi) / 2;
684 if (ss->syms[mid] <= key)
689 if (ss->syms[lo] == key)
694 We will often want to form the union of two symsets. When we do, we
695 will often want to know if anything changed (as that might mean there
696 is more work to do). So `symset_union` reports whether anything was
697 added to the first set. We use a slow quadratic approach as these
698 sets don't really get very big. If profiling shows this to be a problem it
699 can be optimised later.
701 static int symset_union(struct symset *a, struct symset *b)
705 for (i = 0; i < b->cnt; i++)
706 if (symset_find(a, b->syms[i]) < 0) {
707 unsigned short data = 0;
708 if (b->data != NO_DATA)
710 symset_add(a, b->syms[i], data);
716 And of course we must be able to free a symset.
718 static void symset_free(struct symset ss)
721 if (ss.data != NO_DATA)
727 Some symsets are simply stored somewhere appropriate in the `grammar`
728 data structure, others need a bit of indirection. This applies
729 particularly to "LA" sets. These will be paired with "items" in an
730 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
731 stored in the `data` field, so we need a mapping from a small (short)
732 number to an LA `symset`.
734 As mentioned earlier this involves creating a list of unique symsets.
736 For now, we just use a linear list sorted by insertion. A skiplist
737 would be more efficient and may be added later.
744 struct setlist *next;
747 ###### grammar fields
748 struct setlist *sets;
753 static int ss_cmp(struct symset a, struct symset b)
756 int diff = a.cnt - b.cnt;
759 for (i = 0; i < a.cnt; i++) {
760 diff = (int)a.syms[i] - (int)b.syms[i];
767 static int save_set(struct grammar *g, struct symset ss)
769 struct setlist **sl = &g->sets;
773 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
780 s = malloc(sizeof(*s));
789 Finding a set by number is currently performed by a simple linear search.
790 If this turns out to hurt performance, we can store the sets in a dynamic
791 array like the productions.
793 static struct symset set_find(struct grammar *g, int num)
795 struct setlist *sl = g->sets;
796 while (sl && sl->num != num)
802 ### Setting `nullable`
804 We set `nullable` on the head symbol for any production for which all
805 the body symbols (if any) are nullable. As this is a recursive
806 definition, any change in the `nullable` setting means that we need to
807 re-evaluate where it needs to be set.
809 We simply loop around performing the same calculations until no more
816 static void set_nullable(struct grammar *g)
819 while (check_again) {
822 for (p = 0; p < g->production_count; p++) {
823 struct production *pr = g->productions[p];
826 if (pr->head->nullable)
828 for (s = 0; s < pr->body_size; s++)
829 if (! pr->body[s]->nullable)
831 if (s == pr->body_size) {
832 pr->head->nullable = 1;
839 ### Building the `first` sets
841 When calculating what can follow a particular non-terminal, we will need to
842 know what the "first" terminal in a subsequent non-terminal might be. So
843 we calculate the `first` set for every non-terminal and store them in an
844 array. We don't bother recording the "first" set for terminals as they are
847 As the "first" for one symbol might depend on the "first" of another,
848 we repeat the calculations until no changes happen, just like with
849 `set_nullable`. This makes use of the fact that `symset_union`
850 reports if any change happens.
852 The core of this which finds the "first" of part of a production body
853 will be reused for computing the "follow" sets, so we split it out
854 into a separate function.
856 ###### grammar fields
857 struct symset *first;
861 static int add_first(struct production *pr, int start,
862 struct symset *target, struct grammar *g,
867 for (s = start; s < pr->body_size; s++) {
868 struct symbol *bs = pr->body[s];
869 if (bs->type == Terminal) {
870 if (symset_find(target, bs->num) < 0) {
871 symset_add(target, bs->num, 0);
875 } else if (symset_union(target, &g->first[bs->num]))
881 *to_end = (s == pr->body_size);
885 static void build_first(struct grammar *g)
889 g->first = calloc(g->num_syms, sizeof(g->first[0]));
890 for (s = 0; s < g->num_syms; s++)
891 g->first[s] = INIT_SYMSET;
893 while (check_again) {
896 for (p = 0; p < g->production_count; p++) {
897 struct production *pr = g->productions[p];
898 struct symset *head = &g->first[pr->head->num];
900 if (add_first(pr, 0, head, g, NULL))
906 ### Building the `follow` sets.
908 There are two different situations when we will want to generate "follow"
909 sets. If we are doing an SLR analysis, we want to generate a single
910 "follow" set for each non-terminal in the grammar. That is what is
911 happening here. If we are doing an LALR or LR analysis we will want
912 to generate a separate "LA" set for each item. We do that later
915 There are two parts to generating a "follow" set. Firstly we look at
916 every place that any non-terminal appears in the body of any
917 production, and we find the set of possible "first" symbols after
918 there. This is done using `add_first` above and only needs to be done
919 once as the "first" sets are now stable and will not change.
923 for (p = 0; p < g->production_count; p++) {
924 struct production *pr = g->productions[p];
927 for (b = 0; b < pr->body_size - 1; b++) {
928 struct symbol *bs = pr->body[b];
929 if (bs->type == Terminal)
931 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
935 The second part is to add the "follow" set of the head of a production
936 to the "follow" sets of the final symbol in the production, and any
937 other symbol which is followed only by `nullable` symbols. As this
938 depends on "follow" itself we need to repeatedly perform the process
939 until no further changes happen.
943 for (again = 0, p = 0;
944 p < g->production_count;
945 p < g->production_count-1
946 ? p++ : again ? (again = 0, p = 0)
948 struct production *pr = g->productions[p];
951 for (b = pr->body_size - 1; b >= 0; b--) {
952 struct symbol *bs = pr->body[b];
953 if (bs->type == Terminal)
955 if (symset_union(&g->follow[bs->num],
956 &g->follow[pr->head->num]))
963 We now just need to create and initialise the `follow` list to get a
966 ###### grammar fields
967 struct symset *follow;
970 static void build_follow(struct grammar *g)
973 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
974 for (s = 0; s < g->num_syms; s++)
975 g->follow[s] = INIT_SYMSET;
979 ### Building itemsets and states
981 There are three different levels of detail that can be chosen for
982 building the itemsets and states for the LR grammar. They are:
984 1. LR(0) or SLR(1), where no look-ahead is considered.
985 2. LALR(1) where we build look-ahead sets with each item and merge
986 the LA sets when we find two paths to the same "kernel" set of items.
987 3. LR(1) where different look-ahead for any item in the code means
988 a different state must be created.
990 ###### forward declarations
991 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
993 We need to be able to look through existing states to see if a newly
994 generated state already exists. For now we use a simple sorted linked
997 An item is a pair of numbers: the production number and the position of
998 "DOT", which is an index into the body of the production.
999 As the numbers are not enormous we can combine them into a single "short"
1000 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1001 production number (so 4000 productions with maximum of 15 symbols in the
1004 Comparing the itemsets will be a little different to comparing symsets
1005 as we want to do the lookup after generating the "kernel" of an
1006 itemset, so we need to ignore the offset=zero items which are added during
1009 To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
1010 the list in the symset, and then only compare items before the first "0".
1013 static inline unsigned short item_num(int production, int index)
1015 return production | ((31-index) << 11);
1017 static inline int item_prod(unsigned short item)
1019 return item & 0x7ff;
1021 static inline int item_index(unsigned short item)
1023 return (31-(item >> 11)) & 0x1f;
1026 For LR(1) analysis we need to compare not just the itemset in a state
1027 but also the LA sets. As we assign each unique LA set a number, we
1028 can just compare the symset and the data values together.
1031 static int itemset_cmp(struct symset a, struct symset b,
1032 enum grammar_type type)
1038 i < a.cnt && i < b.cnt &&
1039 item_index(a.syms[i]) > 0 &&
1040 item_index(b.syms[i]) > 0;
1042 int diff = a.syms[i] - b.syms[i];
1046 diff = a.data[i] - b.data[i];
1051 if (i == a.cnt || item_index(a.syms[i]) == 0)
1055 if (i == b.cnt || item_index(b.syms[i]) == 0)
1061 if (type < LR1 || av == -1)
1064 a.data[i] - b.data[i];
1067 And now we can build the list of itemsets. The lookup routine returns
1068 both a success flag and a pointer to where in the list an insert
1069 should happen, so we don't need to search a second time.
1073 struct itemset *next;
1075 struct symset items;
1076 struct symset go_to;
1080 ###### grammar fields
1081 struct itemset *items;
1085 static int itemset_find(struct grammar *g, struct itemset ***where,
1086 struct symset kernel, enum grammar_type type)
1088 struct itemset **ip;
1090 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1091 struct itemset *i = *ip;
1093 diff = itemset_cmp(i->items, kernel, type);
1106 Adding an itemset may require merging the LA sets if LALR analysis is
1107 happening. If any new LA set adds symbols that weren't in the old LA set, we
1108 clear the `completed` flag so that the dependants of this itemset will be
1109 recalculated and their LA sets updated.
1111 `add_itemset` must consume the symsets it is passed, either by adding
1112 them to a data structure, or freeing them.
1114 static int add_itemset(struct grammar *g, struct symset ss,
1115 enum grammar_type type)
1117 struct itemset **where, *is;
1119 int found = itemset_find(g, &where, ss, type);
1121 is = calloc(1, sizeof(*is));
1122 is->state = g->states;
1126 is->go_to = INIT_DATASET;
1135 for (i = 0; i < ss.cnt; i++) {
1136 struct symset temp = INIT_SYMSET, s;
1137 if (ss.data[i] == is->items.data[i])
1139 s = set_find(g, is->items.data[i]);
1140 symset_union(&temp, &s);
1141 s = set_find(g, ss.data[i]);
1142 if (symset_union(&temp, &s)) {
1143 is->items.data[i] = save_set(g, temp);
1154 To build all the itemsets, we first insert the initial itemset made from the
1155 start symbol, complete each itemset, and then generate new itemsets from old
1156 until no new ones can be made.
1158 Completing an itemset means finding all the items where "DOT" is followed by
1159 a nonterminal and adding "DOT=0" items for every production from that
1160 non-terminal - providing each item hasn't already been added.
1162 If LA sets are needed, the LA set for each new item is found using
1163 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1164 be supplemented by the LA set for the item which produces the new item.
1166 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1167 is used in the next stage.
1169 NOTE: precedence handling should happen here - I haven't written this yet
1172 ###### complete itemset
1173 for (i = 0; i < is->items.cnt; i++) {
1174 int p = item_prod(is->items.syms[i]);
1175 int bs = item_index(is->items.syms[i]);
1176 struct production *pr = g->productions[p];
1179 struct symset LA = INIT_SYMSET;
1180 unsigned short sn = 0;
1182 if (bs == pr->body_size)
1185 if (symset_find(&done, s->num) < 0)
1186 symset_add(&done, s->num, 0);
1187 if (s->type != Nonterminal)
1193 add_first(pr, bs+1, &LA, g, &to_end);
1195 struct symset ss = set_find(g, is->items.data[i]);
1196 symset_union(&LA, &ss);
1198 sn = save_set(g, LA);
1199 LA = set_find(g, sn);
1201 /* Add productions for this symbol */
1202 for (p2 = s->first_production;
1203 p2 < g->production_count &&
1204 g->productions[p2]->head == s;
1206 int itm = item_num(p2, 0);
1207 int pos = symset_find(&is->items, itm);
1209 symset_add(&is->items, itm, sn);
1210 /* Will have re-ordered, so start
1211 * from beginning again */
1213 } else if (type >= LALR) {
1214 struct symset ss = set_find(g, is->items.data[pos]);
1215 struct symset tmp = INIT_SYMSET;
1217 symset_union(&tmp, &ss);
1218 if (symset_union(&tmp, &LA)) {
1219 is->items.data[pos] = save_set(g, tmp);
1227 For each symbol we found (and placed in `done`) we collect all the items for
1228 which this symbol is next, and create a new itemset with "DOT" advanced over
1229 the symbol. This is then added to the collection of itemsets (or merged
1230 with a pre-existing itemset).
1232 ###### derive itemsets
1233 // Now we have a completed itemset, so we need to
1234 // create all the 'next' itemset and create them
1235 // if they don't exist.
1236 for (i = 0; i < done.cnt; i++) {
1238 unsigned short state;
1239 struct symset newitemset = INIT_SYMSET;
1241 newitemset = INIT_DATASET;
1243 for (j = 0; j < is->items.cnt; j++) {
1244 int itm = is->items.syms[j];
1245 int p = item_prod(itm);
1246 int bp = item_index(itm);
1247 struct production *pr = g->productions[p];
1248 unsigned short la = 0;
1251 if (bp == pr->body_size)
1253 if (pr->body[bp]->num != done.syms[i])
1256 la = is->items.data[j];
1257 pos = symset_find(&newitemset, pr->head->num);
1259 symset_add(&newitemset, item_num(p, bp+1), la);
1260 else if (type >= LALR) {
1261 // Need to merge la set.
1262 int la2 = newitemset.data[pos];
1264 struct symset ss = set_find(g, la2);
1265 struct symset LA = INIT_SYMSET;
1266 symset_union(&LA, &ss);
1267 ss = set_find(g, la);
1268 if (symset_union(&LA, &ss))
1269 newitemset.data[pos] = save_set(g, LA);
1275 state = add_itemset(g, newitemset, type);
1276 if (symset_find(&is->go_to, done.syms[i]) < 0)
1277 symset_add(&is->go_to, done.syms[i], state);
1280 All that is left is to create the initial itemset from production zero, and
1281 with `TK_eof` as the LA set.
1284 static void build_itemsets(struct grammar *g, enum grammar_type type)
1286 struct symset first = INIT_SYMSET;
1289 unsigned short la = 0;
1291 // LA set just has eof
1292 struct symset eof = INIT_SYMSET;
1293 symset_add(&eof, TK_eof, 0);
1294 la = save_set(g, eof);
1295 first = INIT_DATASET;
1297 // production 0, offset 0 (with no data)
1298 symset_add(&first, item_num(0, 0), la);
1299 add_itemset(g, first, type);
1300 for (again = 0, is = g->items;
1302 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1304 struct symset done = INIT_SYMSET;
1315 ### Completing the analysis.
1317 The exact process of analysis depends on which level was requested. For
1318 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1319 `LR1` we need first, but not follow. For `SLR` we need both.
1321 We don't build the "action" tables that you might expect as the parser
1322 doesn't use them. They will be calculated to some extent if needed for
1325 Once we have built everything we allocate arrays for the two lists:
1326 symbols and itemsets. This allows more efficient access during reporting.
1327 The symbols are grouped as terminals and non-terminals and we record the
1328 changeover point in `first_nonterm`.
1330 ###### grammar fields
1331 struct symbol **symtab;
1332 struct itemset **statetab;
1335 ###### grammar_analyse
1337 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1341 int snum = TK_reserved;
1342 for (s = g->syms; s; s = s->next)
1343 if (s->num < 0 && s->type == Terminal) {
1347 g->first_nonterm = snum;
1348 for (s = g->syms; s; s = s->next)
1354 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1355 for (s = g->syms; s; s = s->next)
1356 g->symtab[s->num] = s;
1365 build_itemsets(g, type);
1367 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1368 for (is = g->items; is ; is = is->next)
1369 g->statetab[is->state] = is;
1372 ## Reporting on the Grammar
1374 The purpose of the report is to give the grammar developer insight into
1375 how the grammar parser will work. It is basically a structured dump of
1376 all the tables that have been generated, plus a description of any conflicts.
1378 ###### grammar_report
1379 static int grammar_report(struct grammar *g, enum grammar_type type)
1385 return report_conflicts(g, type);
1388 Firstly we have the complete list of symbols, together with the "FIRST"
1389 set if that was generated.
1393 static void report_symbols(struct grammar *g)
1397 printf("SYMBOLS + FIRST:\n");
1399 printf("SYMBOLS:\n");
1401 for (n = 0; n < g->num_syms; n++) {
1402 struct symbol *s = g->symtab[n];
1406 printf(" %c%3d%c: ",
1407 s->nullable ? '*':' ',
1408 s->num, symtypes[s->type]);
1411 printf(" (%d%s)", s->precedence,
1412 assoc_names[s->assoc]);
1414 if (g->first && s->type == Nonterminal) {
1417 for (j = 0; j < g->first[n].cnt; j++) {
1420 prtxt(g->symtab[g->first[n].syms[j]]->name);
1427 Then we have the follow sets if they were computed.
1429 static void report_follow(struct grammar *g)
1432 printf("FOLLOW:\n");
1433 for (n = 0; n < g->num_syms; n++)
1434 if (g->follow[n].cnt) {
1438 prtxt(g->symtab[n]->name);
1439 for (j = 0; j < g->follow[n].cnt; j++) {
1442 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1448 And finally the item sets. These include the GO TO tables and, for
1449 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1450 it up a bit. First the items, with production number and associativity.
1452 static void report_item(struct grammar *g, int itm)
1454 int p = item_prod(itm);
1455 int dot = item_index(itm);
1456 struct production *pr = g->productions[p];
1460 prtxt(pr->head->name);
1462 for (i = 0; i < pr->body_size; i++) {
1463 printf(" %s", (dot == i ? ". ": ""));
1464 prtxt(pr->body[i]->name);
1466 if (dot == pr->body_size)
1470 printf(" (%d%s)", pr->precedence,
1471 assoc_names[pr->assoc]);
1475 The LA sets which are (possibly) reported with each item:
1477 static void report_la(struct grammar *g, int lanum)
1479 struct symset la = set_find(g, lanum);
1483 printf(" LOOK AHEAD(%d)", lanum);
1484 for (i = 0; i < la.cnt; i++) {
1487 prtxt(g->symtab[la.syms[i]]->name);
1492 Then the go to sets:
1495 static void report_goto(struct grammar *g, struct symset gt)
1500 for (i = 0; i < gt.cnt; i++) {
1502 prtxt(g->symtab[gt.syms[i]]->name);
1503 printf(" -> %d\n", gt.data[i]);
1507 Now we can report all the item sets complete with items, LA sets, and GO TO.
1509 static void report_itemsets(struct grammar *g)
1512 printf("ITEM SETS(%d)\n", g->states);
1513 for (s = 0; s < g->states; s++) {
1515 struct itemset *is = g->statetab[s];
1516 printf(" Itemset %d:\n", s);
1517 for (j = 0; j < is->items.cnt; j++) {
1518 report_item(g, is->items.syms[j]);
1519 if (is->items.data != NO_DATA)
1520 report_la(g, is->items.data[j]);
1522 report_goto(g, is->go_to);
1526 ### Reporting conflicts
1528 Conflict detection varies a lot among different analysis levels. However
1529 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1530 LR1 are also similar as they have FOLLOW or LA sets.
1534 ## conflict functions
1536 static int report_conflicts(struct grammar *g, enum grammar_type type)
1539 printf("Conflicts:\n");
1541 cnt = conflicts_lr0(g, type);
1543 cnt = conflicts_slr(g, type);
1545 printf(" - no conflicts\n");
1549 LR0 conflicts are any states which have both a reducible item and
1552 LR05 conflicts only occur if two possible reductions exist,
1553 as shifts always over-ride reductions.
1555 ###### conflict functions
1556 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1560 for (i = 0; i < g->states; i++) {
1561 struct itemset *is = g->statetab[i];
1562 int last_reduce = -1;
1563 int prev_reduce = -1;
1564 int last_shift = -1;
1568 for (j = 0; j < is->items.cnt; j++) {
1569 int itm = is->items.syms[j];
1570 int p = item_prod(itm);
1571 int bp = item_index(itm);
1572 struct production *pr = g->productions[p];
1574 if (bp == pr->body_size) {
1575 prev_reduce = last_reduce;
1579 if (pr->body[bp]->type == Terminal)
1582 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1583 printf(" State %d has both SHIFT and REDUCE:\n", i);
1584 report_item(g, is->items.syms[last_shift]);
1585 report_item(g, is->items.syms[last_reduce]);
1588 if (prev_reduce >= 0) {
1589 printf(" State %d has 2 (or more) reducible items\n", i);
1590 report_item(g, is->items.syms[prev_reduce]);
1591 report_item(g, is->items.syms[last_reduce]);
1598 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1599 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1600 in the source of the look ahead set.
1602 We build a dataset mapping terminal to item for possible SHIFTs and then
1603 another for possible REDUCE operations. We report when we get conflicts
1606 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1611 for (i = 0; i < g->states; i++) {
1612 struct itemset *is = g->statetab[i];
1613 struct symset shifts = INIT_DATASET;
1614 struct symset reduce = INIT_DATASET;
1618 /* First collect the shifts */
1619 for (j = 0; j < is->items.cnt; j++) {
1620 unsigned short itm = is->items.syms[j];
1621 int p = item_prod(itm);
1622 int bp = item_index(itm);
1623 struct production *pr = g->productions[p];
1625 if (bp < pr->body_size &&
1626 pr->body[bp]->type == Terminal) {
1628 int sym = pr->body[bp]->num;
1629 if (symset_find(&shifts, sym) < 0)
1630 symset_add(&shifts, sym, itm);
1633 /* Now look for reduction and conflicts */
1634 for (j = 0; j < is->items.cnt; j++) {
1635 unsigned short itm = is->items.syms[j];
1636 int p = item_prod(itm);
1637 int bp = item_index(itm);
1638 struct production *pr = g->productions[p];
1640 if (bp < pr->body_size)
1645 la = g->follow[pr->head->num];
1647 la = set_find(g, is->items.data[j]);
1649 for (k = 0; k < la.cnt; k++) {
1650 int pos = symset_find(&shifts, la.syms[k]);
1652 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1653 prtxt(g->symtab[la.syms[k]]->name);
1655 report_item(g, shifts.data[pos]);
1656 report_item(g, itm);
1659 pos = symset_find(&reduce, la.syms[k]);
1661 symset_add(&reduce, la.syms[k], itm);
1664 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1665 prtxt(g->symtab[la.syms[k]]->name);
1667 report_item(g, itm);
1668 report_item(g, reduce.data[pos]);
1672 symset_free(shifts);
1673 symset_free(reduce);
1679 ## Generating the parser
1681 The exported part of the parser is the `parse_XX` function, where the name
1682 `XX` is based on the name of the parser files.
1684 This takes a `code_node`, a partially initialized `token_config`, and an
1685 optional `FILE` to send tracing to. The `token_config` gets the list of
1686 known words added and then is used with the `code_node` to initialize the
1689 `parse_XX` then calls the library function `parser_run` to actually complete
1690 the parse. This needs the `states` table and function to call the various
1691 pieces of code provided in the grammar file, so they are generated first.
1693 ###### parser_generate
1695 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1701 gen_reduce(f, g, file);
1704 fprintf(f, "#line 0 \"gen_parser\"\n");
1705 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1708 fprintf(f, "\tstruct token_state *tokens;\n");
1709 fprintf(f, "\tconfig->words_marks = known;\n");
1710 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1711 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1712 fprintf(f, "\ttokens = token_open(code, config);\n");
1713 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1714 fprintf(f, "\ttoken_close(tokens);\n");
1715 fprintf(f, "\treturn rv;\n");
1716 fprintf(f, "}\n\n");
1719 ### Table words table
1721 The known words table is simply an array of terminal symbols.
1722 The table of nonterminals used for tracing is a similar array.
1726 static void gen_known(FILE *f, struct grammar *g)
1729 fprintf(f, "#line 0 \"gen_known\"\n");
1730 fprintf(f, "static const char *known[] = {\n");
1731 for (i = TK_reserved;
1732 i < g->num_syms && g->symtab[i]->type == Terminal;
1734 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1735 g->symtab[i]->name.txt);
1736 fprintf(f, "};\n\n");
1739 static void gen_non_term(FILE *f, struct grammar *g)
1742 fprintf(f, "#line 0 \"gen_non_term\"\n");
1743 fprintf(f, "static const char *non_term[] = {\n");
1744 for (i = TK_reserved;
1747 if (g->symtab[i]->type == Nonterminal)
1748 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1749 g->symtab[i]->name.txt);
1750 fprintf(f, "};\n\n");
1753 ### States and the goto tables.
1755 For each state we record the goto table and the reducible production if
1757 Some of the details of the reducible production are stored in the
1758 `do_reduce` function to come later. Here we store the production number,
1759 the body size (useful for stack management) and the resulting symbol (useful
1760 for knowing how to free data later).
1762 The go to table is stored in a simple array of `sym` and corresponding
1765 ###### exported types
1773 const struct lookup * go_to;
1782 static void gen_goto(FILE *f, struct grammar *g)
1785 fprintf(f, "#line 0 \"gen_goto\"\n");
1786 for (i = 0; i < g->states; i++) {
1788 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1790 struct symset gt = g->statetab[i]->go_to;
1791 for (j = 0; j < gt.cnt; j++)
1792 fprintf(f, "\t{ %d, %d },\n",
1793 gt.syms[j], gt.data[j]);
1800 static void gen_states(FILE *f, struct grammar *g)
1803 fprintf(f, "#line 0 \"gen_states\"\n");
1804 fprintf(f, "static const struct state states[] = {\n");
1805 for (i = 0; i < g->states; i++) {
1806 struct itemset *is = g->statetab[i];
1808 for (j = 0; j < is->items.cnt; j++) {
1809 int itm = is->items.syms[j];
1810 int p = item_prod(itm);
1811 int bp = item_index(itm);
1812 struct production *pr = g->productions[p];
1814 if (bp < pr->body_size)
1816 /* This is what we reduce */
1821 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
1822 i, is->go_to.cnt, i, prod,
1823 prod < 0 ? -1 : g->productions[prod]->body_size,
1824 prod < 0 ? -1 : g->productions[prod]->head->num);
1826 fprintf(f, "};\n\n");
1829 ### The `do_reduce` function and the code
1831 When the parser engine decides to reduce a production, it calls `do_reduce`.
1832 This has two functions.
1834 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1835 production being reduced. Secondly it runs the code that was included with
1836 the production if any.
1838 This code needs to be able to store data somewhere. Rather than requiring
1839 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1840 `do_reduce` return the size to be saved.
1842 The code fragment requires translation when written out. Any `$N` needs to
1843 be converted to a reference either to that buffer (if `$0`) or to the
1844 structure returned by a previous reduction. These pointers need to be cast
1845 to the appropriate type for each access. All this is handled in
1851 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1854 fprintf(f, "\t\t\t");
1855 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1864 if (*c < '0' || *c > '9') {
1869 while (c[1] >= '0' && c[1] <= '9') {
1871 n = n * 10 + *c - '0';
1874 fprintf(f, "(*(struct %.*s*)ret)",
1875 p->head->struct_name.len,
1876 p->head->struct_name.txt);
1877 else if (n > p->body_size)
1878 fprintf(f, "$%d", n);
1879 else if (p->body[n-1]->type == Terminal)
1880 fprintf(f, "(*(struct token *)body[%d])",
1882 else if (p->body[n-1]->struct_name.txt == NULL)
1883 fprintf(f, "$%d", n);
1885 fprintf(f, "(*(struct %.*s*)body[%d])",
1886 p->body[n-1]->struct_name.len,
1887 p->body[n-1]->struct_name.txt, n-1);
1894 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1897 fprintf(f, "#line 0 \"gen_reduce\"\n");
1898 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
1899 fprintf(f, " void *ret)\n");
1901 fprintf(f, "\tint ret_size = 0;\n");
1903 fprintf(f, "\tswitch(prod) {\n");
1904 for (i = 0; i < g->production_count; i++) {
1905 struct production *p = g->productions[i];
1906 fprintf(f, "\tcase %d:\n", i);
1911 if (p->head->struct_name.txt)
1912 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1913 p->head->struct_name.len,
1914 p->head->struct_name.txt);
1916 fprintf(f, "\t\tbreak;\n");
1918 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1923 As each non-terminal can potentially cause a different type of data
1924 structure to be allocated and filled in, we need to be able to free it when
1927 It is particularly important to have fine control over freeing during error
1928 recovery where individual stack frames might need to be freed.
1930 For this, the grammar author is required to define a `free_XX` function for
1931 each structure that is used by a non-terminal. `do_free` will call whichever
1932 is appropriate given a symbol number, and will call `free` (as is
1933 appropriate for tokens) on any terminal symbol.
1937 static void gen_free(FILE *f, struct grammar *g)
1941 fprintf(f, "#line 0 \"gen_free\"\n");
1942 fprintf(f, "static void do_free(short sym, void *asn)\n");
1944 fprintf(f, "\tif (!asn) return;\n");
1945 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
1946 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
1947 fprintf(f, "\tswitch(sym) {\n");
1949 for (i = 0; i < g->num_syms; i++) {
1950 struct symbol *s = g->symtab[i];
1952 s->type != Nonterminal ||
1953 s->struct_name.txt == NULL)
1956 fprintf(f, "\tcase %d:\n", s->num);
1957 fprintf(f, "\t\tfree_%.*s(asn);\n",
1959 s->struct_name.txt);
1960 fprintf(f, "\t\tbreak;\n");
1962 fprintf(f, "\t}\n}\n\n");
1965 ## The main routine.
1967 There are three key parts to the "main" function of parsergen: processing
1968 the arguments, loading the grammar file, and dealing with the grammar.
1970 ### Argument processing.
1972 Command line options allow the selection of analysis mode, name of output
1973 file, and whether or not a report should be generated. By default we create
1974 a report only if no code output was requested.
1976 The `parse_XX` function name uses the basename of the output file which
1977 should not have a suffix (`.c`). `.c` is added to the given name for the
1978 code, and `.h` is added for the header (if header text is specified in the
1985 static const struct option long_options[] = {
1986 { "LR0", 0, NULL, '0' },
1987 { "LR05", 0, NULL, '5' },
1988 { "SLR", 0, NULL, 'S' },
1989 { "LALR", 0, NULL, 'L' },
1990 { "LR1", 0, NULL, '1' },
1991 { "report", 0, NULL, 'R' },
1992 { "output", 1, NULL, 'o' },
1993 { NULL, 0, NULL, 0 }
1995 const char *options = "05SL1Ro:";
1997 ###### process arguments
1999 char *outfile = NULL;
2003 enum grammar_type type = LR05;
2004 while ((opt = getopt_long(argc, argv, options,
2005 long_options, NULL)) != -1) {
2020 outfile = optarg; break;
2022 fprintf(stderr, "Usage: parsergen ...\n");
2027 infile = argv[optind++];
2029 fprintf(stderr, "No input file given\n");
2032 if (outfile && report == 1)
2035 if (name && strchr(name, '/'))
2036 name = strrchr(name, '/')+1;
2038 if (optind < argc) {
2039 fprintf(stderr, "Excess command line arguments\n");
2043 ### Loading the grammar file
2045 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2048 Once we have extracted the code (with `mdcode`) we expect to find three
2049 sections: header, code, and grammar. Anything else is an error.
2051 "header" and "code" are optional, though it is hard to build a working
2052 parser with neither. "grammar" must be provided.
2056 #include <sys/mman.h>
2061 static void pr_err(char *msg)
2064 fprintf(stderr, "%s\n", msg);
2068 struct section *table;
2072 fd = open(infile, O_RDONLY);
2074 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2075 infile, strerror(errno));
2078 len = lseek(fd, 0, 2);
2079 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2080 table = code_extract(file, file+len, pr_err);
2082 struct code_node *hdr = NULL;
2083 struct code_node *code = NULL;
2084 struct code_node *gram = NULL;
2085 for (s = table; s; s = s->next) {
2086 if (text_is(s->section, "header"))
2088 else if (text_is(s->section, "code"))
2090 else if (text_is(s->section, "grammar"))
2093 fprintf(stderr, "Unknown content section: %.*s\n",
2094 s->section.len, s->section.txt);
2099 ### Processing the input
2101 As we need to append an extension to a filename and then open it for
2102 writing, and we need to do this twice, it helps to have a separate function.
2106 static FILE *open_ext(char *base, char *ext)
2108 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2110 strcat(strcpy(fn, base), ext);
2116 If we can read the grammar, then we analyse and optionally report on it. If
2117 the report finds conflicts we will exit with an error status.
2119 ###### process input
2120 struct grammar *g = NULL;
2122 fprintf(stderr, "No grammar section provided\n");
2125 g = grammar_read(gram);
2127 fprintf(stderr, "Failure to parse grammar\n");
2132 grammar_analyse(g, type);
2134 if (grammar_report(g, type))
2138 If a headers section is defined, we write it out and include a declaration
2139 for the `parse_XX` function so it can be used from separate code.
2141 if (rv == 0 && hdr && outfile) {
2142 FILE *f = open_ext(outfile, ".h");
2144 code_node_print(f, hdr, infile);
2145 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2149 fprintf(stderr, "Cannot create %s.h\n",
2155 And if all goes well and an output file was provided, we create the `.c`
2156 file with the code section (if any) and the parser tables and function.
2158 if (rv == 0 && outfile) {
2159 FILE *f = open_ext(outfile, ".c");
2162 code_node_print(f, code, infile);
2163 gen_parser(f, g, infile, name);
2166 fprintf(stderr, "Cannot create %s.c\n",
2172 And that about wraps it up. We need to set the locale so that UTF-8 is
2173 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2175 ###### File: parsergen.mk
2176 parsergen : parsergen.o libscanner.o libmdcode.o
2177 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2184 int main(int argc, char *argv[])
2189 setlocale(LC_ALL,"");
2191 ## process arguments
2198 ## The SHIFT/REDUCE parser
2200 Having analysed the grammar and generated all the tables, we only need the
2201 shift/reduce engine to bring it all together.
2203 ### Goto table lookup
2205 The parser generator has nicely provided us with goto tables sorted by
2206 symbol number. We need a binary search function to find a symbol in the
2209 ###### parser functions
2211 static int search(const struct state *l, int sym)
2214 int hi = l->go_to_cnt;
2218 while (lo + 1 < hi) {
2219 int mid = (lo + hi) / 2;
2220 if (l->go_to[mid].sym <= sym)
2225 if (l->go_to[lo].sym == sym)
2226 return l->go_to[lo].state;
2231 ### The state stack.
2233 The core data structure for the parser is the stack. This track all the
2234 symbols that have been recognised or partially recognised.
2236 The stack usually won't grow very large - maybe a few tens of entries. So
2237 we dynamically resize the array as required but never bother to shrink it
2240 We keep the stack as two separate allocations. One, `asn_stack` stores the
2241 "abstract syntax nodes" which are created by each reduction. When we call
2242 `do_reduce` we need to pass an array of the `asn`s of the body of the
2243 production, and by keeping a separate `asn` stack, we can just pass a
2244 pointer into this stack.
2246 The other allocation stores all other stack fields of which there are two.
2247 The `state` is the most important one and guides the parsing process. The
2248 `sym` is nearly unnecessary. However when we want to free entries from the
2249 `asn_stack`, it helps to know what type they are so we can call the right
2250 freeing function. The symbol leads us to the right free function through
2253 ###### parser functions
2268 The operations are needed on the stack - shift (which is like push) and pop.
2270 Shift applies not only to terminals but also to non-terminals. When we
2271 reduce a production we will pop off entries corresponding to the body
2272 symbols, then push on an item for the head of the production. This last is
2273 exactly the same process as shifting in a terminal so we use the same
2276 To simplify other code we arrange for `shift` to fail if there is no `goto`
2277 state for the symbol. This is useful in basic parsing due to our design
2278 that we shift when we can, and reduce when we cannot. So the `shift`
2279 function reports if it could.
2281 So `shift` finds the next state. If that succeeds it extends the allocations
2282 if needed and pushes all the information onto the stacks.
2284 ###### parser functions
2286 static int shift(struct parser *p,
2288 const struct state states[])
2290 // Push an entry onto the stack
2291 int newstate = search(&states[p->state], sym);
2294 if (p->tos >= p->stack_size) {
2295 p->stack_size += 10;
2296 p->stack = realloc(p->stack, p->stack_size
2297 * sizeof(p->stack[0]));
2298 p->asn_stack = realloc(p->asn_stack, p->stack_size
2299 * sizeof(p->asn_stack[0]));
2301 p->stack[p->tos].state = p->state;
2302 p->stack[p->tos].sym = sym;
2303 p->asn_stack[p->tos] = asn;
2305 p->state = newstate;
2309 `pop` simply moves the top of stack (`tos`) back down the required amount
2310 and frees any `asn` entries that need to be freed. It is called _after_ we
2311 reduce a production, just before we `shift` the nonterminal in.
2313 ###### parser functions
2315 static void pop(struct parser *p, int num,
2316 void(*do_free)(short sym, void *asn))
2320 for (i = 0; i < num; i++)
2321 do_free(p->stack[p->tos+i].sym,
2322 p->asn_stack[p->tos+i]);
2324 p->state = p->stack[p->tos].state;
2327 ### Memory allocation
2329 The `scanner` returns tokens in a local variable - we want them in allocated
2330 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2331 a reduce is in a large buffer. Both of these require some allocation and
2332 copying, hence `memdup` and `tokcopy`.
2334 ###### parser includes
2337 ###### parser functions
2339 void *memdup(void *m, int len)
2345 memcpy(ret, m, len);
2349 static struct token *tok_copy(struct token tk)
2351 struct token *new = malloc(sizeof(*new));
2356 ### The heart of the parser.
2358 Now we have the parser. If we can shift, we do. If not and we can reduce
2359 we do. If the production we reduced was production zero, then we have
2360 accepted the input and can finish.
2362 If we can neither shift nor reduce we have an error to handle. We pop
2363 single entries off the stack until we can shift the `TK_error` symbol, then
2364 drop input tokens until we find one we can shift into the new error state.
2366 We return whatever `asn` was returned by reducing production zero.
2368 ###### parser includes
2371 void *parser_run(struct token_state *tokens,
2372 const struct state states[],
2373 int (*do_reduce)(int, int, void**, void*),
2374 void (*do_free)(short, void*),
2375 FILE *trace, const char *non_term[], int knowns)
2377 struct parser p = { 0 };
2382 tk = tok_copy(token_next(tokens));
2385 parser_trace(trace, &p, tk, states, non_term, knowns);
2387 if (shift(&p, tk->num, tk, states)) {
2388 tk = tok_copy(token_next(tokens));
2391 if (states[p.state].reduce_prod >= 0) {
2393 int prod = states[p.state].reduce_prod;
2394 int size = states[p.state].reduce_size;
2395 int sym = states[p.state].reduce_sym;
2397 static char buf[16*1024];
2399 body = p.asn_stack +
2400 (p.tos - states[p.state].reduce_size);
2402 bufsize = do_reduce(prod, p.tos, body,
2405 pop(&p, size, do_free);
2406 shift(&p, sym, memdup(buf, bufsize), states);
2411 /* Error. we walk up the stack until we
2412 * find a state which will accept TK_error.
2413 * We then shift in TK_error and see what state
2414 * that takes us too.
2415 * Then we discard input tokens until
2416 * we find one that is acceptable.
2418 while (shift(&p, TK_error, tk, states) == 0
2420 // discard this state
2421 pop(&p, 1, do_free);
2423 while (search(&states[p.state], tk->num) < 0 &&
2424 tk->num != TK_eof) {
2426 tk = tok_copy(token_next(tokens));
2428 if (p.tos == 0 && tk->num == TK_eof)
2433 ret = p.asn_stack[0];
2435 pop(&p, p.tos, do_free);
2441 ###### exported functions
2442 void *parser_run(struct token_state *tokens,
2443 const struct state states[],
2444 int (*do_reduce)(int, int, void**, void*),
2445 void (*do_free)(short, void*),
2446 FILE *trace, const char *non_term[], int knowns);
2450 Being able to visualize the parser in action can be invaluable when
2451 debugging the parser code, or trying to understand how the parse of a
2452 particular grammar progresses. The stack contains all the important
2453 state, so just printing out the stack every time around the parse loop
2454 can make it possible to see exactly what is happening.
2456 This doesn't explicitly show each SHIFT and REDUCE action. However they
2457 are easily deduced from the change between consecutive lines, and the
2458 details of each state can be found by cross referencing the states list
2459 in the stack with the "report" that parsergen can generate.
2461 For terminal symbols, we just dump the token. For non-terminals we
2462 print the name of the symbol. The look ahead token is reported at the
2463 end inside square brackets.
2465 ###### parser functions
2467 void parser_trace(FILE *trace, struct parser *p,
2468 struct token *tk, const struct state states[],
2469 const char *non_term[], int knowns)
2472 for (i = 0; i < p->tos; i++) {
2473 int sym = p->stack[i].sym;
2474 fprintf(trace, "(%d) ", p->stack[i].state);
2475 if (sym < TK_reserved + knowns) {
2476 struct token *t = p->asn_stack[i];
2477 text_dump(trace, t->txt, 20);
2479 fputs(non_term[sym - TK_reserved - knowns],
2483 fprintf(trace, "(%d) [", p->state);
2484 text_dump(trace, tk->txt, 20);
2485 fputs("]\n", trace);
2490 The obvious example for a parser is a calculator.
2492 As `scanner` provides a number parsing function using `libgmp`, it is not much
2493 work to perform arbitrary rational number calculations.
2495 This calculator takes one expression, or an equality test per line. The
2496 results are printed and if any equality test fails, the program exits with
2499 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2500 better approach, but as the grammar file must have 3 components I need
2501 something like this.
2503 ###### File: parsergen.mk
2504 calc.c : parsergen calc.cgm
2505 ./parsergen -o calc calc.cgm
2506 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2507 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2515 // what do we use for a demo-grammar? A calculator of course.
2529 #include <sys/mman.h>
2534 #include "scanner.h"
2540 static void free_number(struct number *n)
2546 int main(int argc, char *argv[])
2548 int fd = open(argv[1], O_RDONLY);
2549 int len = lseek(fd, 0, 2);
2550 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2551 struct section *s = code_extract(file, file+len, NULL);
2552 struct token_config config = {
2553 .ignored = (1 << TK_line_comment)
2554 | (1 << TK_block_comment)
2557 .number_chars = ".,_+-",
2561 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2569 Session -> Session Line
2572 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2573 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2574 gmp_printf(" or as a decimal: %Fg\n", fl);
2578 | Expression = Expression NEWLINE ${
2579 if (mpq_equal($1.val, $3.val))
2580 gmp_printf("Both equal %Qd\n", $1.val);
2582 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2587 | NEWLINE ${ printf("Blank line\n"); }$
2588 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2591 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2592 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2593 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2595 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2596 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2597 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2599 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2600 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$