1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h` files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
66 and `Foo: grammar`. The tag `calc` is used to extract the same calculator
70 [scanner]: scanner.html
76 ###### parser includes
80 The grammar contains production sets, precedence/associativity
81 declarations, and data type declarations. These are all parsed with
82 _ad hoc_ parsing as we don't have a parser generator yet.
84 The precedence and associativity can be set for each production, but
85 can be inherited from symbols. The data type (either structure or a
86 reference to a structure) is potentially defined for each non-terminal
87 and describes what C structure is used to store information about each
91 enum assoc {Left, Right, Non};
92 char *assoc_names[] = {"Left","Right","Non"};
95 struct text struct_name;
98 unsigned short precedence;
102 unsigned short precedence;
110 The strings reported by `mdcode` and `scanner` are `struct text` which have
111 length rather than being null terminated. To help with printing and
112 comparing we define `text_is` and `prtxt`, which should possibly go in
113 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
114 which might contain control characters.
116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
117 because that is what we need to detect tags.
120 static int text_is(struct text t, char *s)
122 return (strlen(s) == t.len &&
123 strncmp(s, t.txt, t.len) == 0);
125 static void prtxt(struct text t)
127 printf("%.*s", t.len, t.txt);
130 static int strip_tag(struct text *t, char *tag)
132 int skip = strlen(tag) + 1;
133 if (skip >= t->len ||
134 strncmp(t->txt, tag, skip-1) != 0 ||
135 t->txt[skip-1] != ':')
137 while (skip < t->len && t->txt[skip] == ' ')
146 Productions are comprised primarily of symbols - terminal and
147 non-terminal. We do not make a syntactic distinction between the two,
148 though non-terminals must be identifiers. Non-terminal symbols are
149 those which appear in the head of a production, terminal symbols are
150 those which don't. There are also "virtual" symbols used for precedence
151 marking discussed later, and sometimes we won't know what type a symbol
154 ###### forward declarations
155 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
156 char *symtypes = "UVTN";
160 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
161 table of known symbols and the resulting parser will report them as
162 `TK_reserved + N`. A small set of identifiers are reserved for the
163 different token types that `scanner` can report.
166 static char *reserved_words[] = {
167 [TK_error] = "ERROR",
168 [TK_number] = "NUMBER",
169 [TK_ident] = "IDENTIFIER",
171 [TK_string] = "STRING",
172 [TK_multi_string] = "MULTI_STRING",
175 [TK_newline] = "NEWLINE",
181 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
182 recognised. The former is automatically expected at the end of the text
183 being parsed. The latter are always ignored by the parser.
185 All of these symbols are stored in a simple symbol table. We use the
186 `struct text` from `mdcode` to store the name and link them together
187 into a sorted list using an insertion sort.
189 We don't have separate `find` and `insert` functions as any symbol we
190 find needs to be remembered. We simply expect `find` to always return a
191 symbol, but its type might be `Unknown`.
200 ###### grammar fields
205 static struct symbol *sym_find(struct grammar *g, struct text s)
207 struct symbol **l = &g->syms;
212 (cmp = text_cmp((*l)->name, s)) < 0)
216 n = calloc(1, sizeof(*n));
225 static void symbols_init(struct grammar *g)
227 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
229 for (i = 0; i < entries; i++) {
232 t.txt = reserved_words[i];
235 t.len = strlen(t.txt);
242 ### Data types and precedence.
244 Data type specification and precedence specification are both
245 introduced by a dollar sign at the start of the line. If the next
246 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
247 precedence, otherwise it specifies a data type.
249 The data type name is simply stored and applied to the head of all
250 subsequent productions. It must be the name of a structure optionally
251 preceded by an asterisk which means a reference or "pointer". So
252 `$expression` maps to `struct expression` and `$*statement` maps to
253 `struct statement *`.
255 Any productions given before the first data type declaration will have
256 no data type associated with them and can carry no information. In
257 order to allow other non-terminals to have no type, the data type
258 `$void` can be given. This does *not* mean that `struct void` will be
259 used, but rather that no type will be associated with future
262 The precedence line must contain a list of symbols - typically
263 terminal symbols, but not necessarily. It can only contain symbols
264 that have not been seen yet, so precedence declaration must precede
265 the productions that they affect.
267 A precedence line may also contain a "virtual" symbol which is an
268 identifier preceded by `$$`. Virtual terminals carry precedence
269 information but are not included in the grammar. A production can
270 declare that it inherits the precedence of a given virtual symbol.
272 This use for `$$` precludes it from being used as a symbol in the
273 described language. Two other symbols: `${` and `}$` are also
276 Each new precedence line introduces a new precedence level and
277 declares how it associates. This level is stored in each symbol
278 listed and may be inherited by any production which uses the symbol. A
279 production inherits from the last symbol which has a precedence.
281 ###### grammar fields
282 struct text current_type;
287 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
288 static const char *known[] = { "$$", "${", "}$" };
291 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
293 struct token t = token_next(ts);
298 if (t.num != TK_ident) {
299 err = "type or assoc expected after '$'";
302 if (text_is(t.txt, "LEFT"))
304 else if (text_is(t.txt, "RIGHT"))
306 else if (text_is(t.txt, "NON"))
309 g->current_type = t.txt;
310 g->type_isref = isref;
311 if (text_is(t.txt, "void"))
312 g->current_type.txt = NULL;
314 if (t.num != TK_newline) {
315 err = "Extra tokens after type name";
322 err = "$* cannot be followed by a precedence";
326 // This is a precedence line, need some symbols.
330 while (t.num != TK_newline) {
331 enum symtype type = Terminal;
333 if (t.num == TK_virtual) {
336 if (t.num != TK_ident) {
337 err = "$$ must be followed by a word";
340 } else if (t.num != TK_ident &&
342 err = "Illegal token in precedence line";
345 s = sym_find(g, t.txt);
346 if (s->type != Unknown) {
347 err = "Symbols in precedence line must not already be known.";
351 s->precedence = g->prec_levels;
357 err = "No symbols given on precedence line";
361 while (t.num != TK_newline && t.num != TK_eof)
368 A production either starts with an identifier which is the head
369 non-terminal, or a vertical bar (`|`) in which case this production
370 uses the same head as the previous one. The identifier must be
371 followed by a `->` mark. All productions for a given non-terminal must
372 be grouped together with the `nonterminal ->` given only once.
374 After this a (possibly empty) sequence of identifiers and marks form
375 the body of the production. A virtual symbol may be given after the
376 body by preceding it with `$$`. If a virtual symbol is given, the
377 precedence of the production is that for the virtual symbol. If none
378 is given, the precedence is inherited from the last symbol in the
379 production which has a precedence specified.
381 After the optional precedence may come the `${` mark. This indicates
382 the start of a code fragment. If present, this must be on the same
383 line as the start of the production.
385 All of the text from the `${` through to the matching `}$` is
386 collected and forms the code-fragment for the production. It must all
387 be in one `code_node` of the literate code. The `}$` must be
388 at the end of a line.
390 Text in the code fragment will undergo substitutions where `$N` or
391 `$<N`, for some numeric `N`, will be replaced with a variable holding
392 the parse information for the particular symbol in the production.
393 `$0` is the head of the production, `$1` is the first symbol of the
394 body, etc. The type of `$N` for a terminal symbol is `struct token`.
395 For a non-terminal, it is whatever has been declared for that symbol.
396 The `<` may be included for symbols declared as storing a reference
397 (not a structure) and means that the reference is being moved out, so
398 it will not automatically be freed.
400 While building productions we will need to add to an array which needs to
404 static void array_add(void *varray, int *cnt, void *new)
406 void ***array = varray;
409 current = ((*cnt-1) | (step-1))+1;
410 if (*cnt == current) {
413 *array = realloc(*array, current * sizeof(void*));
415 (*array)[*cnt] = new;
419 Collecting the code fragment simply involves reading tokens until we
420 hit the end token `}$`, and noting the character position of the start and
424 static struct text collect_code(struct token_state *state,
429 code.txt = start.txt.txt + start.txt.len;
431 t = token_next(state);
432 while (t.node == start.node &&
433 t.num != TK_close && t.num != TK_error &&
435 if (t.num == TK_close && t.node == start.node)
436 code.len = t.txt.txt - code.txt;
442 Now we have all the bits we need to parse a full production.
447 ###### grammar fields
448 struct production **productions;
449 int production_count;
451 ###### production fields
453 struct symbol **body;
459 int first_production;
462 static char *parse_production(struct grammar *g,
464 struct token_state *state)
466 /* Head has already been parsed. */
469 struct production p, *pp;
471 memset(&p, 0, sizeof(p));
473 tk = token_next(state);
474 while (tk.num == TK_ident || tk.num == TK_mark) {
475 struct symbol *bs = sym_find(g, tk.txt);
476 if (bs->type == Unknown)
478 if (bs->type == Virtual) {
479 err = "Virtual symbol not permitted in production";
482 if (bs->precedence) {
483 p.precedence = bs->precedence;
486 array_add(&p.body, &p.body_size, bs);
487 tk = token_next(state);
489 if (tk.num == TK_virtual) {
491 tk = token_next(state);
492 if (tk.num != TK_ident) {
493 err = "word required after $$";
496 vs = sym_find(g, tk.txt);
497 if (vs->type != Virtual) {
498 err = "symbol after $$ must be virtual";
501 p.precedence = vs->precedence;
503 tk = token_next(state);
505 if (tk.num == TK_open) {
506 p.code_line = tk.line;
507 p.code = collect_code(state, tk);
508 if (p.code.txt == NULL) {
509 err = "code fragment not closed properly";
512 tk = token_next(state);
514 if (tk.num != TK_newline && tk.num != TK_eof) {
515 err = "stray tokens at end of line";
518 pp = malloc(sizeof(*pp));
520 array_add(&g->productions, &g->production_count, pp);
523 while (tk.num != TK_newline && tk.num != TK_eof)
524 tk = token_next(state);
528 With the ability to parse production and dollar-lines, we have nearly all
529 that we need to parse a grammar from a `code_node`.
531 The head of the first production will effectively be the `start` symbol of
532 the grammar. However it won't _actually_ be so. Processing the grammar is
533 greatly simplified if the real start symbol only has a single production,
534 and expects `$eof` as the final terminal. So when we find the first
535 explicit production we insert an extra production as production zero which
538 ###### Example: production 0
541 where `START` is the first non-terminal given.
543 ###### create production zero
544 struct production *p = calloc(1,sizeof(*p));
545 struct text start = {"$start",6};
546 struct text eof = {"$eof",4};
547 struct text code = {"$0 = $<1;", 9};
548 p->head = sym_find(g, start);
549 p->head->type = Nonterminal;
550 p->head->struct_name = g->current_type;
551 p->head->isref = g->type_isref;
552 if (g->current_type.txt)
554 array_add(&p->body, &p->body_size, head);
555 array_add(&p->body, &p->body_size, sym_find(g, eof));
556 p->head->first_production = g->production_count;
557 array_add(&g->productions, &g->production_count, p);
559 Now we are ready to read in the grammar. We ignore comments
560 and strings so that the marks which introduce them can be
561 used as terminals in the grammar. We don't ignore numbers
562 even though we don't allow them as that causes the scanner
563 to produce errors that the parser is better positioned to handle.
566 static struct grammar *grammar_read(struct code_node *code)
568 struct token_config conf = {
571 .words_marks = known,
572 .known_count = sizeof(known)/sizeof(known[0]),
574 .ignored = (1 << TK_line_comment)
575 | (1 << TK_block_comment)
578 | (1 << TK_multi_string)
583 struct token_state *state = token_open(code, &conf);
585 struct symbol *head = NULL;
589 g = calloc(1, sizeof(*g));
592 for (tk = token_next(state); tk.num != TK_eof;
593 tk = token_next(state)) {
594 if (tk.num == TK_newline)
596 if (tk.num == TK_ident) {
598 head = sym_find(g, tk.txt);
599 if (head->type == Nonterminal)
600 err = "This non-terminal has already be used.";
601 else if (head->type == Virtual)
602 err = "Virtual symbol not permitted in head of production";
604 head->type = Nonterminal;
605 head->struct_name = g->current_type;
606 head->isref = g->type_isref;
607 if (g->production_count == 0) {
608 ## create production zero
610 head->first_production = g->production_count;
611 tk = token_next(state);
612 if (tk.num == TK_mark &&
613 text_is(tk.txt, "->"))
614 err = parse_production(g, head, state);
616 err = "'->' missing in production";
618 } else if (tk.num == TK_mark
619 && text_is(tk.txt, "|")) {
620 // another production for same non-term
622 err = parse_production(g, head, state);
624 err = "First production must have a head";
625 } else if (tk.num == TK_mark
626 && text_is(tk.txt, "$")) {
627 err = dollar_line(state, g, 0);
628 } else if (tk.num == TK_mark
629 && text_is(tk.txt, "$*")) {
630 err = dollar_line(state, g, 1);
632 err = "Unrecognised token at start of line.";
640 fprintf(stderr, "Error at line %d: %s\n",
647 ## Analysing the grammar
649 The central task in analysing the grammar is to determine a set of
650 states to drive the parsing process. This will require finding
651 various sets of symbols and of "items". Some of these sets will need
652 to attach information to each element in the set, so they are more
655 Each "item" may have a 'look-ahead' or `LA` set associated with
656 it. Many of these will be the same as each other. To avoid memory
657 wastage, and to simplify some comparisons of sets, these sets will be
658 stored in a list of unique sets, each assigned a number.
660 Once we have the data structures in hand to manage these sets and
661 lists, we can start setting the 'nullable' flag, build the 'FIRST'
662 sets, and then create the item sets which define the various states.
666 Though we don't only store symbols in these sets, they are the main
667 things we store, so they are called symbol sets or "symsets".
669 A symset has a size, an array of shorts, and an optional array of data
670 values, which are also shorts. If the array of data is not present,
671 we store an impossible pointer, as `NULL` is used to indicate that no
672 memory has been allocated yet.
677 unsigned short *syms, *data;
679 #define NO_DATA ((unsigned short *)1)
680 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
681 const struct symset INIT_DATASET = { 0, NULL, NULL };
683 The arrays of shorts are allocated in blocks of 8 and are kept sorted
684 by using an insertion sort. We don't explicitly record the amount of
685 allocated space as it can be derived directly from the current `cnt` using
686 `((cnt - 1) | 7) + 1`.
689 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
692 int current = ((s->cnt-1) | 7) + 1;
693 if (current == s->cnt) {
695 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
696 if (s->data != NO_DATA)
697 s->data = realloc(s->data, sizeof(*s->data) * current);
700 while (i > 0 && s->syms[i-1] > key) {
701 s->syms[i] = s->syms[i-1];
702 if (s->data != NO_DATA)
703 s->data[i] = s->data[i-1];
707 if (s->data != NO_DATA)
712 Finding a symbol (or item) in a `symset` uses a simple binary search.
713 We return the index where the value was found (so data can be accessed),
714 or `-1` to indicate failure.
716 static int symset_find(struct symset *ss, unsigned short key)
723 while (lo + 1 < hi) {
724 int mid = (lo + hi) / 2;
725 if (ss->syms[mid] <= key)
730 if (ss->syms[lo] == key)
735 We will often want to form the union of two symsets. When we do, we
736 will often want to know if anything changed (as that might mean there
737 is more work to do). So `symset_union` reports whether anything was
738 added to the first set. We use a slow quadratic approach as these
739 sets don't really get very big. If profiling shows this to be a problem it
740 can be optimised later.
742 static int symset_union(struct symset *a, struct symset *b)
746 for (i = 0; i < b->cnt; i++)
747 if (symset_find(a, b->syms[i]) < 0) {
748 unsigned short data = 0;
749 if (b->data != NO_DATA)
751 symset_add(a, b->syms[i], data);
757 And of course we must be able to free a symset.
759 static void symset_free(struct symset ss)
762 if (ss.data != NO_DATA)
768 Some symsets are simply stored somewhere appropriate in the `grammar`
769 data structure, others need a bit of indirection. This applies
770 particularly to "LA" sets. These will be paired with "items" in an
771 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
772 stored in the `data` field, so we need a mapping from a small (short)
773 number to an LA `symset`.
775 As mentioned earlier this involves creating a list of unique symsets.
777 For now, we just use a linear list sorted by insertion. A skiplist
778 would be more efficient and may be added later.
785 struct setlist *next;
788 ###### grammar fields
789 struct setlist *sets;
794 static int ss_cmp(struct symset a, struct symset b)
797 int diff = a.cnt - b.cnt;
800 for (i = 0; i < a.cnt; i++) {
801 diff = (int)a.syms[i] - (int)b.syms[i];
808 static int save_set(struct grammar *g, struct symset ss)
810 struct setlist **sl = &g->sets;
814 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
821 s = malloc(sizeof(*s));
830 Finding a set by number is currently performed by a simple linear search.
831 If this turns out to hurt performance, we can store the sets in a dynamic
832 array like the productions.
834 static struct symset set_find(struct grammar *g, int num)
836 struct setlist *sl = g->sets;
837 while (sl && sl->num != num)
843 ### Setting `nullable`
845 We set `nullable` on the head symbol for any production for which all
846 the body symbols (if any) are nullable. As this is a recursive
847 definition, any change in the `nullable` setting means that we need to
848 re-evaluate where it needs to be set.
850 We simply loop around performing the same calculations until no more
857 static void set_nullable(struct grammar *g)
860 while (check_again) {
863 for (p = 0; p < g->production_count; p++) {
864 struct production *pr = g->productions[p];
867 if (pr->head->nullable)
869 for (s = 0; s < pr->body_size; s++)
870 if (! pr->body[s]->nullable)
872 if (s == pr->body_size) {
873 pr->head->nullable = 1;
880 ### Setting `can_eol` and `line_like`
882 In order to be able to ignore newline tokens when not relevant, but
883 still include them in the parse when needed, we will need to know
884 which states can start a "line-like" section of code. We ignore
885 newlines when there is an indent since the most recent start of a
888 To know which symbols are line-like, we first need to know which
889 symbols start with a NEWLINE token. Any symbol which is followed by a
890 NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
891 Certainly when trying to parse one of these we must take note of NEWLINEs.
893 Clearly the `TK_newline` token can start with a NEWLINE. Any symbol
894 which is the head of a production that contains a starts-with-NEWLINE
895 symbol preceded only by nullable symbols is also a
896 starts-with-NEWLINE symbol. We use a new field `can_eol` to record
897 this attribute of symbols, and compute it in a repetitive manner
898 similar to `set_nullable`.
900 Once we have that, we can determine which symbols are `line_like` by
901 seeing which are followed by a `can_eol` symbol in any production.
908 static void set_can_eol(struct grammar *g)
911 g->symtab[TK_newline]->can_eol = 1;
912 while (check_again) {
915 for (p = 0; p < g->production_count; p++) {
916 struct production *pr = g->productions[p];
919 if (pr->head->can_eol)
922 for (s = 0 ; s < pr->body_size; s++) {
923 if (pr->body[s]->can_eol) {
924 pr->head->can_eol = 1;
928 if (!pr->body[s]->nullable)
935 static void set_line_like(struct grammar *g)
938 for (p = 0; p < g->production_count; p++) {
939 struct production *pr = g->productions[p];
942 for (s = 1; s < pr->body_size; s++)
943 if (pr->body[s]->can_eol)
944 pr->body[s-1]->line_like = 1;
948 ### Building the `first` sets
950 When calculating what can follow a particular non-terminal, we will need to
951 know what the "first" terminal in any subsequent non-terminal might be. So
952 we calculate the `first` set for every non-terminal and store them in an
953 array. We don't bother recording the "first" set for terminals as they are
956 As the "first" for one symbol might depend on the "first" of another,
957 we repeat the calculations until no changes happen, just like with
958 `set_nullable`. This makes use of the fact that `symset_union`
959 reports if any change happens.
961 The core of this, which finds the "first" of part of a production body,
962 will be reused for computing the "follow" sets, so we split it out
963 into a separate function.
965 ###### grammar fields
966 struct symset *first;
970 static int add_first(struct production *pr, int start,
971 struct symset *target, struct grammar *g,
976 for (s = start; s < pr->body_size; s++) {
977 struct symbol *bs = pr->body[s];
978 if (bs->type == Terminal) {
979 if (symset_find(target, bs->num) < 0) {
980 symset_add(target, bs->num, 0);
984 } else if (symset_union(target, &g->first[bs->num]))
990 *to_end = (s == pr->body_size);
994 static void build_first(struct grammar *g)
998 g->first = calloc(g->num_syms, sizeof(g->first[0]));
999 for (s = 0; s < g->num_syms; s++)
1000 g->first[s] = INIT_SYMSET;
1002 while (check_again) {
1005 for (p = 0; p < g->production_count; p++) {
1006 struct production *pr = g->productions[p];
1007 struct symset *head = &g->first[pr->head->num];
1009 if (add_first(pr, 0, head, g, NULL))
1015 ### Building the `follow` sets.
1017 There are two different situations when we will want to generate "follow"
1018 sets. If we are doing an SLR analysis, we want to generate a single
1019 "follow" set for each non-terminal in the grammar. That is what is
1020 happening here. If we are doing an LALR or LR analysis we will want
1021 to generate a separate "LA" set for each item. We do that later
1022 in state generation.
1024 There are two parts to generating a "follow" set. Firstly we look at
1025 every place that any non-terminal appears in the body of any
1026 production, and we find the set of possible "first" symbols after
1027 there. This is done using `add_first` above and only needs to be done
1028 once as the "first" sets are now stable and will not change.
1032 for (p = 0; p < g->production_count; p++) {
1033 struct production *pr = g->productions[p];
1036 for (b = 0; b < pr->body_size - 1; b++) {
1037 struct symbol *bs = pr->body[b];
1038 if (bs->type == Terminal)
1040 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1044 The second part is to add the "follow" set of the head of a production
1045 to the "follow" sets of the final symbol in the production, and any
1046 other symbol which is followed only by `nullable` symbols. As this
1047 depends on "follow" itself we need to repeatedly perform the process
1048 until no further changes happen.
1052 for (again = 0, p = 0;
1053 p < g->production_count;
1054 p < g->production_count-1
1055 ? p++ : again ? (again = 0, p = 0)
1057 struct production *pr = g->productions[p];
1060 for (b = pr->body_size - 1; b >= 0; b--) {
1061 struct symbol *bs = pr->body[b];
1062 if (bs->type == Terminal)
1064 if (symset_union(&g->follow[bs->num],
1065 &g->follow[pr->head->num]))
1072 We now just need to create and initialise the `follow` list to get a
1075 ###### grammar fields
1076 struct symset *follow;
1079 static void build_follow(struct grammar *g)
1082 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1083 for (s = 0; s < g->num_syms; s++)
1084 g->follow[s] = INIT_SYMSET;
1088 ### Building itemsets and states
1090 There are three different levels of detail that can be chosen for
1091 building the itemsets and states for the LR grammar. They are:
1093 1. LR(0) or SLR(1), where no look-ahead is considered.
1094 2. LALR(1) where we build look-ahead sets with each item and merge
1095 the LA sets when we find two paths to the same "kernel" set of items.
1096 3. LR(1) where different look-ahead for any item in the set means
1097 a different state must be created.
1099 ###### forward declarations
1100 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1102 We need to be able to look through existing states to see if a newly
1103 generated state already exists. For now we use a simple sorted linked
1106 An item is a pair of numbers: the production number and the position of
1107 "DOT", which is an index into the body of the production.
1108 As the numbers are not enormous we can combine them into a single "short"
1109 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1110 production number (so 4000 productions with maximum of 15 symbols in the
1113 Comparing the itemsets will be a little different to comparing symsets
1114 as we want to do the lookup after generating the "kernel" of an
1115 itemset, so we need to ignore the offset=zero items which are added during
1118 To facilitate this, we modify the "DOT" number so that "0" sorts to
1119 the end of the list in the symset, and then only compare items before
1123 static inline unsigned short item_num(int production, int index)
1125 return production | ((31-index) << 11);
1127 static inline int item_prod(unsigned short item)
1129 return item & 0x7ff;
1131 static inline int item_index(unsigned short item)
1133 return (31-(item >> 11)) & 0x1f;
1136 For LR(1) analysis we need to compare not just the itemset in a state
1137 but also the LA sets. As we assign each unique LA set a number, we
1138 can just compare the symset and the data values together.
1141 static int itemset_cmp(struct symset a, struct symset b,
1142 enum grammar_type type)
1148 i < a.cnt && i < b.cnt &&
1149 item_index(a.syms[i]) > 0 &&
1150 item_index(b.syms[i]) > 0;
1152 int diff = a.syms[i] - b.syms[i];
1156 diff = a.data[i] - b.data[i];
1161 if (i == a.cnt || item_index(a.syms[i]) == 0)
1165 if (i == b.cnt || item_index(b.syms[i]) == 0)
1171 if (type < LR1 || av == -1)
1174 a.data[i] - b.data[i];
1177 And now we can build the list of itemsets. The lookup routine returns
1178 both a success flag and a pointer to where in the list an insert
1179 should happen, so we don't need to search a second time.
1181 FIXME: document min_prefix
1185 struct itemset *next;
1187 struct symset items;
1188 struct symset go_to;
1190 unsigned short precedence;
1196 ###### grammar fields
1197 struct itemset *items;
1201 static int itemset_find(struct grammar *g, struct itemset ***where,
1202 struct symset kernel, enum grammar_type type)
1204 struct itemset **ip;
1206 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1207 struct itemset *i = *ip;
1209 diff = itemset_cmp(i->items, kernel, type);
1222 Adding an itemset may require merging the LA sets if LALR analysis is
1223 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1224 clear the `completed` flag so that the dependants of this itemset will be
1225 recalculated and their LA sets updated.
1227 `add_itemset` must consume the symsets it is passed, either by adding
1228 them to a data structure, or freeing them.
1230 static int add_itemset(struct grammar *g, struct symset ss,
1231 enum assoc assoc, unsigned short precedence,
1232 enum grammar_type type)
1234 struct itemset **where, *is;
1236 int found = itemset_find(g, &where, ss, type);
1238 is = calloc(1, sizeof(*is));
1239 is->state = g->states;
1243 is->precedence = precedence;
1245 is->go_to = INIT_DATASET;
1254 for (i = 0; i < ss.cnt; i++) {
1255 struct symset temp = INIT_SYMSET, s;
1256 if (ss.data[i] == is->items.data[i])
1258 s = set_find(g, is->items.data[i]);
1259 symset_union(&temp, &s);
1260 s = set_find(g, ss.data[i]);
1261 if (symset_union(&temp, &s)) {
1262 is->items.data[i] = save_set(g, temp);
1273 To build all the itemsets, we first insert the initial itemset made
1274 from production zero, complete each itemset, and then generate new
1275 itemsets from old until no new ones can be made.
1277 Completing an itemset means finding all the items where "DOT" is followed by
1278 a nonterminal and adding "DOT=0" items for every production from that
1279 non-terminal - providing each item hasn't already been added.
1281 If LA sets are needed, the LA set for each new item is found using
1282 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1283 be supplemented by the LA set for the item which produces the new item.
1285 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1286 is used in the next stage.
1287 If any of these symbols are flagged as starting a line, then this
1288 state must be a `starts_line` state so now is a good time to record that.
1290 When itemsets are created we assign a precedence to the itemset from
1291 the complete item, if there is one. We ignore the possibility of
1292 there being two and don't (currently) handle precedence in such
1293 grammars. When completing a grammar we ignore any item where DOT is
1294 followed by a terminal with a precedence lower (numerically higher)
1295 than that for the itemset. Unless the terminal has right
1296 associativity, we also ignore items where the terminal has the same
1297 precedence. The result is that unwanted items are still in the
1298 itemset, but the terminal doesn't get into the go to set, so the item
1301 ###### complete itemset
1302 for (i = 0; i < is->items.cnt; i++) {
1303 int p = item_prod(is->items.syms[i]);
1304 int bs = item_index(is->items.syms[i]);
1305 struct production *pr = g->productions[p];
1308 struct symset LA = INIT_SYMSET;
1309 unsigned short sn = 0;
1311 if (is->min_prefix == 0 ||
1312 (bs > 0 && bs < is->min_prefix))
1313 is->min_prefix = bs;
1314 if (bs == pr->body_size)
1317 if (s->precedence && is->precedence &&
1318 is->precedence < s->precedence)
1319 /* This terminal has a low precedence and
1320 * shouldn't be shifted
1323 if (s->precedence && is->precedence &&
1324 is->precedence == s->precedence && s->assoc != Right)
1325 /* This terminal has a matching precedence and is
1326 * not Right-associative, so we mustn't shift it.
1329 if (symset_find(&done, s->num) < 0) {
1330 symset_add(&done, s->num, 0);
1332 is->starts_line = 1;
1334 if (s->type != Nonterminal)
1340 add_first(pr, bs+1, &LA, g, &to_end);
1342 struct symset ss = set_find(g, is->items.data[i]);
1343 symset_union(&LA, &ss);
1345 sn = save_set(g, LA);
1346 LA = set_find(g, sn);
1349 /* Add productions for this symbol */
1350 for (p2 = s->first_production;
1351 p2 < g->production_count &&
1352 g->productions[p2]->head == s;
1354 int itm = item_num(p2, 0);
1355 int pos = symset_find(&is->items, itm);
1357 symset_add(&is->items, itm, sn);
1358 /* Will have re-ordered, so start
1359 * from beginning again */
1361 } else if (type >= LALR) {
1362 struct symset ss = set_find(g, is->items.data[pos]);
1363 struct symset tmp = INIT_SYMSET;
1365 symset_union(&tmp, &ss);
1366 if (symset_union(&tmp, &LA)) {
1367 is->items.data[pos] = save_set(g, tmp);
1375 For each symbol we found (and placed in `done`) we collect all the items for
1376 which this symbol is next, and create a new itemset with "DOT" advanced over
1377 the symbol. This is then added to the collection of itemsets (or merged
1378 with a pre-existing itemset).
1380 ###### derive itemsets
1381 // Now we have a completed itemset, so we need to
1382 // compute all the 'next' itemsets and create them
1383 // if they don't exist.
1384 for (i = 0; i < done.cnt; i++) {
1386 unsigned short state;
1387 struct symbol *sym = g->symtab[done.syms[i]];
1388 enum assoc assoc = Non;
1389 unsigned short precedence = 0;
1390 struct symset newitemset = INIT_SYMSET;
1392 newitemset = INIT_DATASET;
1394 for (j = 0; j < is->items.cnt; j++) {
1395 int itm = is->items.syms[j];
1396 int p = item_prod(itm);
1397 int bp = item_index(itm);
1398 struct production *pr = g->productions[p];
1399 unsigned short la = 0;
1402 if (bp == pr->body_size)
1404 if (pr->body[bp] != sym)
1407 la = is->items.data[j];
1408 pos = symset_find(&newitemset, pr->head->num);
1409 if (bp + 1 == pr->body_size &&
1410 pr->precedence > 0 &&
1412 pr->precedence < precedence)) {
1413 // new itemset is reducible and has a precedence.
1414 precedence = pr->precedence;
1418 symset_add(&newitemset, item_num(p, bp+1), la);
1419 else if (type >= LALR) {
1420 // Need to merge la set.
1421 int la2 = newitemset.data[pos];
1423 struct symset ss = set_find(g, la2);
1424 struct symset LA = INIT_SYMSET;
1425 symset_union(&LA, &ss);
1426 ss = set_find(g, la);
1427 if (symset_union(&LA, &ss))
1428 newitemset.data[pos] = save_set(g, LA);
1434 state = add_itemset(g, newitemset, assoc, precedence, type);
1435 if (symset_find(&is->go_to, done.syms[i]) < 0)
1436 symset_add(&is->go_to, done.syms[i], state);
1439 All that is left is to create the initial itemset from production zero, and
1440 with `TK_eof` as the LA set.
1443 static void build_itemsets(struct grammar *g, enum grammar_type type)
1445 struct symset first = INIT_SYMSET;
1448 unsigned short la = 0;
1450 // LA set just has eof
1451 struct symset eof = INIT_SYMSET;
1452 symset_add(&eof, TK_eof, 0);
1453 la = save_set(g, eof);
1454 first = INIT_DATASET;
1456 // production 0, offset 0 (with no data)
1457 symset_add(&first, item_num(0, 0), la);
1458 add_itemset(g, first, Non, 0, type);
1459 for (again = 0, is = g->items;
1461 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1463 struct symset done = INIT_SYMSET;
1474 ### Completing the analysis.
1476 The exact process of analysis depends on which level was requested. For
1477 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1478 `LR1` we need first, but not follow. For `SLR` we need both.
1480 We don't build the "action" tables that you might expect as the parser
1481 doesn't use them. They will be calculated to some extent if needed for
1484 Once we have built everything we allocate arrays for the two lists:
1485 symbols and itemsets. This allows more efficient access during reporting.
1486 The symbols are grouped as terminals and non-terminals and we record the
1487 changeover point in `first_nonterm`.
1489 ###### grammar fields
1490 struct symbol **symtab;
1491 struct itemset **statetab;
1494 ###### grammar_analyse
1496 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1500 int snum = TK_reserved;
1501 for (s = g->syms; s; s = s->next)
1502 if (s->num < 0 && s->type == Terminal) {
1506 g->first_nonterm = snum;
1507 for (s = g->syms; s; s = s->next)
1513 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1514 for (s = g->syms; s; s = s->next)
1515 g->symtab[s->num] = s;
1526 build_itemsets(g, type);
1528 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1529 for (is = g->items; is ; is = is->next)
1530 g->statetab[is->state] = is;
1533 ## Reporting on the Grammar
1535 The purpose of the report is to give the grammar developer insight into
1536 how the grammar parser will work. It is basically a structured dump of
1537 all the tables that have been generated, plus a description of any conflicts.
1539 ###### grammar_report
1540 static int grammar_report(struct grammar *g, enum grammar_type type)
1546 return report_conflicts(g, type);
1549 Firstly we have the complete list of symbols, together with the
1550 "FIRST" set if that was generated. We add a mark to each symbol to
1551 show if it can end in a newline (`>`), if it is considered to be
1552 "line-like" (`<`), or if it is nullable (`.`).
1556 static void report_symbols(struct grammar *g)
1560 printf("SYMBOLS + FIRST:\n");
1562 printf("SYMBOLS:\n");
1564 for (n = 0; n < g->num_syms; n++) {
1565 struct symbol *s = g->symtab[n];
1569 printf(" %c%c%c%3d%c: ",
1570 s->nullable ? '.':' ',
1571 s->can_eol ? '>':' ',
1572 s->line_like ? '<':' ',
1573 s->num, symtypes[s->type]);
1576 printf(" (%d%s)", s->precedence,
1577 assoc_names[s->assoc]);
1579 if (g->first && s->type == Nonterminal) {
1582 for (j = 0; j < g->first[n].cnt; j++) {
1585 prtxt(g->symtab[g->first[n].syms[j]]->name);
1592 Then we have the follow sets if they were computed.
1594 static void report_follow(struct grammar *g)
1597 printf("FOLLOW:\n");
1598 for (n = 0; n < g->num_syms; n++)
1599 if (g->follow[n].cnt) {
1603 prtxt(g->symtab[n]->name);
1604 for (j = 0; j < g->follow[n].cnt; j++) {
1607 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1613 And finally the item sets. These include the GO TO tables and, for
1614 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1615 it up a bit. First the items, with production number and associativity.
1617 static void report_item(struct grammar *g, int itm)
1619 int p = item_prod(itm);
1620 int dot = item_index(itm);
1621 struct production *pr = g->productions[p];
1625 prtxt(pr->head->name);
1627 for (i = 0; i < pr->body_size; i++) {
1628 printf(" %s", (dot == i ? ". ": ""));
1629 prtxt(pr->body[i]->name);
1631 if (dot == pr->body_size)
1634 if (pr->precedence && dot == pr->body_size)
1635 printf(" (%d%s)", pr->precedence,
1636 assoc_names[pr->assoc]);
1637 if (dot < pr->body_size &&
1638 pr->body[dot]->precedence) {
1639 struct symbol *s = pr->body[dot];
1640 printf(" [%d%s]", s->precedence,
1641 assoc_names[s->assoc]);
1646 The LA sets which are (possibly) reported with each item:
1648 static void report_la(struct grammar *g, int lanum)
1650 struct symset la = set_find(g, lanum);
1654 printf(" LOOK AHEAD(%d)", lanum);
1655 for (i = 0; i < la.cnt; i++) {
1658 prtxt(g->symtab[la.syms[i]]->name);
1663 Then the go to sets:
1666 static void report_goto(struct grammar *g, struct symset gt)
1671 for (i = 0; i < gt.cnt; i++) {
1673 prtxt(g->symtab[gt.syms[i]]->name);
1674 printf(" -> %d\n", gt.data[i]);
1678 Now we can report all the item sets complete with items, LA sets, and GO TO.
1680 static void report_itemsets(struct grammar *g)
1683 printf("ITEM SETS(%d)\n", g->states);
1684 for (s = 0; s < g->states; s++) {
1686 struct itemset *is = g->statetab[s];
1687 printf(" Itemset %d:%s min prefix=%d",
1688 s, is->starts_line?" (startsline)":"", is->min_prefix);
1690 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1692 for (j = 0; j < is->items.cnt; j++) {
1693 report_item(g, is->items.syms[j]);
1694 if (is->items.data != NO_DATA)
1695 report_la(g, is->items.data[j]);
1697 report_goto(g, is->go_to);
1701 ### Reporting conflicts
1703 Conflict detection varies a lot among different analysis levels. However
1704 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1705 LR1 are also similar as they have FOLLOW or LA sets.
1709 ## conflict functions
1711 static int report_conflicts(struct grammar *g, enum grammar_type type)
1714 printf("Conflicts:\n");
1716 cnt = conflicts_lr0(g, type);
1718 cnt = conflicts_slr(g, type);
1720 printf(" - no conflicts\n");
1724 LR0 conflicts are any state which has both a reducible item and
1725 a shiftable item, or two reducible items.
1727 LR05 conflicts only occur if two possible reductions exist,
1728 as shifts always over-ride reductions.
1730 ###### conflict functions
1731 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1735 for (i = 0; i < g->states; i++) {
1736 struct itemset *is = g->statetab[i];
1737 int last_reduce = -1;
1738 int prev_reduce = -1;
1739 int last_shift = -1;
1743 for (j = 0; j < is->items.cnt; j++) {
1744 int itm = is->items.syms[j];
1745 int p = item_prod(itm);
1746 int bp = item_index(itm);
1747 struct production *pr = g->productions[p];
1749 if (bp == pr->body_size) {
1750 prev_reduce = last_reduce;
1754 if (pr->body[bp]->type == Terminal)
1757 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1758 printf(" State %d has both SHIFT and REDUCE:\n", i);
1759 report_item(g, is->items.syms[last_shift]);
1760 report_item(g, is->items.syms[last_reduce]);
1763 if (prev_reduce >= 0) {
1764 printf(" State %d has 2 (or more) reducible items\n", i);
1765 report_item(g, is->items.syms[prev_reduce]);
1766 report_item(g, is->items.syms[last_reduce]);
1773 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1774 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1775 in the source of the look ahead set.
1777 We build two datasets to reflect the "action" table: one which maps
1778 terminals to items where that terminal could be shifted and another
1779 which maps terminals to items that could be reduced when the terminal
1780 is in look-ahead. We report when we get conflicts between the two.
1782 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1787 for (i = 0; i < g->states; i++) {
1788 struct itemset *is = g->statetab[i];
1789 struct symset shifts = INIT_DATASET;
1790 struct symset reduce = INIT_DATASET;
1794 /* First collect the shifts */
1795 for (j = 0; j < is->items.cnt; j++) {
1796 unsigned short itm = is->items.syms[j];
1797 int p = item_prod(itm);
1798 int bp = item_index(itm);
1799 struct production *pr = g->productions[p];
1801 if (bp < pr->body_size &&
1802 pr->body[bp]->type == Terminal) {
1804 int sym = pr->body[bp]->num;
1805 if (symset_find(&shifts, sym) < 0)
1806 symset_add(&shifts, sym, itm);
1809 /* Now look for reduction and conflicts */
1810 for (j = 0; j < is->items.cnt; j++) {
1811 unsigned short itm = is->items.syms[j];
1812 int p = item_prod(itm);
1813 int bp = item_index(itm);
1814 struct production *pr = g->productions[p];
1816 if (bp < pr->body_size)
1821 la = g->follow[pr->head->num];
1823 la = set_find(g, is->items.data[j]);
1825 for (k = 0; k < la.cnt; k++) {
1826 int pos = symset_find(&shifts, la.syms[k]);
1828 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1829 prtxt(g->symtab[la.syms[k]]->name);
1831 report_item(g, shifts.data[pos]);
1832 report_item(g, itm);
1835 pos = symset_find(&reduce, la.syms[k]);
1837 symset_add(&reduce, la.syms[k], itm);
1840 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1841 prtxt(g->symtab[la.syms[k]]->name);
1843 report_item(g, itm);
1844 report_item(g, reduce.data[pos]);
1848 symset_free(shifts);
1849 symset_free(reduce);
1855 ## Generating the parser
1857 The exported part of the parser is the `parse_XX` function, where the name
1858 `XX` is based on the name of the parser files.
1860 This takes a `code_node`, a partially initialized `token_config`, and an
1861 optional `FILE` to send tracing to. The `token_config` gets the list of
1862 known words added and then is used with the `code_node` to initialize the
1865 `parse_XX` then calls the library function `parser_run` to actually complete
1866 the parse. This needs the `states` table and function to call the various
1867 pieces of code provided in the grammar file, so they are generated first.
1869 ###### parser_generate
1871 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1877 gen_reduce(f, g, file);
1880 fprintf(f, "#line 0 \"gen_parser\"\n");
1881 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1884 fprintf(f, "\tstruct token_state *tokens;\n");
1885 fprintf(f, "\tconfig->words_marks = known;\n");
1886 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1887 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1888 fprintf(f, "\ttokens = token_open(code, config);\n");
1889 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1890 fprintf(f, "\ttoken_close(tokens);\n");
1891 fprintf(f, "\treturn rv;\n");
1892 fprintf(f, "}\n\n");
1895 ### Known words table
1897 The known words table is simply an array of terminal symbols.
1898 The table of nonterminals used for tracing is a similar array.
1902 static void gen_known(FILE *f, struct grammar *g)
1905 fprintf(f, "#line 0 \"gen_known\"\n");
1906 fprintf(f, "static const char *known[] = {\n");
1907 for (i = TK_reserved;
1908 i < g->num_syms && g->symtab[i]->type == Terminal;
1910 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1911 g->symtab[i]->name.txt);
1912 fprintf(f, "};\n\n");
1915 static void gen_non_term(FILE *f, struct grammar *g)
1918 fprintf(f, "#line 0 \"gen_non_term\"\n");
1919 fprintf(f, "static const char *non_term[] = {\n");
1920 for (i = TK_reserved;
1923 if (g->symtab[i]->type == Nonterminal)
1924 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1925 g->symtab[i]->name.txt);
1926 fprintf(f, "};\n\n");
1929 ### States and the goto tables.
1931 For each state we record the goto table, the reducible production if
1932 there is one, or a symbol to shift for error recovery.
1933 Some of the details of the reducible production are stored in the
1934 `do_reduce` function to come later. Here we store the production number,
1935 the body size (useful for stack management) and the resulting symbol (useful
1936 for knowing how to free data later).
1938 The go to table is stored in a simple array of `sym` and corresponding
1941 ###### exported types
1949 const struct lookup * go_to;
1961 static void gen_goto(FILE *f, struct grammar *g)
1964 fprintf(f, "#line 0 \"gen_goto\"\n");
1965 for (i = 0; i < g->states; i++) {
1967 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1969 struct symset gt = g->statetab[i]->go_to;
1970 for (j = 0; j < gt.cnt; j++)
1971 fprintf(f, "\t{ %d, %d },\n",
1972 gt.syms[j], gt.data[j]);
1979 static void gen_states(FILE *f, struct grammar *g)
1982 fprintf(f, "#line 0 \"gen_states\"\n");
1983 fprintf(f, "static const struct state states[] = {\n");
1984 for (i = 0; i < g->states; i++) {
1985 struct itemset *is = g->statetab[i];
1986 int j, prod = -1, prod_len;
1988 int shift_len = 0, shift_remain = 0;
1989 for (j = 0; j < is->items.cnt; j++) {
1990 int itm = is->items.syms[j];
1991 int p = item_prod(itm);
1992 int bp = item_index(itm);
1993 struct production *pr = g->productions[p];
1995 if (bp < pr->body_size) {
1996 if (shift_sym < 0 ||
1997 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1998 shift_sym = pr->body[bp]->num;
2000 shift_remain = pr->body_size - bp;
2004 /* This is what we reduce */
2005 if (prod < 0 || prod_len < pr->body_size) {
2007 prod_len = pr->body_size;
2012 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
2013 i, is->go_to.cnt, i, prod,
2014 g->productions[prod]->body_size,
2015 g->productions[prod]->head->num,
2016 is->starts_line, is->min_prefix);
2018 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
2019 i, is->go_to.cnt, i, shift_sym,
2020 is->starts_line, is->min_prefix);
2022 fprintf(f, "};\n\n");
2025 ### The `do_reduce` function and the code
2027 When the parser engine decides to reduce a production, it calls `do_reduce`.
2028 This has two functions.
2030 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
2031 production being reduced. Secondly it runs the code that was included with
2032 the production if any.
2034 This code needs to be able to store data somewhere. Rather than requiring
2035 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
2036 `do_reduce` return the size to be saved.
2038 In order for the code to access "global" context, we pass in the
2039 "config" pointer that was passed to parser function. If the `struct
2040 token_config` is embedded in some larger structure, the reducing code
2041 can access the larger structure using pointer manipulation.
2043 The code fragment requires translation when written out. Any `$N` needs to
2044 be converted to a reference either to that buffer (if `$0`) or to the
2045 structure returned by a previous reduction. These pointers need to be cast
2046 to the appropriate type for each access. All this is handled in
2049 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
2050 This applies only to symbols with references (or pointers), not those with structures.
2051 The `<` implies that the reference is being moved out, so the object will not be
2052 automatically freed. This is equivalent to assigning `NULL` to the pointer.
2056 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2059 char *used = calloc(1, p->body_size);
2062 fprintf(f, "\t\t\t");
2063 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2077 if (*c < '0' || *c > '9') {
2084 while (c[1] >= '0' && c[1] <= '9') {
2086 n = n * 10 + *c - '0';
2089 fprintf(f, "(*(struct %.*s*%s)ret)",
2090 p->head->struct_name.len,
2091 p->head->struct_name.txt,
2092 p->head->isref ? "*":"");
2093 else if (n > p->body_size)
2094 fprintf(f, "$%d", n);
2095 else if (p->body[n-1]->type == Terminal)
2096 fprintf(f, "(*(struct token *)body[%d])",
2098 else if (p->body[n-1]->struct_name.txt == NULL)
2099 fprintf(f, "$%d", n);
2101 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2102 p->body[n-1]->struct_name.len,
2103 p->body[n-1]->struct_name.txt,
2104 p->body[n-1]->isref ? "*":"", n-1);
2109 for (i = 0; i < p->body_size; i++) {
2110 if (p->body[i]->struct_name.txt &&
2111 p->body[i]->isref &&
2113 // assume this has been copied out
2114 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2121 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2124 fprintf(f, "#line 0 \"gen_reduce\"\n");
2125 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2127 fprintf(f, "\tint ret_size = 0;\n");
2129 fprintf(f, "\tswitch(prod) {\n");
2130 for (i = 0; i < g->production_count; i++) {
2131 struct production *p = g->productions[i];
2132 fprintf(f, "\tcase %d:\n", i);
2135 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2139 if (p->head->struct_name.txt)
2140 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2141 p->head->struct_name.len,
2142 p->head->struct_name.txt,
2143 p->head->isref ? "*":"");
2145 fprintf(f, "\t\tbreak;\n");
2147 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2152 As each non-terminal can potentially cause a different type of data
2153 structure to be allocated and filled in, we need to be able to free it when
2156 It is particularly important to have fine control over freeing during error
2157 recovery where individual stack frames might need to be freed.
2159 For this, the grammar author is required to define a `free_XX` function for
2160 each structure that is used by a non-terminal. `do_free` will call whichever
2161 is appropriate given a symbol number, and will call `free` (as is
2162 appropriate for tokens) on any terminal symbol.
2166 static void gen_free(FILE *f, struct grammar *g)
2170 fprintf(f, "#line 0 \"gen_free\"\n");
2171 fprintf(f, "static void do_free(short sym, void *asn)\n");
2173 fprintf(f, "\tif (!asn) return;\n");
2174 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2175 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2176 fprintf(f, "\tswitch(sym) {\n");
2178 for (i = 0; i < g->num_syms; i++) {
2179 struct symbol *s = g->symtab[i];
2181 s->type != Nonterminal ||
2182 s->struct_name.txt == NULL)
2185 fprintf(f, "\tcase %d:\n", s->num);
2187 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2189 s->struct_name.txt);
2190 fprintf(f, "\t\tfree(asn);\n");
2192 fprintf(f, "\t\tfree_%.*s(asn);\n",
2194 s->struct_name.txt);
2195 fprintf(f, "\t\tbreak;\n");
2197 fprintf(f, "\t}\n}\n\n");
2200 ## The main routine.
2202 There are three key parts to the "main" function of parsergen: processing
2203 the arguments, loading the grammar file, and dealing with the grammar.
2205 ### Argument processing.
2207 Command line options allow the selection of analysis mode, name of output
2208 file, and whether or not a report should be generated. By default we create
2209 a report only if no code output was requested.
2211 The `parse_XX` function name uses the basename of the output file which
2212 should not have a suffix (`.c`). `.c` is added to the given name for the
2213 code, and `.h` is added for the header (if header text is specified in the
2220 static const struct option long_options[] = {
2221 { "LR0", 0, NULL, '0' },
2222 { "LR05", 0, NULL, '5' },
2223 { "SLR", 0, NULL, 'S' },
2224 { "LALR", 0, NULL, 'L' },
2225 { "LR1", 0, NULL, '1' },
2226 { "tag", 1, NULL, 't' },
2227 { "report", 0, NULL, 'R' },
2228 { "output", 1, NULL, 'o' },
2229 { NULL, 0, NULL, 0 }
2231 const char *options = "05SL1t:Ro:";
2233 ###### process arguments
2235 char *outfile = NULL;
2240 enum grammar_type type = LR05;
2241 while ((opt = getopt_long(argc, argv, options,
2242 long_options, NULL)) != -1) {
2257 outfile = optarg; break;
2259 tag = optarg; break;
2261 fprintf(stderr, "Usage: parsergen ...\n");
2266 infile = argv[optind++];
2268 fprintf(stderr, "No input file given\n");
2271 if (outfile && report == 1)
2274 if (name && strchr(name, '/'))
2275 name = strrchr(name, '/')+1;
2277 if (optind < argc) {
2278 fprintf(stderr, "Excess command line arguments\n");
2282 ### Loading the grammar file
2284 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2287 Once we have extracted the code (with `mdcode`) we expect to find three
2288 sections: header, code, and grammar. Anything else that is not
2289 excluded by the `--tag` option is an error.
2291 "header" and "code" are optional, though it is hard to build a working
2292 parser with neither. "grammar" must be provided.
2296 #include <sys/mman.h>
2301 static void pr_err(char *msg)
2304 fprintf(stderr, "%s\n", msg);
2308 struct section *table;
2312 fd = open(infile, O_RDONLY);
2314 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2315 infile, strerror(errno));
2318 len = lseek(fd, 0, 2);
2319 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2320 table = code_extract(file, file+len, pr_err);
2322 struct code_node *hdr = NULL;
2323 struct code_node *code = NULL;
2324 struct code_node *gram = NULL;
2325 for (s = table; s; s = s->next) {
2326 struct text sec = s->section;
2327 if (tag && !strip_tag(&sec, tag))
2329 if (text_is(sec, "header"))
2331 else if (text_is(sec, "code"))
2333 else if (text_is(sec, "grammar"))
2336 fprintf(stderr, "Unknown content section: %.*s\n",
2337 s->section.len, s->section.txt);
2342 ### Processing the input
2344 As we need to append an extension to a filename and then open it for
2345 writing, and we need to do this twice, it helps to have a separate function.
2349 static FILE *open_ext(char *base, char *ext)
2351 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2353 strcat(strcpy(fn, base), ext);
2359 If we can read the grammar, then we analyse and optionally report on it. If
2360 the report finds conflicts we will exit with an error status.
2362 ###### process input
2363 struct grammar *g = NULL;
2365 fprintf(stderr, "No grammar section provided\n");
2368 g = grammar_read(gram);
2370 fprintf(stderr, "Failure to parse grammar\n");
2375 grammar_analyse(g, type);
2377 if (grammar_report(g, type))
2381 If a "headers" section is defined, we write it out and include a declaration
2382 for the `parse_XX` function so it can be used from separate code.
2384 if (rv == 0 && hdr && outfile) {
2385 FILE *f = open_ext(outfile, ".h");
2387 code_node_print(f, hdr, infile);
2388 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2392 fprintf(stderr, "Cannot create %s.h\n",
2398 And if all goes well and an output file was provided, we create the `.c`
2399 file with the code section (if any) and the parser tables and function.
2401 if (rv == 0 && outfile) {
2402 FILE *f = open_ext(outfile, ".c");
2405 code_node_print(f, code, infile);
2406 gen_parser(f, g, infile, name);
2409 fprintf(stderr, "Cannot create %s.c\n",
2415 And that about wraps it up. We need to set the locale so that UTF-8 is
2416 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2418 ###### File: parsergen.mk
2419 parsergen : parsergen.o libscanner.o libmdcode.o
2420 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2427 int main(int argc, char *argv[])
2432 setlocale(LC_ALL,"");
2434 ## process arguments
2441 ## The SHIFT/REDUCE parser
2443 Having analysed the grammar and generated all the tables, we only need the
2444 shift/reduce engine to bring it all together.
2446 ### Goto table lookup
2448 The parser generator has nicely provided us with goto tables sorted by
2449 symbol number. We need a binary search function to find a symbol in the
2452 ###### parser functions
2454 static int search(const struct state *l, int sym)
2457 int hi = l->go_to_cnt;
2461 while (lo + 1 < hi) {
2462 int mid = (lo + hi) / 2;
2463 if (l->go_to[mid].sym <= sym)
2468 if (l->go_to[lo].sym == sym)
2469 return l->go_to[lo].state;
2474 ### The state stack.
2476 The core data structure for the parser is the stack. This tracks all the
2477 symbols that have been recognised or partially recognised.
2479 The stack usually won't grow very large - maybe a few tens of entries. So
2480 we dynamically resize an array as required but never bother to shrink it
2483 We keep the stack as two separate allocations. One, `asn_stack` stores the
2484 "abstract syntax nodes" which are created by each reduction. When we call
2485 `do_reduce` we need to pass an array of the `asn`s of the body of the
2486 production, and by keeping a separate `asn` stack, we can just pass a
2487 pointer into this stack.
2489 The other allocation stores all other stack fields of which there are six.
2490 The `state` is the most important one and guides the parsing process. The
2491 `sym` is nearly unnecessary. However when we want to free entries from the
2492 `asn_stack`, it helps to know what type they are so we can call the right
2493 freeing function. The symbol leads us to the right free function through
2496 The `indents` count tracks the line indents within the symbol or
2497 immediately following it. These are used to allow indent information to
2498 guide parsing and error recovery.
2500 `since_newline` tracks how many stack frames since the last
2501 start-of-line (whether indented or not). So if `since_newline` is
2502 zero, then this symbol is at the start of a line. Similarly
2503 `since_indent` counts the number of states since an indent, it is zero
2504 precisely when `indents` is not zero.
2506 `newline_permitted` keeps track of whether newlines should be ignored
2509 The stack is most properly seen as alternating states and symbols -
2510 states, like the 'DOT' in items, are between symbols. Each frame in
2511 our stack holds a state and the symbol that was before it. The
2512 bottom of stack holds the start state but no symbol, as nothing came
2513 before the beginning.
2515 ###### parser functions
2520 short newline_permitted;
2524 short since_newline;
2534 Two operations are needed on the stack - shift (which is like push) and pop.
2536 Shift applies not only to terminals but also to non-terminals. When
2537 we reduce a production we will pop off entries corresponding to the
2538 body symbols, then push on an item for the head of the production.
2539 This last is exactly the same process as shifting in a terminal so we
2540 use the same function for both. In both cases we provide the symbol,
2541 the number of indents the symbol contains (which will be zero for a
2542 terminal symbol) and a flag indicating that the symbol was at (or was
2543 reduced from a symbol which was at) the start of a line. The state is
2544 deduced from the current top-of-stack state and the new symbol.
2546 To simplify other code we arrange for `shift` to fail if there is no `goto`
2547 state for the symbol. This is useful in basic parsing due to our design
2548 that we shift when we can, and reduce when we cannot. So the `shift`
2549 function reports if it could.
2551 `shift` is also used to push state zero onto the stack, so if the
2552 stack is empty, it always chooses zero as the next state.
2554 So `shift` finds the next state. If that succeeds it extends the
2555 allocations if needed and pushes all the information onto the stacks.
2557 Newlines are permitted after a `starts_line` state until an internal
2558 indent. If the new frame has neither a `starts_line` state nor an
2559 indent, newlines are permitted if the previous stack frame permitted
2562 ###### parser functions
2564 static int shift(struct parser *p,
2565 short sym, short indents, short start_of_line,
2567 const struct state states[])
2569 // Push an entry onto the stack
2570 struct frame next = {0};
2571 int newstate = p->tos
2572 ? search(&states[p->stack[p->tos-1].state],
2577 if (p->tos >= p->stack_size) {
2578 p->stack_size += 10;
2579 p->stack = realloc(p->stack, p->stack_size
2580 * sizeof(p->stack[0]));
2581 p->asn_stack = realloc(p->asn_stack, p->stack_size
2582 * sizeof(p->asn_stack[0]));
2585 next.indents = indents;
2586 next.state = newstate;
2587 if (states[newstate].starts_line)
2588 next.newline_permitted = 1;
2590 next.newline_permitted = 0;
2592 next.newline_permitted =
2593 p->stack[p->tos-1].newline_permitted;
2595 next.newline_permitted = 0;
2597 if (!start_of_line) {
2599 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2601 next.since_newline = 1;
2604 next.since_indent = 0;
2606 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2608 next.since_indent = 1;
2610 p->stack[p->tos] = next;
2611 p->asn_stack[p->tos] = asn;
2616 `pop` primarily moves the top of stack (`tos`) back down the required
2617 amount and frees any `asn` entries that need to be freed. It also
2618 collects a summary of the indents and line starts in the symbols that
2619 are being removed. It is called _after_ we reduce a production, just
2620 before we `shift` the nonterminal in.
2622 ###### parser functions
2624 static int pop(struct parser *p, int num,
2625 short *start_of_line,
2626 void(*do_free)(short sym, void *asn))
2633 for (i = 0; i < num; i++) {
2634 sol |= !p->stack[p->tos+i].since_newline;
2635 indents += p->stack[p->tos+i].indents;
2636 do_free(p->stack[p->tos+i].sym,
2637 p->asn_stack[p->tos+i]);
2640 *start_of_line = sol;
2644 ### Memory allocation
2646 The `scanner` returns tokens in a local variable - we want them in allocated
2647 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2648 a reduce is in a large buffer. Both of these require some allocation and
2649 copying, hence `memdup` and `tokcopy`.
2651 ###### parser includes
2654 ###### parser functions
2656 void *memdup(void *m, int len)
2662 memcpy(ret, m, len);
2666 static struct token *tok_copy(struct token tk)
2668 struct token *new = malloc(sizeof(*new));
2673 ### The heart of the parser.
2675 Now we have the parser. If we can shift we do, though newlines and
2676 reducing indenting may block that. If not and we can reduce we do
2677 that. If the production we reduced was production zero, then we have
2678 accepted the input and can finish.
2680 We return whatever `asn` was returned by reducing production zero.
2682 If we can neither shift nor reduce we have an error to handle. We pop
2683 single entries off the stack until we can shift the `TK_error` symbol, then
2684 drop input tokens until we find one we can shift into the new error state.
2686 When we find `TK_in` and `TK_out` tokens which report indents we need
2687 to handle them directly as the grammar cannot express what we want to
2690 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2691 record how many indents there are following the previous token.
2693 `TK_out` tokens must be canceled against an indent count
2694 within the stack. If we can reduce some symbols that are all since
2695 the most recent indent, then we do that first. If the minimum prefix
2696 of the current state then extends back before the most recent indent,
2697 that indent can be cancelled. If the minimum prefix is shorter, then
2698 the indent is premature and we must start error handling, which
2699 currently doesn't work at all.
2701 `TK_newline` tokens are ignored unless the top stack frame records
2702 that they are permitted. In that case they will not be considered for
2703 shifting if it is possible to reduce some symbols that are all since
2704 the most recent start of line. This is how a newline forcibly
2705 terminates any line-like structure - we try to reduce down to at most
2706 one symbol for each line where newlines are allowed.
2708 ###### parser includes
2711 void *parser_run(struct token_state *tokens,
2712 const struct state states[],
2713 int (*do_reduce)(int, void**, struct token_config*, void*),
2714 void (*do_free)(short, void*),
2715 FILE *trace, const char *non_term[],
2716 struct token_config *config)
2718 struct parser p = { 0 };
2719 struct token *tk = NULL;
2723 shift(&p, TK_eof, 0, 1, NULL, states);
2725 struct token *err_tk;
2726 struct frame *tos = &p.stack[p.tos-1];
2728 tk = tok_copy(token_next(tokens));
2729 parser_trace(trace, &p,
2730 tk, states, non_term, config->known_count);
2732 if (tk->num == TK_in) {
2734 tos->since_newline = 0;
2735 tos->since_indent = 0;
2736 if (!states[tos->state].starts_line)
2737 tos->newline_permitted = 0;
2740 parser_trace_action(trace, "Record");
2743 if (tk->num == TK_out) {
2744 if (states[tos->state].reduce_size >= 0 &&
2745 states[tos->state].reduce_size <= tos->since_indent)
2747 if (states[tos->state].min_prefix >= tos->since_indent) {
2749 struct frame *in = tos - tos->since_indent;
2751 if (in->indents == 0) {
2752 /* Reassess since_indent and newline_permitted */
2754 in->since_indent = in[-1].since_indent + 1;
2755 in->newline_permitted = in[-1].newline_permitted;
2757 in->since_indent = 0;
2758 in->newline_permitted = 0;
2760 if (states[in->state].starts_line)
2761 in->newline_permitted = 1;
2764 in->since_indent = in[-1].since_indent + 1;
2765 if (states[in->state].starts_line)
2766 in->newline_permitted = 1;
2768 in->newline_permitted = in[-1].newline_permitted;
2773 parser_trace_action(trace, "Cancel");
2776 // fall through and force a REDUCE (as 'shift'
2779 if (tk->num == TK_newline) {
2780 if (!tos->newline_permitted) {
2783 parser_trace_action(trace, "Discard");
2786 if (tos->since_newline > 1 &&
2787 states[tos->state].reduce_size >= 0 &&
2788 states[tos->state].reduce_size <= tos->since_newline)
2791 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2793 parser_trace_action(trace, "Shift");
2797 if (states[tos->state].reduce_prod >= 0) {
2800 const struct state *nextstate = &states[tos->state];
2801 int prod = nextstate->reduce_prod;
2802 int size = nextstate->reduce_size;
2804 static char buf[16*1024];
2805 short indents, start_of_line;
2807 body = p.asn_stack + (p.tos - size);
2809 bufsize = do_reduce(prod, body, config, buf);
2811 indents = pop(&p, size, &start_of_line,
2813 res = memdup(buf, bufsize);
2814 memset(buf, 0, bufsize);
2815 if (!shift(&p, nextstate->reduce_sym,
2816 indents, start_of_line,
2818 if (prod != 0) abort();
2822 parser_trace_action(trace, "Reduce");
2825 if (tk->num == TK_out) {
2826 // Indent problem - synthesise tokens to get us
2828 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
2829 shift(&p, states[tos->state].shift_sym,
2830 0, 1, tok_copy(*tk), states);
2831 // FIXME need to report this error somehow
2832 parser_trace_action(trace, "Synthesize");
2835 /* Error. We walk up the stack until we
2836 * find a state which will accept TK_error.
2837 * We then shift in TK_error and see what state
2838 * that takes us too.
2839 * Then we discard input tokens until
2840 * we find one that is acceptable.
2842 parser_trace_action(trace, "ERROR");
2843 short indents = 0, start_of_line;
2845 err_tk = tok_copy(*tk);
2846 while (shift(&p, TK_error, 0, 0,
2847 err_tk, states) == 0
2849 // discard this state
2850 indents += pop(&p, 1, &start_of_line, do_free);
2853 // no state accepted TK_error
2856 tos = &p.stack[p.tos-1];
2857 while (search(&states[tos->state], tk->num) < 0 &&
2858 tk->num != TK_eof) {
2860 tk = tok_copy(token_next(tokens));
2861 if (tk->num == TK_in)
2863 if (tk->num == TK_out) {
2867 // FIXME update since_indent here
2870 if (p.tos == 0 && tk->num == TK_eof)
2872 tos = &p.stack[p.tos-1];
2873 tos->indents += indents;
2877 pop(&p, p.tos, NULL, do_free);
2883 ###### exported functions
2884 void *parser_run(struct token_state *tokens,
2885 const struct state states[],
2886 int (*do_reduce)(int, void**, struct token_config*, void*),
2887 void (*do_free)(short, void*),
2888 FILE *trace, const char *non_term[],
2889 struct token_config *config);
2893 Being able to visualize the parser in action can be invaluable when
2894 debugging the parser code, or trying to understand how the parse of a
2895 particular grammar progresses. The stack contains all the important
2896 state, so just printing out the stack every time around the parse loop
2897 can make it possible to see exactly what is happening.
2899 This doesn't explicitly show each SHIFT and REDUCE action. However they
2900 are easily deduced from the change between consecutive lines, and the
2901 details of each state can be found by cross referencing the states list
2902 in the stack with the "report" that parsergen can generate.
2904 For terminal symbols, we just dump the token. For non-terminals we
2905 print the name of the symbol. The look ahead token is reported at the
2906 end inside square brackets.
2908 ###### parser functions
2910 static char *reserved_words[] = {
2911 [TK_error] = "ERROR",
2914 [TK_newline] = "NEWLINE",
2917 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2919 fprintf(trace, "(%d", f->state);
2920 if (states[f->state].starts_line)
2921 fprintf(trace, "s");
2922 if (f->newline_permitted)
2923 fprintf(trace, "n%d", f->since_newline);
2924 fprintf(trace, ") ");
2927 void parser_trace(FILE *trace, struct parser *p,
2928 struct token *tk, const struct state states[],
2929 const char *non_term[], int knowns)
2934 for (i = 0; i < p->tos; i++) {
2935 struct frame *f = &p->stack[i];
2938 if (sym < TK_reserved &&
2939 reserved_words[sym] != NULL)
2940 fputs(reserved_words[sym], trace);
2941 else if (sym < TK_reserved + knowns) {
2942 struct token *t = p->asn_stack[i];
2943 text_dump(trace, t->txt, 20);
2945 fputs(non_term[sym - TK_reserved - knowns],
2948 fprintf(trace, ".%d", f->indents);
2949 if (f->since_newline == 0)
2953 parser_trace_state(trace, f, states);
2955 fprintf(trace, "[");
2956 if (tk->num < TK_reserved &&
2957 reserved_words[tk->num] != NULL)
2958 fputs(reserved_words[tk->num], trace);
2960 text_dump(trace, tk->txt, 20);
2964 void parser_trace_action(FILE *trace, char *action)
2967 fprintf(trace, " - %s\n", action);
2972 The obvious example for a parser is a calculator.
2974 As `scanner` provides a number parsing function using `libgmp`, it is not
2975 much work to perform arbitrary rational number calculations.
2977 This calculator takes one expression, or an equality test, per line. The
2978 results are printed and if any equality test fails, the program exits with
2981 ###### File: parsergen.mk
2982 calc.c calc.h : parsergen parsergen.mdc
2983 ./parsergen --tag calc -o calc parsergen.mdc
2984 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2985 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2990 // what do we use for a demo-grammar? A calculator of course.
3002 #include <sys/mman.h>
3007 #include "scanner.h"
3013 static void free_number(struct number *n)
3019 int main(int argc, char *argv[])
3021 int fd = open(argv[1], O_RDONLY);
3022 int len = lseek(fd, 0, 2);
3023 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3024 struct section *s = code_extract(file, file+len, NULL);
3025 struct token_config config = {
3026 .ignored = (1 << TK_line_comment)
3027 | (1 << TK_block_comment)
3030 .number_chars = ".,_+-",
3034 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3036 struct section *t = s->next;
3049 Session -> Session Line
3052 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3053 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3054 gmp_printf(" or as a decimal: %Fg\n", fl);
3058 | Expression = Expression NEWLINE ${
3059 if (mpq_equal($1.val, $3.val))
3060 gmp_printf("Both equal %Qd\n", $1.val);
3062 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3067 | NEWLINE ${ printf("Blank line\n"); }$
3068 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3071 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3072 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3073 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3074 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3075 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3076 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$