1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
8 Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
9 be used to describe the grammar and the parser can selectively ignore
10 these where they aren't relevant.
12 There are several distinct sections.
14 1. `grammar_read` will read a grammar from a literate-code file and
15 store the productions, symbols, and code fragments.
16 2. `grammar_analyse` will take that grammar and build LR parsing
17 states and look-ahead tables.
18 3. `grammar_report` will present the details of the analysis
19 in a readable format and will report any conflicts.
20 4. `parser_generate` will write out C code files with various
21 tables and with the code fragments arranged in useful places.
22 5. `parser_run` is a library function intended to be linked together
23 with the generated parser tables to complete the implementation of
25 6. Finally `calc` is a test grammar for a simple calculator. The
26 `parsergen` program built from the C code in this file can extract
27 that grammar directly from this file and process it.
29 ###### File: parsergen.c
34 ## forward declarations
45 ###### File: libparser.c
52 ###### File: parsergen.mk
55 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
58 ## Reading the grammar
60 The grammar must be stored in a markdown literate code file as parsed by
61 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
62 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
63 present, will be literally copied into the generated `.c` and `.h`
64 files. The third is required and contains the grammar which is
65 tokenised with "[scanner][]". `reduce` can provide variable
66 declarations and common initialization code for all the reduction code
67 fragments in the grammar.
69 If the `--tag` option is given, then any top level header that doesn't
70 start with the tag is ignored, and the tag is stripped from the rest. So
71 `--tag Foo` means that the four sections expected are `Foo: header`,
72 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
73 to extract the sample calculator from this file.
76 [scanner]: scanner.html
82 ###### parser includes
86 The grammar contains production sets, precedence/associativity
87 declarations, general terminal declarations, and data type declarations.
88 These are all parsed with _ad hoc_ parsing as we don't have a parser
91 The precedence and associativity can be set for each production, and
92 can be inherited from symbols. The data type (either structure or a
93 reference to a structure) is potentially defined for each non-terminal
94 and describes what C structure is used to store information about each
98 enum assoc {Left, Right, Non};
99 char *assoc_names[] = {"Left","Right","Non"};
102 struct text struct_name;
105 unsigned short precedence;
109 unsigned short precedence;
117 The strings reported by `mdcode` and `scanner` are `struct text` which
118 have length rather than being nul terminated. To help with printing and
119 comparing we define `text_is` and `prtxt`, which should possibly go in
120 `mdcode`. `scanner` does provide `text_dump` which is useful for
121 strings which might contain control characters.
123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
124 because that is what we need to detect tags.
127 static int text_is(struct text t, char *s)
129 return (strlen(s) == t.len &&
130 strncmp(s, t.txt, t.len) == 0);
132 static void prtxt(struct text t)
134 printf("%.*s", t.len, t.txt);
137 static int strip_tag(struct text *t, char *tag)
139 int skip = strlen(tag) + 1;
140 if (skip >= t->len ||
141 strncmp(t->txt, tag, skip-1) != 0 ||
142 t->txt[skip-1] != ':')
144 while (skip < t->len && t->txt[skip] == ' ')
153 Productions are comprised primarily of symbols - terminal and
154 non-terminal. We do not make a syntactic distinction between the two,
155 though non-terminals must be identifiers while terminals can also be
156 marks. Non-terminal symbols are those which appear in the head of a
157 production, terminal symbols are those which don't. There are also
158 "virtual" symbols used for precedence marking discussed later, and
159 sometimes we won't know what type a symbol is yet.
161 To help with code safety it is possible to declare the terminal symbols.
162 If this is done, then any symbol used in a production that does not
163 appear in the head of any production and is not declared as a terminal
164 is treated as an error.
166 ###### forward declarations
167 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
168 char *symtypes = "UVTN";
171 ###### grammar fields
172 int terminals_declared;
174 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
175 table of known symbols and the resulting parser will report them as
176 `TK_reserved + N`. A small set of identifiers are reserved for the
177 different token types that `scanner` can report, and an even smaller set
178 are reserved for a special token that the parser can generate (`EOL`) as
179 will be described later. This latter set cannot use predefined numbers,
180 so they are marked as `isspecial` for now and will get assigned a number
181 with the non-terminals later.
185 static struct { int num; char *name; } reserved_words[] = {
186 { TK_error, "ERROR" },
187 { TK_number, "NUMBER" },
188 { TK_ident, "IDENTIFIER" },
190 { TK_string, "STRING" },
191 { TK_multi_string, "MULTI_STRING" },
194 { TK_newline, "NEWLINE" },
201 unsigned int isspecial:1;
203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
204 recognised. The former is automatically expected at the end of the text
205 being parsed. The latter are always ignored by the parser.
207 All of these symbols are stored in a simple symbol table. We use the
208 `struct text` from `mdcode` to store the name and link them together
209 into a sorted list using an insertion sort.
211 We don't have separate `find` and `insert` functions as any symbol we
212 find needs to be remembered. We simply expect `find` to always return a
213 symbol, but its type might be `Unknown`.
222 ###### grammar fields
227 static struct symbol *sym_find(struct grammar *g, struct text s)
229 struct symbol **l = &g->syms;
234 (cmp = text_cmp((*l)->name, s)) < 0)
238 n = calloc(1, sizeof(*n));
247 static void symbols_init(struct grammar *g)
249 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
251 for (i = 0; i < entries; i++) {
254 t.txt = reserved_words[i].name;
255 t.len = strlen(t.txt);
258 s->num = reserved_words[i].num;
263 ### Data types and precedence.
265 Data type specification, precedence specification, and declaration of
266 terminals are all introduced by a dollar sign at the start of the line.
267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
268 precedence, if it is `TERM` then the line declares terminals without
269 precedence, otherwise it specifies a data type.
271 The data type name is simply stored and applied to the head of all
272 subsequent productions. It must be the name of a structure optionally
273 preceded by an asterisk which means a reference or "pointer". So
274 `$expression` maps to `struct expression` and `$*statement` maps to
275 `struct statement *`.
277 Any productions given before the first data type declaration will have
278 no data type associated with them and can carry no information. In
279 order to allow other non-terminals to have no type, the data type
280 `$void` can be given. This does *not* mean that `struct void` will be
281 used, but rather that no type will be associated with subsequent
284 The precedence line must contain a list of symbols, either terminal
285 symbols or virtual symbols. It can only contain symbols that have not
286 been seen yet, so precedence declaration must precede the productions
289 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
290 carry precedence information but are not included in the grammar. A
291 production can declare that it inherits the precedence of a given
294 This use of `$$` precludes it from being used as a symbol in the
295 described language. Two other symbols: `${` and `}$` are also
298 Each new precedence line introduces a new precedence level and
299 declares how it associates. This level is stored in each symbol
300 listed and may be inherited by any production which uses the symbol. A
301 production inherits from the last symbol which has a precedence.
303 The symbols on the first precedence line have the lowest precedence.
304 Subsequent lines introduce symbols with higher precedence and so bind
307 Note that a declaration line (or "dollar line") can start with either of
308 two different marks: "$" or "$*". The `dollar_line()` function defined
309 here is told which was found in the `isref` argument.
311 ###### grammar fields
312 struct text current_type;
317 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
318 static const char *known[] = { "$$", "${", "}$" };
321 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
323 struct token t = token_next(ts);
329 if (t.num != TK_ident) {
330 err = "type or assoc expected after '$'";
333 if (text_is(t.txt, "LEFT"))
335 else if (text_is(t.txt, "RIGHT"))
337 else if (text_is(t.txt, "NON"))
339 else if (text_is(t.txt, "TERM")) {
341 g->terminals_declared = 1;
343 g->current_type = t.txt;
344 g->type_isref = isref;
345 if (text_is(t.txt, "void"))
346 g->current_type.txt = NULL;
348 if (t.num != TK_newline) {
349 err = "Extra tokens after type name";
356 err = "$* cannot be followed by a precedence";
360 // This is a precedence or TERM line, need some symbols.
364 while (t.num != TK_newline) {
365 enum symtype type = Terminal;
367 if (t.num == TK_virtual) {
370 if (t.num != TK_ident) {
371 err = "$$ must be followed by a word";
375 err = "Virtual symbols not permitted on $TERM line";
378 } else if (t.num != TK_ident &&
380 err = "Illegal token in precedence line";
383 s = sym_find(g, t.txt);
384 if (s->type != Unknown) {
385 err = "Symbols in precedence/TERM line must not already be known.";
390 s->precedence = g->prec_levels;
397 err = "No symbols given on precedence/TERM line";
402 while (t.num != TK_newline && t.num != TK_eof)
409 A production either starts with an identifier which is the head
410 non-terminal, or a vertical bar (`|`) in which case this production
411 uses the same head as the previous one. The identifier must be
412 followed by a `->` mark. All productions for a given non-terminal must
413 be grouped together with the `nonterminal ->` given only once.
415 After this a (possibly empty) sequence of identifiers and marks form
416 the body of the production. A virtual symbol may be given after the
417 body by preceding it with `$$`. If a virtual symbol is given, the
418 precedence of the production is that for the virtual symbol. If none
419 is given, the precedence is inherited from the last symbol in the
420 production which has a precedence specified.
422 After the optional precedence may come the `${` mark. This indicates
423 the start of a code fragment. If present, this must be on the same
424 line as the start of the production.
426 All of the text from the `${` through to the matching `}$` is
427 collected and forms the code-fragment for the production. It must all
428 be in one `code_node` of the literate code. The `}$` must be
429 at the end of a line.
431 Text in the code fragment will undergo substitutions where `$N` or
432 `$<N`, for some numeric `N` (or non-numeric indicator as described
433 later), will be replaced with a variable holding the parse information
434 for the particular symbol in the production. `$0` is the head of the
435 production, `$1` is the first symbol of the body, etc. The type of `$N`
436 for a terminal symbol is `struct token`. For a non-terminal, it is
437 whatever has been declared for that symbol. The `<` may be included and
438 means that the value (usually a reference) is being moved out, so it
439 will not automatically be freed. The effect of using '<' is that the
440 variable is cleared to all-zeros.
442 While building productions we will need to add to an array which needs to
446 static void array_add(void *varray, int *cnt, void *new)
448 void ***array = varray;
451 current = ((*cnt-1) | (step-1))+1;
452 if (*cnt == current) {
455 *array = realloc(*array, current * sizeof(void*));
457 (*array)[*cnt] = new;
461 Collecting the code fragment simply involves reading tokens until we
462 hit the end token `}$`, and noting the character position of the start and
466 static struct text collect_code(struct token_state *state,
471 code.txt = start.txt.txt + start.txt.len;
473 t = token_next(state);
474 while (t.node == start.node &&
475 t.num != TK_close && t.num != TK_error &&
477 if (t.num == TK_close && t.node == start.node)
478 code.len = t.txt.txt - code.txt;
484 Now we have all the bits we need to parse a full production.
489 ###### grammar fields
490 struct production **productions;
491 int production_count;
493 ###### production fields
495 struct symbol **body;
501 int first_production;
504 static char *parse_production(struct grammar *g,
506 struct token_state *state)
508 /* Head has already been parsed. */
511 struct production p, *pp;
513 memset(&p, 0, sizeof(p));
515 tk = token_next(state);
516 while (tk.num == TK_ident || tk.num == TK_mark) {
517 struct symbol *bs = sym_find(g, tk.txt);
518 if (bs->type == Virtual) {
519 err = "Virtual symbol not permitted in production";
522 if (bs->precedence) {
523 p.precedence = bs->precedence;
526 array_add(&p.body, &p.body_size, bs);
527 tk = token_next(state);
529 if (tk.num == TK_virtual) {
531 tk = token_next(state);
532 if (tk.num != TK_ident) {
533 err = "word required after $$";
536 vs = sym_find(g, tk.txt);
537 if (vs->precedence == 0) {
538 err = "symbol after $$ must have precedence";
541 p.precedence = vs->precedence;
544 tk = token_next(state);
546 if (tk.num == TK_open) {
547 p.code_line = tk.line;
548 p.code = collect_code(state, tk);
549 if (p.code.txt == NULL) {
550 err = "code fragment not closed properly";
553 tk = token_next(state);
556 if (tk.num != TK_newline && tk.num != TK_eof) {
557 err = "stray tokens at end of line";
560 pp = malloc(sizeof(*pp));
562 array_add(&g->productions, &g->production_count, pp);
565 while (tk.num != TK_newline && tk.num != TK_eof)
566 tk = token_next(state);
571 With the ability to parse productions and dollar-lines, we have nearly all
572 that we need to parse a grammar from a `code_node`.
574 The head of the first production will effectively be the `start` symbol
575 of the grammar. However it won't _actually_ be so. Processing the
576 grammar is greatly simplified if the real start symbol only has a single
577 production, and expects `$eof` as the final terminal. So when we find
578 the first explicit production we insert an extra implicit production as
579 production zero which looks like
581 ###### Example: production 0
584 where `START` is the first non-terminal given.
586 ###### create production zero
587 struct production *p = calloc(1,sizeof(*p));
588 struct text start = {"$start",6};
589 struct text eof = {"$eof",4};
590 struct text code = {"$0 = $<1;", 9};
591 p->head = sym_find(g, start);
592 p->head->type = Nonterminal;
593 p->head->struct_name = g->current_type;
594 p->head->isref = g->type_isref;
595 if (g->current_type.txt)
597 array_add(&p->body, &p->body_size, head);
598 array_add(&p->body, &p->body_size, sym_find(g, eof));
599 p->head->first_production = g->production_count;
600 array_add(&g->productions, &g->production_count, p);
602 Now we are ready to read in the grammar. We ignore comments
603 and strings so that the marks which introduce them can be
604 used as terminals in the grammar. We don't ignore numbers
605 even though we don't allow them as that causes the scanner
606 to produce errors that the parser is better positioned to handle.
607 We also ignore indentation, but we expect and use newlines.
609 To allow comments in the grammar, we explicitly check for "//" as the
610 first token on the line and those lines are skipped. "//" can still be
611 used as a terminal anywhere that a terminal is expected.
614 static struct grammar *grammar_read(struct code_node *code)
616 struct token_config conf = {
619 .words_marks = known,
620 .known_count = sizeof(known)/sizeof(known[0]),
622 .ignored = (1 << TK_line_comment)
623 | (1 << TK_block_comment)
626 | (1 << TK_multi_string)
631 struct token_state *state = token_open(code, &conf);
633 struct symbol *head = NULL;
637 g = calloc(1, sizeof(*g));
640 for (tk = token_next(state); tk.num != TK_eof;
641 tk = token_next(state)) {
642 if (tk.num == TK_newline)
644 if (tk.num == TK_ident) {
646 head = sym_find(g, tk.txt);
647 if (head->type == Nonterminal)
648 err = "This non-terminal has already be used.";
649 else if (head->type == Virtual)
650 err = "Virtual symbol not permitted in head of production";
652 head->type = Nonterminal;
653 head->struct_name = g->current_type;
654 head->isref = g->type_isref;
655 if (g->production_count == 0) {
656 ## create production zero
658 head->first_production = g->production_count;
659 tk = token_next(state);
660 if (tk.num == TK_mark &&
661 text_is(tk.txt, "->"))
662 err = parse_production(g, head, state);
664 err = "'->' missing in production";
666 } else if (tk.num == TK_mark
667 && text_is(tk.txt, "|")) {
668 // another production for same non-term
670 err = parse_production(g, head, state);
672 err = "First production must have a head";
673 } else if (tk.num == TK_mark
674 && text_is(tk.txt, "$")) {
675 err = dollar_line(state, g, 0);
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "$*")) {
678 err = dollar_line(state, g, 1);
679 } else if (tk.num == TK_mark
680 && text_is(tk.txt, "//")) {
681 while (tk.num != TK_newline &&
683 tk = token_next(state);
685 err = "Unrecognised token at start of line.";
693 for (s = g->syms; s; s = s->next) {
694 if (s->type != Unknown)
696 if (!g->terminals_declared) {
700 err = "not declared";
701 fprintf(stderr, "Token %.*s not declared\n",
702 s->name.len, s->name.txt);
705 free(g); // FIXME free content
710 fprintf(stderr, "Error at line %d: %s\n",
713 free(g); // FIXME free content
717 ## Analysing the grammar
719 The central task in analysing the grammar is to determine a set of
720 states to drive the parsing process. This will require finding various
721 sets of symbols and of "items". Some of these sets will need to attach
722 information to each element in the set, so they are more like maps.
724 Each "item" may have a 'look-ahead' or `LA` set associated with it.
725 Many of these will be the same as each other. To avoid memory wastage,
726 and to simplify some comparisons of sets, these sets will be stored in a
727 list of unique sets, each assigned a number.
729 Once we have the data structures in hand to manage these sets and lists,
730 we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
731 sets, and then create the item sets which define the various states.
735 Though we don't only store symbols in these sets, they are the main
736 things we store, so they are called symbol sets or "symsets".
738 A symset has a size, an array of shorts, and an optional array of data
739 values, which are also shorts. If the array of data is not present, we
740 store an impossible pointer, as `NULL` is used to indicate that no
741 memory has been allocated yet.
746 unsigned short *syms, *data;
748 #define NO_DATA ((unsigned short *)1)
749 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
750 const struct symset INIT_DATASET = { 0, NULL, NULL };
752 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
753 using an insertion sort. We don't explicitly record the amount of
754 allocated space as it can be derived directly from the current `cnt`
755 using `((cnt - 1) | 7) + 1`.
758 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
761 int current = ((s->cnt-1) | 7) + 1;
762 if (current == s->cnt) {
764 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
765 if (s->data != NO_DATA)
766 s->data = realloc(s->data, sizeof(*s->data) * current);
769 while (i > 0 && s->syms[i-1] > key) {
770 s->syms[i] = s->syms[i-1];
771 if (s->data != NO_DATA)
772 s->data[i] = s->data[i-1];
776 if (s->data != NO_DATA)
781 Finding a symbol (or item) in a `symset` uses a simple binary search.
782 We return the index where the value was found (so data can be accessed),
783 or `-1` to indicate failure.
785 static int symset_find(struct symset *ss, unsigned short key)
792 while (lo + 1 < hi) {
793 int mid = (lo + hi) / 2;
794 if (ss->syms[mid] <= key)
799 if (ss->syms[lo] == key)
804 We will often want to form the union of two symsets. When we do, we
805 will often want to know if anything changed (as that might mean there is
806 more work to do). So `symset_union` reports whether anything was added
807 to the first set. We use a slow quadratic approach as these sets are
808 rarely large. If profiling shows this to be a problem it can be
811 static int symset_union(struct symset *a, struct symset *b)
815 for (i = 0; i < b->cnt; i++)
816 if (symset_find(a, b->syms[i]) < 0) {
817 unsigned short data = 0;
818 if (b->data != NO_DATA)
820 symset_add(a, b->syms[i], data);
826 And of course we must be able to free a symset.
828 static void symset_free(struct symset ss)
831 if (ss.data != NO_DATA)
837 Some symsets are simply stored somewhere appropriate in the `grammar`
838 data structure, others need a bit of indirection. This applies
839 particularly to "LA" sets. These will be paired with "items" in an
840 "itemset". As itemsets will be stored in a symset, the "LA" set needs
841 to be stored in the `data` field, so we need a mapping from a small
842 (short) number to an LA `symset`.
844 As mentioned earlier this involves creating a list of unique symsets.
846 For now, we just use a linear list sorted by insertion. A skiplist
847 would be more efficient and may be added later.
854 struct setlist *next;
857 ###### grammar fields
858 struct setlist *sets;
863 static int ss_cmp(struct symset a, struct symset b)
866 int diff = a.cnt - b.cnt;
869 for (i = 0; i < a.cnt; i++) {
870 diff = (int)a.syms[i] - (int)b.syms[i];
877 static int save_set(struct grammar *g, struct symset ss)
879 struct setlist **sl = &g->sets;
883 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
890 s = malloc(sizeof(*s));
899 Finding a set by number is currently performed by a simple linear
900 search. If this turns out to hurt performance, we can store the sets in
901 a dynamic array like the productions.
903 static struct symset set_find(struct grammar *g, int num)
905 struct setlist *sl = g->sets;
906 while (sl && sl->num != num)
911 ### Setting `nullable`
913 We set `nullable` on the head symbol for any production for which all
914 the body symbols (if any) are nullable. As this is a recursive
915 definition, any change in the `nullable` setting means that we need to
916 re-evaluate where it needs to be set.
918 We simply loop around performing the same calculations until no more
925 static void set_nullable(struct grammar *g)
928 while (check_again) {
931 for (p = 0; p < g->production_count; p++) {
932 struct production *pr = g->productions[p];
935 if (pr->head->nullable)
937 for (s = 0; s < pr->body_size; s++)
938 if (! pr->body[s]->nullable)
940 if (s == pr->body_size) {
941 pr->head->nullable = 1;
948 ### Building the `first` sets
950 When calculating what can follow a particular non-terminal, we will need
951 to know what the "first" terminal in any subsequent non-terminal might
952 be. So we calculate the `first` set for every non-terminal and store
953 them in an array. We don't bother recording the "first" set for
954 terminals as they are trivial.
956 As the "first" for one symbol might depend on the "first" of another, we
957 repeat the calculations until no changes happen, just like with
958 `set_nullable`. This makes use of the fact that `symset_union` reports
959 if any change happens.
961 The core of this, which finds the "first" of part of a production body,
962 will be reused for computing the "follow" sets, so we split that out
963 into a separate function, `add_first`, which also reports if it got all
964 the way to the end of the production without finding a non-nullable
967 ###### grammar fields
968 struct symset *first;
972 static int add_first(struct production *pr, int start,
973 struct symset *target, struct grammar *g,
978 for (s = start; s < pr->body_size; s++) {
979 struct symbol *bs = pr->body[s];
980 if (bs->type == Terminal) {
981 if (symset_find(target, bs->num) < 0) {
982 symset_add(target, bs->num, 0);
986 } else if (symset_union(target, &g->first[bs->num]))
992 *to_end = (s == pr->body_size);
996 static void build_first(struct grammar *g)
1000 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1001 for (s = 0; s < g->num_syms; s++)
1002 g->first[s] = INIT_SYMSET;
1004 while (check_again) {
1007 for (p = 0; p < g->production_count; p++) {
1008 struct production *pr = g->productions[p];
1009 struct symset *head = &g->first[pr->head->num];
1011 if (add_first(pr, 0, head, g, NULL))
1017 ### Building the `follow` sets.
1019 There are two different situations when we will want to generate
1020 "follow" sets. If we are doing an SLR analysis, we want to generate a
1021 single "follow" set for each non-terminal in the grammar. That is what
1022 is happening here. If we are doing an LALR or LR analysis we will want
1023 to generate a separate "LA" set for each item. We do that later in
1026 There are two parts to generating a "follow" set. Firstly we look at
1027 every place that any non-terminal appears in the body of any production,
1028 and we find the set of possible "first" symbols after there. This is
1029 done using `add_first` above and only needs to be done once as the
1030 "first" sets are now stable and will not change.
1032 ###### grammar fields
1033 struct symset *follow;
1037 for (p = 0; p < g->production_count; p++) {
1038 struct production *pr = g->productions[p];
1041 for (b = 0; b < pr->body_size - 1; b++) {
1042 struct symbol *bs = pr->body[b];
1043 if (bs->type == Terminal)
1045 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1049 The second part is to add the "follow" set of the head of a production
1050 to the "follow" sets of the final symbol in the production, and any
1051 other symbol which is followed only by `nullable` symbols. As this
1052 depends on "follow" itself we need to repeatedly perform the process
1053 until no further changes happen.
1055 Rather than 2 nested loops, one that repeats the whole process until
1056 there is no change, and one that iterates over the set of productions, we
1057 combine these two functions into a single loop.
1061 for (check_again = 0, p = 0;
1062 p < g->production_count;
1063 p < g->production_count-1
1064 ? p++ : check_again ? (check_again = 0, p = 0)
1066 struct production *pr = g->productions[p];
1069 for (b = pr->body_size - 1; b >= 0; b--) {
1070 struct symbol *bs = pr->body[b];
1071 if (bs->type == Terminal)
1073 if (symset_union(&g->follow[bs->num],
1074 &g->follow[pr->head->num]))
1081 We now just need to create and initialise the `follow` list to get a
1085 static void build_follow(struct grammar *g)
1087 int s, check_again, p;
1088 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1089 for (s = 0; s < g->num_syms; s++)
1090 g->follow[s] = INIT_SYMSET;
1094 ### Building itemsets and states
1096 There are three different levels of detail that can be chosen for
1097 building the itemsets and states for the LR grammar. They are:
1099 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1100 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1102 2. LALR(1) where we build look-ahead sets with each item and merge
1103 the LA sets when we find two paths to the same "kernel" set of items.
1104 3. LR(1) where different look-ahead for any item in the set means
1105 a different item set must be created.
1107 ###### forward declarations
1108 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1110 We need to be able to look through existing states to see if a newly
1111 generated state already exists. For now we use a simple sorted linked
1114 An item is a pair of numbers: the production number and the position of
1115 "DOT", which is an index into the body of the production. As the
1116 numbers are not enormous we can combine them into a single "short" and
1117 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1118 production number (so 2000 productions with maximum of 31 symbols in the
1121 Comparing the itemsets will be a little different to comparing symsets
1122 as we want to do the lookup after generating the "kernel" of an
1123 itemset, so we need to ignore the offset=zero items which are added during
1126 To facilitate this, we modify the "DOT" number so that "0" sorts to
1127 the end of the list in the symset, and then only compare items before
1131 static inline unsigned short item_num(int production, int dot)
1133 return production | ((31-dot) << 11);
1135 static inline int item_prod(unsigned short item)
1137 return item & 0x7ff;
1139 static inline int item_dot(unsigned short item)
1141 return (31-(item >> 11)) & 0x1f;
1144 For LR(1) analysis we need to compare not just the itemset in a state
1145 but also the LA sets. As we assign each unique LA set a number, we
1146 can just compare the symset and the data values together.
1149 static int itemset_cmp(struct symset a, struct symset b,
1150 enum grammar_type type)
1156 i < a.cnt && i < b.cnt &&
1157 item_dot(a.syms[i]) > 0 &&
1158 item_dot(b.syms[i]) > 0;
1160 int diff = a.syms[i] - b.syms[i];
1164 diff = a.data[i] - b.data[i];
1169 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1173 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1179 if (type < LR1 || av == -1)
1182 a.data[i] - b.data[i];
1185 It will be helpful to know if an itemset has been "completed" or not,
1186 particularly for LALR where itemsets get merged, at which point they
1187 need to be considered for completion again. So a `completed` flag is
1190 And now we can build the list of itemsets. The lookup routine returns
1191 both a success flag and a pointer to where in the list an insert should
1192 happen, so we don't need to search a second time.
1196 struct itemset *next;
1198 struct symset items;
1199 struct symset go_to;
1201 unsigned short precedence;
1205 ###### grammar fields
1206 struct itemset *items;
1210 static int itemset_find(struct grammar *g, struct itemset ***where,
1211 struct symset kernel, enum grammar_type type)
1213 struct itemset **ip;
1215 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1216 struct itemset *i = *ip;
1218 diff = itemset_cmp(i->items, kernel, type);
1231 Adding an itemset may require merging the LA sets if LALR analysis is
1232 happening. If any new LA set adds any symbols that weren't in the old
1233 LA set, we clear the `completed` flag so that the dependants of this
1234 itemset will be recalculated and their LA sets updated.
1236 `add_itemset` must consume the symsets it is passed, either by adding
1237 them to a data structure, or freeing them.
1239 static int add_itemset(struct grammar *g, struct symset ss,
1240 enum assoc assoc, unsigned short precedence,
1241 enum grammar_type type)
1243 struct itemset **where, *is;
1245 int found = itemset_find(g, &where, ss, type);
1247 is = calloc(1, sizeof(*is));
1248 is->state = g->states;
1252 is->precedence = precedence;
1254 is->go_to = INIT_DATASET;
1263 for (i = 0; i < ss.cnt; i++) {
1264 struct symset temp = INIT_SYMSET, s;
1265 if (ss.data[i] == is->items.data[i])
1267 s = set_find(g, is->items.data[i]);
1268 symset_union(&temp, &s);
1269 s = set_find(g, ss.data[i]);
1270 if (symset_union(&temp, &s)) {
1271 is->items.data[i] = save_set(g, temp);
1282 To build all the itemsets, we first insert the initial itemset made from
1283 production zero, complete each itemset, and then generate new itemsets
1284 from old until no new ones can be made.
1286 Completing an itemset means finding all the items where "DOT" is
1287 followed by a nonterminal and adding "DOT=0" items for every production
1288 from that non-terminal - providing each item hasn't already been added.
1289 When we add such an item it might get inserted before the current
1290 one, so to make sure we process it we reset the loop counter to the
1293 If LA sets are needed, the LA set for each new item is found using
1294 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1295 may be supplemented by the LA set for the item which produced the new
1298 We also collect a set of all symbols which follow "DOT" (in `done`) as
1299 this is used in the next stage.
1301 When itemsets are created we assign a precedence to the itemset from the
1302 complete item, if there is one. We ignore the possibility of there
1303 being two and don't (currently) handle precedence in such grammars.
1304 When completing a grammar we ignore any item where DOT is followed by a
1305 terminal with a precedence lower than that for the itemset. Unless the
1306 terminal has right associativity, we also ignore items where the
1307 terminal has the same precedence. The result is that unwanted items are
1308 still in the itemset, but the terminal doesn't get into the go to set,
1309 so the item is ineffective.
1311 ###### complete itemset
1312 for (i = 0; i < is->items.cnt; i++) {
1313 int p = item_prod(is->items.syms[i]);
1314 int bs = item_dot(is->items.syms[i]);
1315 struct production *pr = g->productions[p];
1318 struct symset LA = INIT_SYMSET;
1319 unsigned short sn = 0;
1321 if (bs == pr->body_size)
1324 if (s->precedence && is->precedence &&
1325 is->precedence > s->precedence)
1326 /* This terminal has a low precedence and
1327 * shouldn't be shifted
1330 if (s->precedence && is->precedence &&
1331 is->precedence == s->precedence && s->assoc != Right)
1332 /* This terminal has a matching precedence and is
1333 * not Right-associative, so we mustn't shift it.
1336 if (symset_find(&done, s->num) < 0)
1337 symset_add(&done, s->num, 0);
1339 if (s->type != Nonterminal)
1345 add_first(pr, bs+1, &LA, g, &to_end);
1347 struct symset ss = set_find(g, is->items.data[i]);
1348 symset_union(&LA, &ss);
1350 sn = save_set(g, LA);
1351 LA = set_find(g, sn);
1354 /* Add productions for this symbol */
1355 for (p2 = s->first_production;
1356 p2 < g->production_count &&
1357 g->productions[p2]->head == s;
1359 int itm = item_num(p2, 0);
1360 int pos = symset_find(&is->items, itm);
1362 symset_add(&is->items, itm, sn);
1363 /* Will have re-ordered, so start
1364 * from beginning again */
1366 } else if (type >= LALR) {
1367 struct symset ss = set_find(g, is->items.data[pos]);
1368 struct symset tmp = INIT_SYMSET;
1369 struct symset *la = &LA;
1371 symset_union(&tmp, &ss);
1372 if (symset_union(&tmp, la)) {
1373 is->items.data[pos] = save_set(g, tmp);
1381 For each symbol we found (and placed in `done`) we collect all the items
1382 for which this symbol is next, and create a new itemset with "DOT"
1383 advanced over the symbol. This is then added to the collection of
1384 itemsets (or merged with a pre-existing itemset).
1386 ###### derive itemsets
1387 // Now we have a completed itemset, so we need to
1388 // compute all the 'next' itemsets and create them
1389 // if they don't exist.
1390 for (i = 0; i < done.cnt; i++) {
1392 unsigned short state;
1393 struct symbol *sym = g->symtab[done.syms[i]];
1394 enum assoc assoc = Non;
1395 unsigned short precedence = 0;
1396 struct symset newitemset = INIT_SYMSET;
1398 newitemset = INIT_DATASET;
1400 for (j = 0; j < is->items.cnt; j++) {
1401 int itm = is->items.syms[j];
1402 int p = item_prod(itm);
1403 int bp = item_dot(itm);
1404 struct production *pr = g->productions[p];
1405 unsigned short la = 0;
1408 if (bp == pr->body_size)
1410 if (pr->body[bp] != sym)
1415 la = is->items.data[j];
1416 if (bp == pr->body_size &&
1417 pr->precedence > 0 &&
1418 pr->precedence > precedence) {
1419 // new itemset is reducible and has a precedence.
1420 precedence = pr->precedence;
1423 pos = symset_find(&newitemset, item_num(p, bp));
1426 symset_add(&newitemset, item_num(p, bp), la);
1427 else if (type >= LALR) {
1428 // Need to merge la set.
1429 int la2 = newitemset.data[pos];
1431 struct symset ss = set_find(g, la2);
1432 struct symset LA = INIT_SYMSET;
1433 symset_union(&LA, &ss);
1434 ss = set_find(g, la);
1435 if (symset_union(&LA, &ss))
1436 newitemset.data[pos] = save_set(g, LA);
1442 state = add_itemset(g, newitemset, assoc, precedence, type);
1443 if (symset_find(&is->go_to, done.syms[i]) < 0)
1444 symset_add(&is->go_to, done.syms[i], state);
1447 All that is left is to create the initial itemset from production zero, and
1448 with `TK_eof` as the LA set.
1451 static void build_itemsets(struct grammar *g, enum grammar_type type)
1453 struct symset first = INIT_SYMSET;
1456 unsigned short la = 0;
1458 // LA set just has eof
1459 struct symset eof = INIT_SYMSET;
1460 symset_add(&eof, TK_eof, 0);
1461 la = save_set(g, eof);
1462 first = INIT_DATASET;
1464 // production 0, offset 0 (with no data)
1465 symset_add(&first, item_num(0, 0), la);
1466 add_itemset(g, first, Non, 0, type);
1467 for (check_again = 0, is = g->items;
1469 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1471 struct symset done = INIT_SYMSET;
1482 ### Completing the analysis.
1484 The exact process of analysis depends on which level was requested. For
1485 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1486 `LR1` we need first, but not follow. For `SLR` we need both.
1488 We don't build the "action" tables that you might expect as the parser
1489 doesn't use them. They will be calculated to some extent if needed for
1492 Once we have built everything we allocate arrays for the two lists:
1493 symbols and itemsets. This allows more efficient access during
1494 reporting. The symbols are grouped as terminals, then non-terminals,
1495 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1496 Special terminals -- meaning just EOL -- are included with the
1497 non-terminals so that they are not expected by the scanner.
1499 ###### grammar fields
1500 struct symbol **symtab;
1501 struct itemset **statetab;
1504 ###### grammar_analyse
1506 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1510 int snum = TK_reserved;
1511 for (s = g->syms; s; s = s->next)
1512 if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1516 g->first_nonterm = snum;
1517 for (s = g->syms; s; s = s->next)
1518 if (s->num < 0 && s->type != Virtual) {
1522 for (s = g->syms; s; s = s->next)
1528 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1529 for (s = g->syms; s; s = s->next)
1530 g->symtab[s->num] = s;
1539 build_itemsets(g, type);
1541 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1542 for (is = g->items; is ; is = is->next)
1543 g->statetab[is->state] = is;
1546 ## Reporting on the Grammar
1548 The purpose of the report is to give the grammar developer insight into
1549 how the grammar parser will work. It is basically a structured dump of
1550 all the tables that have been generated, plus a description of any conflicts.
1552 ###### grammar_report
1553 static int grammar_report(struct grammar *g, enum grammar_type type)
1559 return report_conflicts(g, type);
1562 Firstly we have the complete list of symbols, together with the
1563 "FIRST" set if that was generated. We add a mark to each symbol to
1564 show if it is nullable (`.`).
1568 static void report_symbols(struct grammar *g)
1572 printf("SYMBOLS + FIRST:\n");
1574 printf("SYMBOLS:\n");
1576 for (n = 0; n < g->num_syms; n++) {
1577 struct symbol *s = g->symtab[n];
1581 printf(" %c%3d%c: ",
1582 s->nullable ? '.':' ',
1583 s->num, symtypes[s->type]);
1586 printf(" (%d%s)", s->precedence,
1587 assoc_names[s->assoc]);
1589 if (g->first && s->type == Nonterminal) {
1592 for (j = 0; j < g->first[n].cnt; j++) {
1595 prtxt(g->symtab[g->first[n].syms[j]]->name);
1602 Then we have the follow sets if they were computed.
1604 static void report_follow(struct grammar *g)
1607 printf("FOLLOW:\n");
1608 for (n = 0; n < g->num_syms; n++)
1609 if (g->follow[n].cnt) {
1613 prtxt(g->symtab[n]->name);
1614 for (j = 0; j < g->follow[n].cnt; j++) {
1617 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1623 And finally the item sets. These include the GO TO tables and, for
1624 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1625 it up a bit. First the items, with production number and associativity.
1627 static void report_item(struct grammar *g, int itm)
1629 int p = item_prod(itm);
1630 int dot = item_dot(itm);
1631 struct production *pr = g->productions[p];
1635 prtxt(pr->head->name);
1637 for (i = 0; i < pr->body_size; i++) {
1638 printf(" %s", (dot == i ? ". ": ""));
1639 prtxt(pr->body[i]->name);
1641 if (dot == pr->body_size)
1644 if (pr->precedence && dot == pr->body_size)
1645 printf(" (%d%s)", pr->precedence,
1646 assoc_names[pr->assoc]);
1647 if (dot < pr->body_size &&
1648 pr->body[dot]->precedence) {
1649 struct symbol *s = pr->body[dot];
1650 printf(" [%d%s]", s->precedence,
1651 assoc_names[s->assoc]);
1656 The LA sets which are (possibly) reported with each item:
1658 static void report_la(struct grammar *g, int lanum)
1660 struct symset la = set_find(g, lanum);
1664 printf(" LOOK AHEAD(%d)", lanum);
1665 for (i = 0; i < la.cnt; i++) {
1668 prtxt(g->symtab[la.syms[i]]->name);
1673 Then the go to sets:
1675 static void report_goto(struct grammar *g, struct symset gt)
1680 for (i = 0; i < gt.cnt; i++) {
1682 prtxt(g->symtab[gt.syms[i]]->name);
1683 printf(" -> %d\n", gt.data[i]);
1687 Now we can report all the item sets complete with items, LA sets, and GO TO.
1689 static void report_itemsets(struct grammar *g)
1692 printf("ITEM SETS(%d)\n", g->states);
1693 for (s = 0; s < g->states; s++) {
1695 struct itemset *is = g->statetab[s];
1696 printf(" Itemset %d:", s);
1698 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1700 for (j = 0; j < is->items.cnt; j++) {
1701 report_item(g, is->items.syms[j]);
1702 if (is->items.data != NO_DATA)
1703 report_la(g, is->items.data[j]);
1705 report_goto(g, is->go_to);
1709 ### Reporting conflicts
1711 Conflict detection varies a lot among different analysis levels. However
1712 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1713 LR1 are also similar as they have FOLLOW or LA sets.
1717 ## conflict functions
1719 static int report_conflicts(struct grammar *g, enum grammar_type type)
1722 printf("Conflicts:\n");
1724 cnt = conflicts_lr0(g, type);
1726 cnt = conflicts_slr(g, type);
1728 printf(" - no conflicts\n");
1732 LR0 conflicts occur in any state which has both a reducible item and
1733 a shiftable item, or two reducible items.
1735 LR05 conflicts only occur if two possible reductions exist,
1736 as shifts always over-ride reductions.
1738 ###### conflict functions
1739 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1743 for (i = 0; i < g->states; i++) {
1744 struct itemset *is = g->statetab[i];
1745 int last_reduce = -1;
1746 int prev_reduce = -1;
1747 int last_shift = -1;
1751 for (j = 0; j < is->items.cnt; j++) {
1752 int itm = is->items.syms[j];
1753 int p = item_prod(itm);
1754 int bp = item_dot(itm);
1755 struct production *pr = g->productions[p];
1757 if (bp == pr->body_size) {
1758 prev_reduce = last_reduce;
1762 if (pr->body[bp]->type == Terminal)
1765 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1766 printf(" State %d has both SHIFT and REDUCE:\n", i);
1767 report_item(g, is->items.syms[last_shift]);
1768 report_item(g, is->items.syms[last_reduce]);
1771 if (prev_reduce >= 0) {
1772 printf(" State %d has 2 (or more) reducible items\n", i);
1773 report_item(g, is->items.syms[prev_reduce]);
1774 report_item(g, is->items.syms[last_reduce]);
1781 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1782 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1783 in the source of the look ahead set.
1785 We build two datasets to reflect the "action" table: one which maps
1786 terminals to items where that terminal could be shifted and another
1787 which maps terminals to items that could be reduced when the terminal
1788 is in look-ahead. We report when we get conflicts between the two.
1790 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1795 for (i = 0; i < g->states; i++) {
1796 struct itemset *is = g->statetab[i];
1797 struct symset shifts = INIT_DATASET;
1798 struct symset reduce = INIT_DATASET;
1802 /* First collect the shifts */
1803 for (j = 0; j < is->items.cnt; j++) {
1804 unsigned short itm = is->items.syms[j];
1805 int p = item_prod(itm);
1806 int bp = item_dot(itm);
1807 struct production *pr = g->productions[p];
1810 if (bp >= pr->body_size ||
1811 pr->body[bp]->type != Terminal)
1816 if (s->precedence && is->precedence)
1817 /* Precedence resolves this, so no conflict */
1820 if (symset_find(&shifts, s->num) < 0)
1821 symset_add(&shifts, s->num, itm);
1823 /* Now look for reductions and conflicts */
1824 for (j = 0; j < is->items.cnt; j++) {
1825 unsigned short itm = is->items.syms[j];
1826 int p = item_prod(itm);
1827 int bp = item_dot(itm);
1828 struct production *pr = g->productions[p];
1830 if (bp < pr->body_size)
1835 la = g->follow[pr->head->num];
1837 la = set_find(g, is->items.data[j]);
1839 for (k = 0; k < la.cnt; k++) {
1840 int pos = symset_find(&shifts, la.syms[k]);
1842 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1844 prtxt(g->symtab[la.syms[k]]->name);
1846 report_item(g, shifts.data[pos]);
1847 report_item(g, itm);
1849 pos = symset_find(&reduce, la.syms[k]);
1851 symset_add(&reduce, la.syms[k], itm);
1854 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1855 prtxt(g->symtab[la.syms[k]]->name);
1857 report_item(g, itm);
1858 report_item(g, reduce.data[pos]);
1862 symset_free(shifts);
1863 symset_free(reduce);
1869 ## Generating the parser
1871 The exported part of the parser is the `parse_XX` function, where the name
1872 `XX` is based on the name of the parser files.
1874 This takes a `code_node`, a partially initialized `token_config`, and an
1875 optional `FILE` to send tracing to. The `token_config` gets the list of
1876 known words added and then is used with the `code_node` to initialize the
1879 `parse_XX` then calls the library function `parser_run` to actually complete
1880 the parse. This needs the `states` table and functions to call the various
1881 pieces of code provided in the grammar file, so they are generated first.
1883 ###### parser_generate
1885 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1886 struct code_node *pre_reduce)
1892 gen_reduce(f, g, file, pre_reduce);
1895 fprintf(f, "#line 0 \"gen_parser\"\n");
1896 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1899 fprintf(f, "\tstruct token_state *tokens;\n");
1900 fprintf(f, "\tconfig->words_marks = known;\n");
1901 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1902 fprintf(f, "\ttokens = token_open(code, config);\n");
1903 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1904 fprintf(f, "\ttoken_close(tokens);\n");
1905 fprintf(f, "\treturn rv;\n");
1906 fprintf(f, "}\n\n");
1909 ### Known words table
1911 The known words table is simply an array of terminal symbols.
1912 The table of nonterminals used for tracing is a similar array.
1916 static void gen_known(FILE *f, struct grammar *g)
1919 fprintf(f, "#line 0 \"gen_known\"\n");
1920 fprintf(f, "static const char *known[] = {\n");
1921 for (i = TK_reserved;
1922 i < g->num_syms && g->symtab[i]->type == Terminal;
1924 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1925 g->symtab[i]->name.txt);
1926 fprintf(f, "};\n\n");
1929 static void gen_non_term(FILE *f, struct grammar *g)
1932 fprintf(f, "#line 0 \"gen_non_term\"\n");
1933 fprintf(f, "static const char *non_term[] = {\n");
1934 for (i = TK_reserved;
1937 if (g->symtab[i]->type == Nonterminal ||
1938 g->symtab[i]->isspecial)
1939 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1940 g->symtab[i]->name.txt);
1941 fprintf(f, "};\n\n");
1944 ### States and the goto tables.
1946 For each state we record the goto table and details of the reducible
1947 production if there is one.
1948 Some of the details of the reducible production are stored in the
1949 `do_reduce` function to come later. Here we store the production
1950 number, the body size (useful for stack management), and the resulting
1951 symbol (useful for knowing how to free data later).
1953 The go to table is stored in a simple array of `sym` and corresponding
1956 ###### exported types
1964 const struct lookup * go_to;
1973 static void gen_goto(FILE *f, struct grammar *g)
1976 fprintf(f, "#line 0 \"gen_goto\"\n");
1977 for (i = 0; i < g->states; i++) {
1978 struct symset gt = g->statetab[i]->go_to;
1983 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1985 for (j = 0; j < gt.cnt; j++)
1986 fprintf(f, "\t{ %d, %d },\n",
1987 gt.syms[j], gt.data[j]);
1992 static void gen_states(FILE *f, struct grammar *g)
1995 fprintf(f, "#line 0 \"gen_states\"\n");
1996 fprintf(f, "static const struct state states[] = {\n");
1997 for (i = 0; i < g->states; i++) {
1998 struct itemset *is = g->statetab[i];
1999 int j, prod = -1, prod_len;
2001 for (j = 0; j < is->items.cnt; j++) {
2002 int itm = is->items.syms[j];
2003 int p = item_prod(itm);
2004 int bp = item_dot(itm);
2005 struct production *pr = g->productions[p];
2007 if (bp < pr->body_size)
2009 /* This is what we reduce - choose longest */
2010 if (prod < 0 || prod_len < pr->body_size) {
2012 prod_len = pr->body_size;
2016 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2017 i, is->go_to.cnt, i);
2019 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2021 struct production *pr = g->productions[prod];
2022 struct symbol *hd = pr->head;
2023 fprintf(f, "%d, %d, %d, ",
2024 prod, pr->body_size, pr->head->num);
2025 if (hd->struct_name.txt == NULL)
2026 fprintf(f, "0 },\n");
2028 fprintf(f, "sizeof(struct %.*s%s) },\n",
2029 hd->struct_name.len,
2030 hd->struct_name.txt,
2031 hd->isref ? "*" : "");
2033 fprintf(f, "-1, -1, -1, -1 },\n");
2035 fprintf(f, "};\n\n");
2038 ### The `do_reduce` function and the code
2040 When the parser engine decides to reduce a production, it calls
2041 `do_reduce` which runs the code that was included with the production,
2044 This code needs to be able to store data somewhere so we record the size
2045 of the data expected with each state so it can be allocated before
2046 `do_reduce` is called.
2048 In order for the code to access "global" context, we pass in the
2049 "config" pointer that was passed to the parser function. If the `struct
2050 token_config` is embedded in some larger structure, the reducing code
2051 can access the larger structure using pointer manipulation. Performing
2052 that pointer manipulation, and any other common code or variables for
2053 all reduce actions, can be provided in code node called "reduce" which
2054 is passed around in `pre_reduce` which you might have already noticed.
2056 The code fragments require translation when written out. Any `$N` needs
2057 to be converted to a reference either to that buffer (if `$0`) or to the
2058 structure returned by a previous reduction. These pointers need to be
2059 cast to the appropriate type for each access. All this is handled in
2062 `gen_code` also allows symbol references to contain a '`<`' as in
2063 '`$<2`'. This is particularly useful for references (or pointers), but
2064 can be used with structures too. The `<` implies that the value is
2065 being moved out, so the object will not be automatically freed. It is
2066 equivalent to assigning `NULL` to the pointer or filling a structure
2069 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2070 and possibly a number. A number by itself (other than zero) selects a
2071 symbol from the body of the production. A sequence of letters selects
2072 the shortest symbol in the body which contains those letters in the
2073 given order. If a number follows the letters, then a later occurrence
2074 of that symbol is chosen. So "`$AB2`" will refer to the structure
2075 attached to the second occurrence of the shortest symbol which contains
2076 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2077 the number given is too large, then the symbol reference is not
2078 transformed, and will cause an error when the code is compiled.
2082 static int textchr(struct text t, char c, int s)
2085 for (i = s; i < t.len; i++)
2091 static int subseq_match(char *seq, int slen, struct text name)
2095 st = textchr(name, *seq, st);
2105 static int choose_sym(char **namep, int len, struct production *p)
2107 char *name = *namep;
2116 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2122 while (len > 0 && (c >= '0' && c <= '9')) {
2125 n = n * 10 + (c - '0');
2129 if (name == *namep || n > p->body_size)
2135 for (i = 0; i < p->body_size; i++) {
2136 if (!subseq_match(nam, namlen, p->body[i]->name))
2138 if (slen == 0 || p->body[i]->name.len < slen) {
2140 slen = p->body[i]->name.len;
2142 if (s >= 0 && p->body[i] != p->body[s] &&
2143 p->body[i]->name.len == p->body[s]->name.len)
2144 /* not unique, so s cannot be used */
2151 for (i = 0; i < p->body_size; i++)
2152 if (p->body[i] == p->body[s]) {
2157 if (n > 0 || i == p->body_size)
2163 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2166 char *used = calloc(1, p->body_size);
2169 fprintf(f, "\t\t\t");
2170 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2184 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2193 fprintf(f, "(*(struct %.*s*%s)ret)",
2194 p->head->struct_name.len,
2195 p->head->struct_name.txt,
2196 p->head->isref ? "*":"");
2197 else if (p->body[n-1]->type == Terminal)
2198 fprintf(f, "(*(struct token *)body[%d])",
2200 else if (p->body[n-1]->struct_name.txt == NULL)
2201 fprintf(f, "$%d", n);
2203 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2204 p->body[n-1]->struct_name.len,
2205 p->body[n-1]->struct_name.txt,
2206 p->body[n-1]->isref ? "*":"", n-1);
2212 for (i = 0; i < p->body_size; i++) {
2213 if (p->body[i]->struct_name.txt &&
2215 // assume this has been copied out
2216 if (p->body[i]->isref)
2217 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2219 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2227 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2228 struct code_node *pre_reduce)
2231 fprintf(f, "#line 1 \"gen_reduce\"\n");
2232 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2234 fprintf(f, "\tint ret_size = 0;\n");
2236 code_node_print(f, pre_reduce, file);
2238 fprintf(f, "#line 4 \"gen_reduce\"\n");
2239 fprintf(f, "\tswitch(prod) {\n");
2240 for (i = 0; i < g->production_count; i++) {
2241 struct production *p = g->productions[i];
2242 fprintf(f, "\tcase %d:\n", i);
2245 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2249 if (p->head->struct_name.txt)
2250 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2251 p->head->struct_name.len,
2252 p->head->struct_name.txt,
2253 p->head->isref ? "*":"");
2255 fprintf(f, "\t\tbreak;\n");
2257 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2262 As each non-terminal can potentially cause a different type of data
2263 structure to be allocated and filled in, we need to be able to free it when
2266 It is particularly important to have fine control over freeing during error
2267 recovery where individual stack frames might need to be freed.
2269 For this, the grammar author is required to define a `free_XX` function for
2270 each structure that is used by a non-terminal. `do_free` will call whichever
2271 is appropriate given a symbol number, and will call `free` (as is
2272 appropriate for tokens) on any terminal symbol.
2276 static void gen_free(FILE *f, struct grammar *g)
2280 fprintf(f, "#line 0 \"gen_free\"\n");
2281 fprintf(f, "static void do_free(short sym, void *asn)\n");
2283 fprintf(f, "\tif (!asn) return;\n");
2284 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2285 /* Need to handle special terminals too */
2286 for (i = 0; i < g->num_syms; i++) {
2287 struct symbol *s = g->symtab[i];
2288 if (i >= g->first_nonterm && s->type == Terminal &&
2290 fprintf(f, " || sym == %d", s->num);
2292 fprintf(f, ") {\n");
2293 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2294 fprintf(f, "\tswitch(sym) {\n");
2296 for (i = 0; i < g->num_syms; i++) {
2297 struct symbol *s = g->symtab[i];
2299 s->type != Nonterminal ||
2300 s->struct_name.txt == NULL)
2303 fprintf(f, "\tcase %d:\n", s->num);
2305 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2307 s->struct_name.txt);
2308 fprintf(f, "\t\tfree(asn);\n");
2310 fprintf(f, "\t\tfree_%.*s(asn);\n",
2312 s->struct_name.txt);
2313 fprintf(f, "\t\tbreak;\n");
2315 fprintf(f, "\t}\n}\n\n");
2318 ## The main routine.
2320 There are three key parts to the "main" function of parsergen: processing
2321 the arguments, loading the grammar file, and dealing with the grammar.
2323 ### Argument processing.
2325 Command line options allow the selection of analysis mode, name of output
2326 file, and whether or not a report should be generated. By default we create
2327 a report only if no code output was requested.
2329 The `parse_XX` function name uses the basename of the output file which
2330 should not have a suffix (`.c`). `.c` is added to the given name for the
2331 code, and `.h` is added for the header (if header text is specified in the
2338 static const struct option long_options[] = {
2339 { "LR0", 0, NULL, '0' },
2340 { "LR05", 0, NULL, '5' },
2341 { "SLR", 0, NULL, 'S' },
2342 { "LALR", 0, NULL, 'L' },
2343 { "LR1", 0, NULL, '1' },
2344 { "tag", 1, NULL, 't' },
2345 { "report", 0, NULL, 'R' },
2346 { "output", 1, NULL, 'o' },
2347 { NULL, 0, NULL, 0 }
2349 const char *options = "05SL1t:Ro:";
2351 ###### process arguments
2353 char *outfile = NULL;
2358 enum grammar_type type = LR05;
2359 while ((opt = getopt_long(argc, argv, options,
2360 long_options, NULL)) != -1) {
2375 outfile = optarg; break;
2377 tag = optarg; break;
2379 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2384 infile = argv[optind++];
2386 fprintf(stderr, "No input file given\n");
2389 if (outfile && report == 1)
2392 if (name && strchr(name, '/'))
2393 name = strrchr(name, '/')+1;
2395 if (optind < argc) {
2396 fprintf(stderr, "Excess command line arguments\n");
2400 ### Loading the grammar file
2402 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2405 Once we have extracted the code (with `mdcode`) we expect to find three
2406 or four sections: header, code, grammar, reduce.
2407 Anything else that is not excluded by the `--tag` option is an error.
2409 "header", "code", and "reduce" are optional, though it is hard to build
2410 a working parser without the first two. "grammar" must be provided.
2414 #include <sys/mman.h>
2419 static void pr_err(char *msg)
2422 fprintf(stderr, "%s\n", msg);
2426 struct section *table;
2430 fd = open(infile, O_RDONLY);
2432 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2433 infile, strerror(errno));
2436 len = lseek(fd, 0, 2);
2437 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2438 table = code_extract(file, file+len, pr_err);
2440 struct code_node *hdr = NULL;
2441 struct code_node *code = NULL;
2442 struct code_node *gram = NULL;
2443 struct code_node *pre_reduce = NULL;
2444 for (s = table; s; s = s->next) {
2445 struct text sec = s->section;
2446 if (tag && !strip_tag(&sec, tag))
2448 if (text_is(sec, "header"))
2450 else if (text_is(sec, "code"))
2452 else if (text_is(sec, "grammar"))
2454 else if (text_is(sec, "reduce"))
2455 pre_reduce = s->code;
2457 fprintf(stderr, "Unknown content section: %.*s\n",
2458 s->section.len, s->section.txt);
2463 ### Processing the input
2465 As we need to append an extension to a filename and then open it for
2466 writing, and we need to do this twice, it helps to have a separate function.
2470 static FILE *open_ext(char *base, char *ext)
2472 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2474 strcat(strcpy(fn, base), ext);
2480 If we can read the grammar, then we analyse and optionally report on it. If
2481 the report finds conflicts we will exit with an error status.
2483 ###### process input
2484 struct grammar *g = NULL;
2486 fprintf(stderr, "No grammar section provided\n");
2489 g = grammar_read(gram);
2491 fprintf(stderr, "Failure to parse grammar\n");
2496 grammar_analyse(g, type);
2498 if (grammar_report(g, type))
2502 If a "headers" section is defined, we write it out and include a declaration
2503 for the `parse_XX` function so it can be used from separate code.
2505 if (rv == 0 && hdr && outfile) {
2506 FILE *f = open_ext(outfile, ".h");
2508 code_node_print(f, hdr, infile);
2509 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2513 fprintf(stderr, "Cannot create %s.h\n",
2519 And if all goes well and an output file was provided, we create the `.c`
2520 file with the code section (if any) and the parser tables and function.
2522 if (rv == 0 && outfile) {
2523 FILE *f = open_ext(outfile, ".c");
2526 code_node_print(f, code, infile);
2527 gen_parser(f, g, infile, name, pre_reduce);
2530 fprintf(stderr, "Cannot create %s.c\n",
2536 And that about wraps it up. We need to set the locale so that UTF-8 is
2537 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2539 ###### File: parsergen.mk
2540 parsergen : parsergen.o libscanner.o libmdcode.o
2541 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2548 int main(int argc, char *argv[])
2553 setlocale(LC_ALL,"");
2555 ## process arguments
2562 ## The SHIFT/REDUCE parser
2564 Having analysed the grammar and generated all the tables, we only need
2565 the shift/reduce engine to bring it all together.
2567 ### Goto table lookup
2569 The parser generator has nicely provided us with goto tables sorted by
2570 symbol number. We need a binary search function to find a symbol in the
2573 ###### parser functions
2575 static int search(const struct state *l, int sym)
2578 int hi = l->go_to_cnt;
2582 while (lo + 1 < hi) {
2583 int mid = (lo + hi) / 2;
2584 if (l->go_to[mid].sym <= sym)
2589 if (l->go_to[lo].sym == sym)
2590 return l->go_to[lo].state;
2595 ### Memory allocation
2597 The `scanner` returns tokens in a local variable - we want them in allocated
2598 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2599 make an allocated copy as required.
2601 ###### parser includes
2604 ###### parser functions
2606 static struct token *tok_copy(struct token tk)
2608 struct token *new = malloc(sizeof(*new));
2613 ### The state stack.
2615 The core data structure for the parser is the stack. This tracks all
2616 the symbols that have been recognised or partially recognised.
2618 The stack usually won't grow very large - maybe a few tens of entries.
2619 So we dynamically grow an array as required but never bother to
2620 shrink it down again.
2622 We keep the stack as two separate allocations. One, `asn_stack`, stores
2623 the "abstract syntax nodes" which are created by each reduction. When
2624 we call `do_reduce` we need to pass an array of the `asn`s of the body
2625 of the production, and by keeping a separate `asn` stack, we can just
2626 pass a pointer into this stack.
2628 The other allocation stores all other stack fields of which there are
2629 two. The `state` is the most important one and guides the parsing
2630 process. The `sym` is nearly unnecessary as it is implicit in the
2631 state. However when we want to free entries from the `asn_stack`, it
2632 helps to know what type they are so we can call the right freeing
2633 function. The symbol leads us to the right free function through
2636 The stack is most properly seen as alternating states and symbols -
2637 states, like the 'DOT' in items, are between symbols. Each frame in
2638 our stack holds a state and the symbol that was before it. The
2639 bottom of stack holds the start state but no symbol, as nothing came
2640 before the beginning. As we need to store some value, `TK_eof` is used
2641 to mark the beginning of the file as well as the end.
2643 ###### parser functions
2659 Two operations are needed on the stack - shift (which is like push) and pop.
2661 Shift applies not only to terminals but also to non-terminals. When we
2662 reduce a production we will pop off frames corresponding to the body
2663 symbols, then push on a frame for the head of the production. This last
2664 is exactly the same process as shifting in a terminal so we use the same
2665 function for both. In both cases we provide the symbol. The state is
2666 deduced from the current top-of-stack state and the new symbol.
2668 To simplify other code we arrange for `shift` to fail if there is no `goto`
2669 state for the symbol. This is useful in basic parsing due to our design
2670 that we shift when we can, and reduce when we cannot. So the `shift`
2671 function reports if it could.
2673 `shift` is also used to push state zero onto the stack, so if the
2674 stack is empty, it always chooses zero as the next state.
2676 So `shift` finds the next state. If that succeeds it extends the
2677 allocations if needed and pushes all the information onto the stacks.
2679 An extra complication is added to `shift` by the `EOL` token. This
2680 token must be generated when a `NEWLINE` is seen, but an `EOL` is
2681 expected. When this happens, the `NEWLINE` is NOT consumed, so multiple
2682 EOL can appear before a NEWLINE. To indicate that the token was shifted
2683 but not consumed, `shift` can return the special value `2`. The token
2684 number for `EOL` cannot be statically declared, so when the parser
2685 starts we need to look through the array of non-terminals to find the
/* EOL is declared in the grammar, so its token number is only
 * known at run time: scan the non_term[] names for "EOL", then
 * offset the index into full symbol-number space.
 * NOTE(review): initialisation and increment of p.tk_eol are on
 * elided lines -- confirm. */
2693 while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2695 p.tk_eol += TK_reserved + config->known_count;
2698 ###### parser functions
/* Push symbol `sym` (with its abstract-syntax-node `asn`) onto
 * the parser stacks, deducing the new state from the current
 * top-of-stack state via search().  Per the prose above it
 * returns 0 when no goto state exists, and can return 2 when an
 * EOL is shifted in place of an unconsumed NEWLINE.
 * NOTE(review): many lines are elided in this excerpt; comments
 * describe only the visible fragments. */
2700 static int shift(struct parser *p,
2701 short sym, void *asn,
2702 const struct state states[])
2704 struct frame next = {0};
/* An empty stack always shifts into state zero (the start state). */
2706 int newstate = p->tos
2707 ? search(&states[p->stack[p->tos-1].state],
2712 else if (sym != TK_newline)
2715 // have a NEWLINE, might need an EOL
/* Retry the goto lookup with the run-time EOL token number. */
2718 ? search(&states[p->stack[p->tos-1].state],
/* Tokens arrive in caller-local storage; heap-copy the asn so it
 * can outlive the caller. */
2724 asn = tok_copy(*(struct token*)asn);
/* Grow both stacks together -- they always share one depth. */
2727 if (p->tos >= p->stack_size) {
2728 p->stack_size += 10;
/* NOTE(review): realloc result overwrites the original pointer;
 * on failure the old allocation is lost (CERT MEM-style issue). */
2729 p->stack = realloc(p->stack, p->stack_size
2730 * sizeof(p->stack[0]));
2731 p->asn_stack = realloc(p->asn_stack, p->stack_size
2732 * sizeof(p->asn_stack[0]));
2735 next.state = newstate;
2737 p->stack[p->tos] = next;
2738 p->asn_stack[p->tos] = asn;
2743 `pop` primarily moves the top of stack (`tos`) back down the required
2744 amount and frees any `asn` entries that need to be freed. It is called
2745 _after_ we reduce a production, just before we `shift` the nonterminal
2748 ###### parser functions
/* Remove `num` frames from the stacks, handing each frame's asn
 * (with its symbol, which selects the right free routine) to
 * `do_free`.  Called after a reduction, just before shifting the
 * production's head symbol.
 * NOTE(review): an elided line presumably moves p->tos down by
 * `num` before this loop (the loop indexes p->tos+i) -- confirm. */
2750 static void pop(struct parser *p, int num,
2751 void(*do_free)(short sym, void *asn))
2756 for (i = 0; i < num; i++)
2757 do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2760 ### The heart of the parser.
2762 Now we have the parser. For each token we might shift it, trigger a
2763 reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL)
2764 might also be ignored. Ignoring tokens is combined with shifting.
2768 struct parser p = { 0 };
2769 struct token *tk = NULL;
2772 ###### heart of parser
/* Main parse loop: prime the stack with TK_eof in the start
 * state, then repeatedly shift, reduce, or recover until the
 * grammar accepts or the stack empties. */
2774 shift(&p, TK_eof, NULL, states);
2775 while (!accepted && p.tos > 0) {
2776 struct frame *tos = &p.stack[p.tos-1];
/* Fetch the next look-ahead (presumably only when the previous
 * one was consumed -- guard is on an elided line; confirm). */
2778 tk = tok_copy(token_next(tokens));
2779 parser_trace(trace, &p,
2780 tk, states, non_term, config->known_count);
2782 ## try shift or ignore
2787 Indents are ignored unless they can be shifted onto the stack
2788 immediately. The end of an indented section - the OUT token - is
2789 ignored precisely when the indent was ignored. To keep track of this we
2790 need a small stack of flags, which is easily stored as bits in an
2791 `unsigned long`. This will never overflow and the scanner only allows
2792 20 levels of indentation.
2795 unsigned long ignored_indents;
2797 NEWLINE/EOL is ignored when in an indented section of text which was not
2798 explicitly expected by the grammar. So if the most recent indent is
2799 ignored, so is any EOL token.
2801 For other tokens, we shift the next token if that is possible, otherwise
2802 we try to reduce a production.
2804 ###### try shift or ignore
/* `ignored_indents` is a bit-stack: bit 0 describes the most
 * recent indent (1 = the indent was ignored), so OUT and NEWLINE
 * inside an ignored indent are ignored too (see prose above). */
2806 if ((tk->num == TK_newline || tk->num == TK_out) &&
2807 (p.ignored_indents & 1)) {
2808 /* indented, so ignore OUT and NEWLINE */
/* Leaving an ignored indent: pop its flag off the bit-stack. */
2809 if (tk->num == TK_out)
2810 p.ignored_indents >>= 1;
2813 parser_trace_action(trace, "Ignore");
2817 switch (shift(&p, tk->num, tk, states)) {
2819 if (tk->num == TK_out)
2820 p.ignored_indents >>= 1;
/* A shifted IN pushes a 0 flag: this indent was NOT ignored. */
2821 if (tk->num == TK_in)
2822 p.ignored_indents <<= 1;
/* NOTE(review): tk appears to remain non-NULL only on the
 * "shifted but not consumed" EOL path (shift() returned 2);
 * the clearing happens on elided lines -- confirm. */
2827 parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
2832 if (tk->num == TK_in) {
2833 /* No indent expected here, so ignore IN */
/* Push a 1 flag: this indent and its matching OUT are ignored. */
2836 p.ignored_indents <<= 1;
2837 p.ignored_indents |= 1;
2838 parser_trace_action(trace, "Ignore");
2842 We have already discussed the bulk of the handling of a "reduce" action,
2843 with the `pop()` and `shift()` functions doing much of the work. There
2844 is a little more complexity needed to manage storage for the asn (Abstract
2845 Syntax Node), and also a test of whether the reduction is permitted.
2847 When we try to shift the result of reducing production-zero, it will
2848 fail because there is no next state. In this case the asn will not have
2849 been stored on the stack, so it gets stored in the `ret` variable, and we
2850 report that the input has been accepted.
/* REDUCE: this state mandates reducing production `prod`. */
2858 if (states[tos->state].reduce_prod >= 0) {
2861 const struct state *nextstate = &states[tos->state];
2862 int prod = nextstate->reduce_prod;
2863 int size = nextstate->reduce_size;
2864 int res_size = nextstate->result_size;
/* The body's asns are the top `size` asn_stack entries; pass a
 * pointer into the stack instead of copying them (see prose). */
2866 body = p.asn_stack + (p.tos - size);
2867 res = res_size ? calloc(1, res_size) : NULL;
2868 res_size = do_reduce(prod, body, config, res);
/* Sanity check: generated code and tables must agree on the
 * result size. */
2869 if (res_size != nextstate->result_size)
2871 pop(&p, size, do_free);
/* Shifting the head fails only for production zero -- that is
 * the accept case; the asn ends up in `ret` (see prose above). */
2872 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2875 parser_trace_action(trace, "Accept");
2877 parser_trace_action(trace, "Reduce");
2881 If we can neither shift nor reduce we have an error to handle. There
2882 are two possible responses to an error: we can pop single frames off the
2883 stack until we find a frame that can shift `TK_error`, or we can discard
2884 the current look-ahead symbol.
2886 When we first see an error we do the first of these and set a flag to
2887 record that we are processing an error. If the normal shift/reduce
2888 tests fail to find that anything can be done from that state, we start
2889 dropping tokens until either we manage to shift one, or reach end-of-file.
/* Error recovery, stage two: discard look-ahead tokens until one
 * can be shifted, giving up entirely at end-of-file. */
2902 parser_trace_action(trace, "DISCARD");
2903 if (tk->num == TK_eof)
2908 struct token *err_tk;
2910 parser_trace_action(trace, "ERROR");
/* Error recovery, stage one: synthesise a TK_error token at the
 * offending token's position, then pop frames until some state
 * can shift it. */
2912 err_tk = tok_copy(*tk);
2913 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2914 // discard this state
2915 pop(&p, 1, do_free);
2918 // no state accepted TK_error
2924 ###### parser includes
/* Definition of parser_run: builds a `struct parser`, runs the
 * heart-of-parser loop (elided in this excerpt), then drains any
 * remaining stack frames before returning the result. */
2929 void *parser_run(struct token_state *tokens,
2930 const struct state states[],
2931 int (*do_reduce)(int, void**, struct token_config*, void*),
2932 void (*do_free)(short, void*),
2933 FILE *trace, const char *non_term[],
2934 struct token_config *config)
/* Free whatever is left on the stack so nothing leaks. */
2943 pop(&p, p.tos, do_free);
2949 ###### exported functions
/* Run the LR parser engine over `tokens` using the generated
 * `states` tables.  `do_reduce` executes a production's code
 * fragment, `do_free` releases an asn by symbol type, and a
 * non-NULL `trace` enables per-token stack tracing. */
2950 void *parser_run(struct token_state *tokens,
2951 const struct state states[],
2952 int (*do_reduce)(int, void**, struct token_config*, void*),
2953 void (*do_free)(short, void*),
2954 FILE *trace, const char *non_term[],
2955 struct token_config *config);
2959 Being able to visualize the parser in action can be invaluable when
2960 debugging the parser code, or trying to understand how the parse of a
2961 particular grammar progresses. The stack contains all the important
2962 state, so just printing out the stack every time around the parse loop
2963 can make it possible to see exactly what is happening.
2965 This doesn't explicitly show each SHIFT and REDUCE action. However they
2966 are easily deduced from the change between consecutive lines, and the
2967 details of each state can be found by cross referencing the states list
2968 in the stack with the "report" that parsergen can generate.
2970 For terminal symbols, we just dump the token. For non-terminals we
2971 print the name of the symbol. The look ahead token is reported at the
2972 end inside square brackets.
2974 ###### parser functions
/* Printable names for the statically-numbered reserved tokens,
 * indexed by token number; used by parser_trace.  (Further
 * entries exist on elided lines.) */
2976 static char *reserved_words[] = {
2977 [TK_error] = "ERROR",
2980 [TK_newline] = "NEWLINE",
/* Dump the parser stack to `trace`: each frame as "symbol (state)",
 * followed by the look-ahead token in square brackets with its
 * line:col position (see prose above). */
2984 void parser_trace(FILE *trace, struct parser *p,
2985 struct token *tk, const struct state states[],
2986 const char *non_term[], int knowns)
2991 for (i = 0; i < p->tos; i++) {
2992 struct frame *f = &p->stack[i];
/* Reserved tokens print by name... */
2995 if (sym < TK_reserved &&
2996 reserved_words[sym] != NULL)
2997 fputs(reserved_words[sym], trace);
/* ...known terminals dump their source text (truncated to 20)... */
2998 else if (sym < TK_reserved + knowns) {
2999 struct token *t = p->asn_stack[i];
3000 text_dump(trace, t->txt, 20);
/* ...and non-terminals print their grammar name. */
3002 fputs(non_term[sym - TK_reserved - knowns],
3006 fprintf(trace, "(%d) ", f->state);
/* Finish with the look-ahead token in [brackets]. */
3008 fprintf(trace, "[");
3009 if (tk->num < TK_reserved &&
3010 reserved_words[tk->num] != NULL)
3011 fputs(reserved_words[tk->num], trace);
3013 text_dump(trace, tk->txt, 20);
3014 fprintf(trace, ":%d:%d]", tk->line, tk->col);
/* Append the chosen action name ("Shift", "Reduce", "Ignore", ...)
 * to the trace line started by parser_trace. */
3017 void parser_trace_action(FILE *trace, char *action)
3020 fprintf(trace, " - %s\n", action);
3025 The obvious example for a parser is a calculator.
3027 As `scanner` provides a number parsing function using `libgmp` it is not much
3028 work to perform arbitrary rational number calculations.
3030 This calculator takes one expression, or an equality test, per line.
3031 The results are printed and if any equality test fails, the program
3032 exits with an error.
3034 ###### File: parsergen.mk
# Extract the "calc" grammar from this document and generate the
# calculator's parser sources.
3035 calc.c calc.h : parsergen parsergen.mdc
3036 ./parsergen --tag calc -o calc parsergen.mdc
# Link the calculator against the parser runtime, scanner, mdcode
# and number libraries (ICU and GMP are required by those).
3037 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3038 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3040 ./calc parsergen.mdc
3045 #include "parse_number.h"
3046 // what do we use for a demo-grammar? A calculator of course.
3058 #include <sys/mman.h>
3064 #include "scanner.h"
/* Release the storage held inside a calculator value (presumably
 * the GMP rational -- body is elided in this excerpt; confirm). */
3069 static void free_number(struct number *n)
/* Compare a length-counted `struct text` with a NUL-terminated C
 * string for exact equality: lengths must match and all bytes
 * must agree. */
3075 static int text_is(struct text t, char *s)
3077 return (strlen(s) == t.len &&
3078 strncmp(s, t.txt, t.len) == 0);
/* Calculator demo: mmap the literate document named in argv[1],
 * extract its code sections, and parse every "example: input"
 * section.  A second argument enables tracing to stderr.
 * NOTE(review): open/lseek/mmap results are unchecked, and `2` is
 * SEEK_END -- acceptable for a demo, but worth confirming. */
3081 int main(int argc, char *argv[])
3083 int fd = open(argv[1], O_RDONLY);
3084 int len = lseek(fd, 0, 2);
3085 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3086 struct section *table = code_extract(file, file+len, NULL);
3088 struct token_config config = {
/* Skip line comments while tokenising the example input. */
3089 .ignored = (1 << TK_line_comment)
3092 .number_chars = ".,_+-",
/* Parse every section titled "example: input". */
3096 for (s = table; s; s = s->next)
3097 if (text_is(s->section, "example: input"))
3098 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
/* Free the extracted section list (continues on elided lines). */
3100 struct section *t = table->next;
3101 code_free(table->code);
3113 Session -> Session Line
3116 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3117 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3118 gmp_printf(" or as a decimal: %Fg\n", fl);
3122 | Expression = Expression NEWLINE ${
3123 if (mpq_equal($1.val, $3.val))
3124 gmp_printf("Both equal %Qd\n", $1.val);
3126 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3131 | NEWLINE ${ printf("Blank line\n"); }$
3132 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3135 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3136 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3137 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3138 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3139 | Expression // Expression ${ {
3142 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3143 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3144 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3145 mpz_tdiv_q(z0, z1, z2);
3146 mpq_set_z($0.val, z0);
3147 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3149 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3150 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3155 3.1415926535 - 355/113
3157 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3159 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1