1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
9 There are several distinct sections.
11 1. `grammar_read` will read a grammar from a literate-code file and
12 store the productions, symbols, and code fragments.
13 2. `grammar_analyse` will take that grammar and build LR parsing
14 states and look-ahead tables.
15 3. `grammar_report` will present the details of the analysis
16 in a readable format and will report any conflicts.
17 4. `parser_generate` will write out C code files with various
18 tables and with the code fragments arranged in useful places.
19 5. `parser_run` is a library function intended to be linked together
20 with the generated parser tables to complete the implementation of
22 6. Finally `calc` is a test grammar for a simple calculator. The
23 `parsergen` program built from the C code in this file can extract
24 that grammar directly from this file and process it.
26 ###### File: parsergen.c
31 ## forward declarations
42 ###### File: libparser.c
49 ###### File: parsergen.mk
52 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
55 ## Reading the grammar
57 The grammar must be stored in a markdown literate code file as parsed by
58 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
59 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
60 present, will be literally copied into the generated `.c` and `.h`
61 files. The third is required and contains the grammar which is
62 tokenised with "[scanner][]". `reduce` can provide variable
63 declarations and common initialization code for all the reduction code
64 fragments in the grammar.
66 If the `--tag` option is given, then any top level header that doesn't
67 start with the tag is ignored, and the tag is stripped from the rest. So
68 `--tag Foo` means that the four sections expected are `Foo: header`,
69 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
70 to extract the sample calculator from this file.
73 [scanner]: scanner.html
79 ###### parser includes
83 The grammar contains production sets, precedence/associativity
84 declarations, general terminal declarations, and data type declarations.
85 These are all parsed with _ad hoc_ parsing as we don't have a parser
88 The precedence and associativity can be set for each production, and
89 can be inherited from symbols. The data type (either structure or a
90 reference to a structure) is potentially defined for each non-terminal
91 and describes what C structure is used to store information about each
95 enum assoc {Left, Right, Non};
96 char *assoc_names[] = {"Left","Right","Non"};
99 struct text struct_name;
102 unsigned short precedence;
106 unsigned short precedence;
114 The strings reported by `mdcode` and `scanner` are `struct text` which
115 have length rather than being nul terminated. To help with printing and
116 comparing we define `text_is` and `prtxt`, which should possibly go in
117 `mdcode`. `scanner` does provide `text_dump` which is useful for
118 strings which might contain control characters.
120 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
121 because that is what we need to detect tags.
124 static int text_is(struct text t, char *s)
126 return (strlen(s) == t.len &&
127 strncmp(s, t.txt, t.len) == 0);
129 static void prtxt(struct text t)
131 printf("%.*s", t.len, t.txt);
134 static int strip_tag(struct text *t, char *tag)
136 int skip = strlen(tag) + 1;
137 if (skip >= t->len ||
138 strncmp(t->txt, tag, skip-1) != 0 ||
139 t->txt[skip-1] != ':')
141 while (skip < t->len && t->txt[skip] == ' ')
150 Productions are comprised primarily of symbols - terminal and
151 non-terminal. We do not make a syntactic distinction between the two,
152 though non-terminals must be identifiers. Non-terminal symbols are
153 those which appear in the head of a production, terminal symbols are
154 those which don't. There are also "virtual" symbols used for precedence
155 marking discussed later, and sometimes we won't know what type a symbol
158 To help with code safety it is possible to declare the terminal symbols.
159 If this is done, then any symbol used in a production that does not
160 appear in the head of any production and is not declared as a terminal
161 is treated as an error.
163 ###### forward declarations
164 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
165 char *symtypes = "UVTN";
168 ###### grammar fields
169 int terminals_declared;
171 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
172 table of known symbols and the resulting parser will report them as
173 `TK_reserved + N`. A small set of identifiers are reserved for the
174 different token types that `scanner` can report.
178 static struct { int num; char *name; } reserved_words[] = {
179 { TK_error, "ERROR" },
180 { TK_number, "NUMBER" },
181 { TK_ident, "IDENTIFIER" },
183 { TK_string, "STRING" },
184 { TK_multi_string, "MULTI_STRING" },
187 { TK_newline, "NEWLINE" },
193 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
194 recognised. The former is automatically expected at the end of the text
195 being parsed. The latter are always ignored by the parser.
197 All of these symbols are stored in a simple symbol table. We use the
198 `struct text` from `mdcode` to store the name and link them together
199 into a sorted list using an insertion sort.
201 We don't have separate `find` and `insert` functions as any symbol we
202 find needs to be remembered. We simply expect `find` to always return a
203 symbol, but its type might be `Unknown`.
212 ###### grammar fields
217 static struct symbol *sym_find(struct grammar *g, struct text s)
219 struct symbol **l = &g->syms;
224 (cmp = text_cmp((*l)->name, s)) < 0)
228 n = calloc(1, sizeof(*n));
237 static void symbols_init(struct grammar *g)
239 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
241 for (i = 0; i < entries; i++) {
244 t.txt = reserved_words[i].name;
245 t.len = strlen(t.txt);
248 s->num = reserved_words[i].num;
252 ### Data types and precedence.
254 Data type specification, precedence specification, and declaration of
255 terminals are all introduced by a dollar sign at the start of the line.
256 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
257 precedence, if it is `TERM` then the line declares terminals without
258 precedence, otherwise it specifies a data type.
260 The data type name is simply stored and applied to the head of all
261 subsequent productions. It must be the name of a structure optionally
262 preceded by an asterisk which means a reference or "pointer". So
263 `$expression` maps to `struct expression` and `$*statement` maps to
264 `struct statement *`.
266 Any productions given before the first data type declaration will have
267 no data type associated with them and can carry no information. In
268 order to allow other non-terminals to have no type, the data type
269 `$void` can be given. This does *not* mean that `struct void` will be
270 used, but rather that no type will be associated with subsequent
273 The precedence line must contain a list of symbols, either terminal
274 symbols or virtual symbols. It can only contain symbols that have not
275 been seen yet, so precedence declaration must precede the productions
278 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
279 carry precedence information but are not included in the grammar. A
280 production can declare that it inherits the precedence of a given
283 This use for `$$` precludes it from being used as a symbol in the
284 described language. Two other symbols: `${` and `}$` are also
287 Each new precedence line introduces a new precedence level and
288 declares how it associates. This level is stored in each symbol
289 listed and may be inherited by any production which uses the symbol. A
290 production inherits from the last symbol which has a precedence.
292 The symbols on the first precedence line have the lowest precedence.
293 Subsequent lines introduce symbols with higher precedence and so bind
296 Note that a declaration line (or "dollar line") can start with either of
297 two different marks: "$" or "$*". The `dollar_line()` function defined
298 here is told which was found in the `isref` argument.
300 ###### grammar fields
301 struct text current_type;
306 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
307 static const char *known[] = { "$$", "${", "}$" };
310 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
312 struct token t = token_next(ts);
318 if (t.num != TK_ident) {
319 err = "type or assoc expected after '$'";
322 if (text_is(t.txt, "LEFT"))
324 else if (text_is(t.txt, "RIGHT"))
326 else if (text_is(t.txt, "NON"))
328 else if (text_is(t.txt, "TERM")) {
330 g->terminals_declared = 1;
332 g->current_type = t.txt;
333 g->type_isref = isref;
334 if (text_is(t.txt, "void"))
335 g->current_type.txt = NULL;
337 if (t.num != TK_newline) {
338 err = "Extra tokens after type name";
345 err = "$* cannot be followed by a precedence";
349 // This is a precedence or TERM line, need some symbols.
353 while (t.num != TK_newline) {
354 enum symtype type = Terminal;
356 if (t.num == TK_virtual) {
359 if (t.num != TK_ident) {
360 err = "$$ must be followed by a word";
364 err = "Virtual symbols not permitted on $TERM line";
367 } else if (t.num != TK_ident &&
369 err = "Illegal token in precedence line";
372 s = sym_find(g, t.txt);
373 if (s->type != Unknown) {
374 err = "Symbols in precedence/TERM line must not already be known.";
379 s->precedence = g->prec_levels;
386 err = "No symbols given on precedence/TERM line";
390 while (t.num != TK_newline && t.num != TK_eof)
397 A production either starts with an identifier which is the head
398 non-terminal, or a vertical bar (`|`) in which case this production
399 uses the same head as the previous one. The identifier must be
400 followed by a `->` mark. All productions for a given non-terminal must
401 be grouped together with the `nonterminal ->` given only once.
403 After this a (possibly empty) sequence of identifiers and marks form
404 the body of the production. A virtual symbol may be given after the
405 body by preceding it with `$$`. If a virtual symbol is given, the
406 precedence of the production is that for the virtual symbol. If none
407 is given, the precedence is inherited from the last symbol in the
408 production which has a precedence specified.
410 After the optional precedence may come the `${` mark. This indicates
411 the start of a code fragment. If present, this must be on the same
412 line as the start of the production.
414 All of the text from the `${` through to the matching `}$` is
415 collected and forms the code-fragment for the production. It must all
416 be in one `code_node` of the literate code. The `}$` must be
417 at the end of a line.
419 Text in the code fragment will undergo substitutions where `$N` or
420 `$<N`, for some numeric `N` (or non-numeric indicator as described
421 later), will be replaced with a variable holding the parse information
422 for the particular symbol in the production. `$0` is the head of the
423 production, `$1` is the first symbol of the body, etc. The type of `$N`
424 for a terminal symbol is `struct token`. For a non-terminal, it is
425 whatever has been declared for that symbol. The `<` may be included and
426 means that the value (usually a reference) is being moved out, so it
427 will not automatically be freed. The effect of using '<' is that the
428 variable is cleared to all-zeros.
430 While building productions we will need to add to an array which needs to
434 static void array_add(void *varray, int *cnt, void *new)
436 void ***array = varray;
439 current = ((*cnt-1) | (step-1))+1;
440 if (*cnt == current) {
443 *array = realloc(*array, current * sizeof(void*));
445 (*array)[*cnt] = new;
449 Collecting the code fragment simply involves reading tokens until we
450 hit the end token `}$`, and noting the character position of the start and
454 static struct text collect_code(struct token_state *state,
459 code.txt = start.txt.txt + start.txt.len;
461 t = token_next(state);
462 while (t.node == start.node &&
463 t.num != TK_close && t.num != TK_error &&
465 if (t.num == TK_close && t.node == start.node)
466 code.len = t.txt.txt - code.txt;
472 Now we have all the bits we need to parse a full production.
477 ###### grammar fields
478 struct production **productions;
479 int production_count;
481 ###### production fields
483 struct symbol **body;
489 int first_production;
492 static char *parse_production(struct grammar *g,
494 struct token_state *state)
496 /* Head has already been parsed. */
499 struct production p, *pp;
501 memset(&p, 0, sizeof(p));
503 tk = token_next(state);
504 while (tk.num == TK_ident || tk.num == TK_mark) {
505 struct symbol *bs = sym_find(g, tk.txt);
506 if (bs->type == Unknown) {
507 if (!g->terminals_declared)
510 if (bs->type == Virtual) {
511 err = "Virtual symbol not permitted in production";
514 if (bs->precedence) {
515 p.precedence = bs->precedence;
518 array_add(&p.body, &p.body_size, bs);
519 tk = token_next(state);
521 if (tk.num == TK_virtual) {
523 tk = token_next(state);
524 if (tk.num != TK_ident) {
525 err = "word required after $$";
528 vs = sym_find(g, tk.txt);
529 if (vs->precedence == 0) {
530 err = "symbol after $$ must have precedence";
533 p.precedence = vs->precedence;
536 tk = token_next(state);
538 if (tk.num == TK_open) {
539 p.code_line = tk.line;
540 p.code = collect_code(state, tk);
541 if (p.code.txt == NULL) {
542 err = "code fragment not closed properly";
545 tk = token_next(state);
548 if (tk.num != TK_newline && tk.num != TK_eof) {
549 err = "stray tokens at end of line";
552 pp = malloc(sizeof(*pp));
554 array_add(&g->productions, &g->production_count, pp);
557 while (tk.num != TK_newline && tk.num != TK_eof)
558 tk = token_next(state);
563 With the ability to parse productions and dollar-lines, we have nearly all
564 that we need to parse a grammar from a `code_node`.
566 The head of the first production will effectively be the `start` symbol
567 of the grammar. However it won't _actually_ be so. Processing the
568 grammar is greatly simplified if the real start symbol only has a single
569 production, and expects `$eof` as the final terminal. So when we find
570 the first explicit production we insert an extra implicit production as
571 production zero which looks like
573 ###### Example: production 0
576 where `START` is the first non-terminal given.
578 ###### create production zero
579 struct production *p = calloc(1,sizeof(*p));
580 struct text start = {"$start",6};
581 struct text eof = {"$eof",4};
582 struct text code = {"$0 = $<1;", 9};
583 p->head = sym_find(g, start);
584 p->head->type = Nonterminal;
585 p->head->struct_name = g->current_type;
586 p->head->isref = g->type_isref;
587 if (g->current_type.txt)
589 array_add(&p->body, &p->body_size, head);
590 array_add(&p->body, &p->body_size, sym_find(g, eof));
591 p->head->first_production = g->production_count;
592 array_add(&g->productions, &g->production_count, p);
594 Now we are ready to read in the grammar. We ignore comments
595 and strings so that the marks which introduce them can be
596 used as terminals in the grammar. We don't ignore numbers
597 even though we don't allow them as that causes the scanner
598 to produce errors that the parser is better positioned to handle.
599 We also ignore indentation, but we expect and use newlines.
601 To allow comments in the grammar, we explicitly check for "//" as the
602 first token on the line and those lines are skipped. "//" can still be
603 used as a terminal anywhere that a terminal is expected.
606 static struct grammar *grammar_read(struct code_node *code)
608 struct token_config conf = {
611 .words_marks = known,
612 .known_count = sizeof(known)/sizeof(known[0]),
614 .ignored = (1 << TK_line_comment)
615 | (1 << TK_block_comment)
618 | (1 << TK_multi_string)
623 struct token_state *state = token_open(code, &conf);
625 struct symbol *head = NULL;
629 g = calloc(1, sizeof(*g));
632 for (tk = token_next(state); tk.num != TK_eof;
633 tk = token_next(state)) {
634 if (tk.num == TK_newline)
636 if (tk.num == TK_ident) {
638 head = sym_find(g, tk.txt);
639 if (head->type == Nonterminal)
640 err = "This non-terminal has already be used.";
641 else if (head->type == Virtual)
642 err = "Virtual symbol not permitted in head of production";
644 head->type = Nonterminal;
645 head->struct_name = g->current_type;
646 head->isref = g->type_isref;
647 if (g->production_count == 0) {
648 ## create production zero
650 head->first_production = g->production_count;
651 tk = token_next(state);
652 if (tk.num == TK_mark &&
653 text_is(tk.txt, "->"))
654 err = parse_production(g, head, state);
656 err = "'->' missing in production";
658 } else if (tk.num == TK_mark
659 && text_is(tk.txt, "|")) {
660 // another production for same non-term
662 err = parse_production(g, head, state);
664 err = "First production must have a head";
665 } else if (tk.num == TK_mark
666 && text_is(tk.txt, "$")) {
667 err = dollar_line(state, g, 0);
668 } else if (tk.num == TK_mark
669 && text_is(tk.txt, "$*")) {
670 err = dollar_line(state, g, 1);
671 } else if (tk.num == TK_mark
672 && text_is(tk.txt, "//")) {
673 while (tk.num != TK_newline &&
675 tk = token_next(state);
677 err = "Unrecognised token at start of line.";
683 if (g->terminals_declared) {
686 for (s = g->syms; s; s = s->next) {
687 if (s->type != Unknown)
690 fprintf(stderr, "Token %.*s not declared\n",
691 s->name.len, s->name.txt);
694 free(g); // FIXME free content
700 fprintf(stderr, "Error at line %d: %s\n",
703 free(g); // FIXME free content
707 ## Analysing the grammar
709 The central task in analysing the grammar is to determine a set of
710 states to drive the parsing process. This will require finding various
711 sets of symbols and of "items". Some of these sets will need to attach
712 information to each element in the set, so they are more like maps.
714 Each "item" may have a 'look-ahead' or `LA` set associated with it.
715 Many of these will be the same as each other. To avoid memory wastage,
716 and to simplify some comparisons of sets, these sets will be stored in a
717 list of unique sets, each assigned a number.
719 Once we have the data structures in hand to manage these sets and lists,
720 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
721 then create the item sets which define the various states.
725 Though we don't only store symbols in these sets, they are the main
726 things we store, so they are called symbol sets or "symsets".
728 A symset has a size, an array of shorts, and an optional array of data
729 values, which are also shorts. If the array of data is not present, we
730 store an impossible pointer, as `NULL` is used to indicate that no
731 memory has been allocated yet.
736 unsigned short *syms, *data;
738 #define NO_DATA ((unsigned short *)1)
739 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
740 const struct symset INIT_DATASET = { 0, NULL, NULL };
742 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
743 using an insertion sort. We don't explicitly record the amount of
744 allocated space as it can be derived directly from the current `cnt`
745 using `((cnt - 1) | 7) + 1`.
748 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
751 int current = ((s->cnt-1) | 7) + 1;
752 if (current == s->cnt) {
754 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
755 if (s->data != NO_DATA)
756 s->data = realloc(s->data, sizeof(*s->data) * current);
759 while (i > 0 && s->syms[i-1] > key) {
760 s->syms[i] = s->syms[i-1];
761 if (s->data != NO_DATA)
762 s->data[i] = s->data[i-1];
766 if (s->data != NO_DATA)
771 Finding a symbol (or item) in a `symset` uses a simple binary search.
772 We return the index where the value was found (so data can be accessed),
773 or `-1` to indicate failure.
775 static int symset_find(struct symset *ss, unsigned short key)
782 while (lo + 1 < hi) {
783 int mid = (lo + hi) / 2;
784 if (ss->syms[mid] <= key)
789 if (ss->syms[lo] == key)
794 We will often want to form the union of two symsets. When we do, we
795 will often want to know if anything changed (as that might mean there is
796 more work to do). So `symset_union` reports whether anything was added
797 to the first set. We use a slow quadratic approach as these sets are
798 rarely large. If profiling shows this to be a problem it can be
801 static int symset_union(struct symset *a, struct symset *b)
805 for (i = 0; i < b->cnt; i++)
806 if (symset_find(a, b->syms[i]) < 0) {
807 unsigned short data = 0;
808 if (b->data != NO_DATA)
810 symset_add(a, b->syms[i], data);
816 And of course we must be able to free a symset.
818 static void symset_free(struct symset ss)
821 if (ss.data != NO_DATA)
827 Some symsets are simply stored somewhere appropriate in the `grammar`
828 data structure, others need a bit of indirection. This applies
829 particularly to "LA" sets. These will be paired with "items" in an
830 "itemset". As itemsets will be stored in a symset, the "LA" set needs
831 to be stored in the `data` field, so we need a mapping from a small
832 (short) number to an LA `symset`.
834 As mentioned earlier this involves creating a list of unique symsets.
836 For now, we just use a linear list sorted by insertion. A skiplist
837 would be more efficient and may be added later.
844 struct setlist *next;
847 ###### grammar fields
848 struct setlist *sets;
853 static int ss_cmp(struct symset a, struct symset b)
856 int diff = a.cnt - b.cnt;
859 for (i = 0; i < a.cnt; i++) {
860 diff = (int)a.syms[i] - (int)b.syms[i];
867 static int save_set(struct grammar *g, struct symset ss)
869 struct setlist **sl = &g->sets;
873 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
880 s = malloc(sizeof(*s));
889 Finding a set by number is currently performed by a simple linear
890 search. If this turns out to hurt performance, we can store the sets in
891 a dynamic array like the productions.
893 static struct symset set_find(struct grammar *g, int num)
895 struct setlist *sl = g->sets;
896 while (sl && sl->num != num)
901 ### Setting `nullable`
903 We set `nullable` on the head symbol for any production for which all
904 the body symbols (if any) are nullable. As this is a recursive
905 definition, any change in the `nullable` setting means that we need to
906 re-evaluate where it needs to be set.
908 We simply loop around performing the same calculations until no more
915 static void set_nullable(struct grammar *g)
918 while (check_again) {
921 for (p = 0; p < g->production_count; p++) {
922 struct production *pr = g->productions[p];
925 if (pr->head->nullable)
927 for (s = 0; s < pr->body_size; s++)
928 if (! pr->body[s]->nullable)
930 if (s == pr->body_size) {
931 pr->head->nullable = 1;
938 ### Building the `first` sets
940 When calculating what can follow a particular non-terminal, we will need
941 to know what the "first" terminal in any subsequent non-terminal might
942 be. So we calculate the `first` set for every non-terminal and store
943 them in an array. We don't bother recording the "first" set for
944 terminals as they are trivial.
946 As the "first" for one symbol might depend on the "first" of another, we
947 repeat the calculations until no changes happen, just like with
948 `set_nullable`. This makes use of the fact that `symset_union` reports
949 if any change happens.
951 The core of this, which finds the "first" of part of a production body,
952 will be reused for computing the "follow" sets, so we split that out
953 into a separate function, `add_first`, which also reports if it got all
954 the way to the end of the production without finding a non-nullable
957 ###### grammar fields
958 struct symset *first;
962 static int add_first(struct production *pr, int start,
963 struct symset *target, struct grammar *g,
968 for (s = start; s < pr->body_size; s++) {
969 struct symbol *bs = pr->body[s];
970 if (bs->type == Terminal) {
971 if (symset_find(target, bs->num) < 0) {
972 symset_add(target, bs->num, 0);
976 } else if (symset_union(target, &g->first[bs->num]))
982 *to_end = (s == pr->body_size);
986 static void build_first(struct grammar *g)
990 g->first = calloc(g->num_syms, sizeof(g->first[0]));
991 for (s = 0; s < g->num_syms; s++)
992 g->first[s] = INIT_SYMSET;
994 while (check_again) {
997 for (p = 0; p < g->production_count; p++) {
998 struct production *pr = g->productions[p];
999 struct symset *head = &g->first[pr->head->num];
1001 if (add_first(pr, 0, head, g, NULL))
1007 ### Building the `follow` sets.
1009 There are two different situations when we will want to generate
1010 "follow" sets. If we are doing an SLR analysis, we want to generate a
1011 single "follow" set for each non-terminal in the grammar. That is what
1012 is happening here. If we are doing an LALR or LR analysis we will want
1013 to generate a separate "LA" set for each item. We do that later in
1016 There are two parts to generating a "follow" set. Firstly we look at
1017 every place that any non-terminal appears in the body of any production,
1018 and we find the set of possible "first" symbols after there. This is
1019 done using `add_first` above and only needs to be done once as the
1020 "first" sets are now stable and will not change.
1022 ###### grammar fields
1023 struct symset *follow;
1027 for (p = 0; p < g->production_count; p++) {
1028 struct production *pr = g->productions[p];
1031 for (b = 0; b < pr->body_size - 1; b++) {
1032 struct symbol *bs = pr->body[b];
1033 if (bs->type == Terminal)
1035 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1039 The second part is to add the "follow" set of the head of a production
1040 to the "follow" sets of the final symbol in the production, and any
1041 other symbol which is followed only by `nullable` symbols. As this
1042 depends on "follow" itself we need to repeatedly perform the process
1043 until no further changes happen.
1045 Rather than 2 nested loops, one that repeats the whole process until
1046 there is no change, and one that iterates over the set of productions, we
1047 combine these two functions into a single loop.
1051 for (check_again = 0, p = 0;
1052 p < g->production_count;
1053 p < g->production_count-1
1054 ? p++ : check_again ? (check_again = 0, p = 0)
1056 struct production *pr = g->productions[p];
1059 for (b = pr->body_size - 1; b >= 0; b--) {
1060 struct symbol *bs = pr->body[b];
1061 if (bs->type == Terminal)
1063 if (symset_union(&g->follow[bs->num],
1064 &g->follow[pr->head->num]))
1071 We now just need to create and initialise the `follow` list to get a
1075 static void build_follow(struct grammar *g)
1077 int s, check_again, p;
1078 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1079 for (s = 0; s < g->num_syms; s++)
1080 g->follow[s] = INIT_SYMSET;
1084 ### Building itemsets and states
1086 There are three different levels of detail that can be chosen for
1087 building the itemsets and states for the LR grammar. They are:
1089 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1090 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1092 2. LALR(1) where we build look-ahead sets with each item and merge
1093 the LA sets when we find two paths to the same "kernel" set of items.
1094 3. LR(1) where different look-ahead for any item in the set means
1095 a different item set must be created.
1097 ###### forward declarations
1098 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1100 We need to be able to look through existing states to see if a newly
1101 generated state already exists. For now we use a simple sorted linked
1104 An item is a pair of numbers: the production number and the position of
1105 "DOT", which is an index into the body of the production. As the
1106 numbers are not enormous we can combine them into a single "short" and
1107 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1108 production number (so 2000 productions with maximum of 31 symbols in the
1111 Comparing the itemsets will be a little different to comparing symsets
1112 as we want to do the lookup after generating the "kernel" of an
1113 itemset, so we need to ignore the offset=zero items which are added during
1116 To facilitate this, we modify the "DOT" number so that "0" sorts to
1117 the end of the list in the symset, and then only compare items before
/*
 * An "item" is a (production, DOT) pair packed into one unsigned short:
 * the low 11 bits hold the production number (max 2047) and the high
 * 5 bits hold the DOT position (max 31).  DOT is stored as (31 - dot)
 * so that dot == 0 (a completed item) sorts to the END of a symset,
 * letting kernel comparison stop before the completion items.
 */
static inline unsigned short item_num(int production, int dot)
{
	return production | ((31-dot) << 11);
}
/* Extract the production number from a packed item. */
static inline int item_prod(unsigned short item)
{
	return item & 0x7ff;
}
/* Extract the DOT position from a packed item (undoes the 31-dot bias). */
static inline int item_dot(unsigned short item)
{
	return (31-(item >> 11)) & 0x1f;
}
1134 For LR(1) analysis we need to compare not just the itemset in a state
1135 but also the LA sets. As we assign each unique LA set a number, we
1136 can just compare the symset and the data values together.
1139 static int itemset_cmp(struct symset a, struct symset b,
1140 enum grammar_type type)
1146 i < a.cnt && i < b.cnt &&
1147 item_dot(a.syms[i]) > 0 &&
1148 item_dot(b.syms[i]) > 0;
1150 int diff = a.syms[i] - b.syms[i];
1154 diff = a.data[i] - b.data[i];
1159 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1163 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1169 if (type < LR1 || av == -1)
1172 a.data[i] - b.data[i];
1175 It will be helpful to know if an itemset has been "completed" or not,
1176 particularly for LALR where itemsets get merged, at which point they
1177 need to be considered for completion again. So a `completed` flag is
1180 Finally, for handling `TK_out` we need to know whether productions in the
1181 current state started *before* the most recent indent. A state
1182 doesn't usually keep details of individual productions, so we need to
1183 add one extra detail. `min_prefix` is the smallest non-zero number of
1184 symbols *before* DOT in any production in an itemset. This will allow
1185 us to determine if the most recent indent is sufficiently recent
1186 to cancel it against a `TK_out`. If it was seen longer ago than the
1187 `min_prefix`, and if the current state cannot be reduced, then the
1188 indented section must have ended in the middle of a syntactic unit, so
1189 an error must be signaled.
1191 And now we can build the list of itemsets. The lookup routine returns
1192 both a success flag and a pointer to where in the list an insert should
1193 happen, so we don't need to search a second time.
1197 struct itemset *next;
1199 struct symset items;
1200 struct symset go_to;
1202 unsigned short precedence;
1208 ###### grammar fields
1209 struct itemset *items;
1213 static int itemset_find(struct grammar *g, struct itemset ***where,
1214 struct symset kernel, enum grammar_type type)
1216 struct itemset **ip;
1218 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1219 struct itemset *i = *ip;
1221 diff = itemset_cmp(i->items, kernel, type);
1234 Adding an itemset may require merging the LA sets if LALR analysis is
1235 happening. If any new LA set adds any symbols that weren't in the old
1236 LA set, we clear the `completed` flag so that the dependants of this
1237 itemset will be recalculated and their LA sets updated.
1239 `add_itemset` must consume the symsets it is passed, either by adding
1240 them to a data structure, or freeing them.
1242 static int add_itemset(struct grammar *g, struct symset ss,
1243 enum assoc assoc, unsigned short precedence,
1244 enum grammar_type type)
1246 struct itemset **where, *is;
1248 int found = itemset_find(g, &where, ss, type);
1250 is = calloc(1, sizeof(*is));
1251 is->state = g->states;
1255 is->precedence = precedence;
1257 is->go_to = INIT_DATASET;
1266 for (i = 0; i < ss.cnt; i++) {
1267 struct symset temp = INIT_SYMSET, s;
1268 if (ss.data[i] == is->items.data[i])
1270 s = set_find(g, is->items.data[i]);
1271 symset_union(&temp, &s);
1272 s = set_find(g, ss.data[i]);
1273 if (symset_union(&temp, &s)) {
1274 is->items.data[i] = save_set(g, temp);
1285 To build all the itemsets, we first insert the initial itemset made from
1286 production zero, complete each itemset, and then generate new itemsets
1287 from old until no new ones can be made.
1289 Completing an itemset means finding all the items where "DOT" is
1290 followed by a nonterminal and adding "DOT=0" items for every production
1291 from that non-terminal - providing each item hasn't already been added.
1292 When we add such an item it might get inserted before the current
1293 one, so to make sure we process it we reset the loop counter to the
1296 If LA sets are needed, the LA set for each new item is found using
1297 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1298 may be supplemented by the LA set for the item which produced the new
1301 We also collect a set of all symbols which follow "DOT" (in `done`) as
1302 this is used in the next stage.
1304 When itemsets are created we assign a precedence to the itemset from the
1305 complete item, if there is one. We ignore the possibility of there
1306 being two and don't (currently) handle precedence in such grammars.
1307 When completing a grammar we ignore any item where DOT is followed by a
1308 terminal with a precedence lower than that for the itemset. Unless the
1309 terminal has right associativity, we also ignore items where the
1310 terminal has the same precedence. The result is that unwanted items are
1311 still in the itemset, but the terminal doesn't get into the go to set,
1312 so the item is ineffective.
1314 ###### complete itemset
1315 for (i = 0; i < is->items.cnt; i++) {
1316 int p = item_prod(is->items.syms[i]);
1317 int bs = item_dot(is->items.syms[i]);
1318 struct production *pr = g->productions[p];
1321 struct symset LA = INIT_SYMSET;
1322 unsigned short sn = 0;
1324 if (is->min_prefix == 0 ||
1325 (bs > 0 && bs < is->min_prefix))
1326 is->min_prefix = bs;
1327 if (bs == pr->body_size)
1330 if (s->precedence && is->precedence &&
1331 is->precedence > s->precedence)
1332 /* This terminal has a low precedence and
1333 * shouldn't be shifted
1336 if (s->precedence && is->precedence &&
1337 is->precedence == s->precedence && s->assoc != Right)
1338 /* This terminal has a matching precedence and is
1339 * not Right-associative, so we mustn't shift it.
1342 if (symset_find(&done, s->num) < 0)
1343 symset_add(&done, s->num, 0);
1345 if (s->type != Nonterminal)
1351 add_first(pr, bs+1, &LA, g, &to_end);
1353 struct symset ss = set_find(g, is->items.data[i]);
1354 symset_union(&LA, &ss);
1356 sn = save_set(g, LA);
1357 LA = set_find(g, sn);
1360 /* Add productions for this symbol */
1361 for (p2 = s->first_production;
1362 p2 < g->production_count &&
1363 g->productions[p2]->head == s;
1365 int itm = item_num(p2, 0);
1366 int pos = symset_find(&is->items, itm);
1368 symset_add(&is->items, itm, sn);
1369 /* Will have re-ordered, so start
1370 * from beginning again */
1372 } else if (type >= LALR) {
1373 struct symset ss = set_find(g, is->items.data[pos]);
1374 struct symset tmp = INIT_SYMSET;
1375 struct symset *la = &LA;
1377 symset_union(&tmp, &ss);
1378 if (symset_union(&tmp, la)) {
1379 is->items.data[pos] = save_set(g, tmp);
1387 For each symbol we found (and placed in `done`) we collect all the items
1388 for which this symbol is next, and create a new itemset with "DOT"
1389 advanced over the symbol. This is then added to the collection of
1390 itemsets (or merged with a pre-existing itemset).
1392 ###### derive itemsets
1393 // Now we have a completed itemset, so we need to
1394 // compute all the 'next' itemsets and create them
1395 // if they don't exist.
1396 for (i = 0; i < done.cnt; i++) {
1398 unsigned short state;
1399 struct symbol *sym = g->symtab[done.syms[i]];
1400 enum assoc assoc = Non;
1401 unsigned short precedence = 0;
1402 struct symset newitemset = INIT_SYMSET;
1404 newitemset = INIT_DATASET;
1406 for (j = 0; j < is->items.cnt; j++) {
1407 int itm = is->items.syms[j];
1408 int p = item_prod(itm);
1409 int bp = item_dot(itm);
1410 struct production *pr = g->productions[p];
1411 unsigned short la = 0;
1414 if (bp == pr->body_size)
1416 if (pr->body[bp] != sym)
1421 la = is->items.data[j];
1422 if (bp == pr->body_size &&
1423 pr->precedence > 0 &&
1424 pr->precedence > precedence) {
1425 // new itemset is reducible and has a precedence.
1426 precedence = pr->precedence;
1429 pos = symset_find(&newitemset, item_num(p, bp));
1432 symset_add(&newitemset, item_num(p, bp), la);
1433 else if (type >= LALR) {
1434 // Need to merge la set.
1435 int la2 = newitemset.data[pos];
1437 struct symset ss = set_find(g, la2);
1438 struct symset LA = INIT_SYMSET;
1439 symset_union(&LA, &ss);
1440 ss = set_find(g, la);
1441 if (symset_union(&LA, &ss))
1442 newitemset.data[pos] = save_set(g, LA);
1448 state = add_itemset(g, newitemset, assoc, precedence, type);
1449 if (symset_find(&is->go_to, done.syms[i]) < 0)
1450 symset_add(&is->go_to, done.syms[i], state);
1453 All that is left is to create the initial itemset from production zero, and
1454 with `TK_eof` as the LA set.
1457 static void build_itemsets(struct grammar *g, enum grammar_type type)
1459 struct symset first = INIT_SYMSET;
1462 unsigned short la = 0;
1464 // LA set just has eof
1465 struct symset eof = INIT_SYMSET;
1466 symset_add(&eof, TK_eof, 0);
1467 la = save_set(g, eof);
1468 first = INIT_DATASET;
1470 // production 0, offset 0 (with no data)
1471 symset_add(&first, item_num(0, 0), la);
1472 add_itemset(g, first, Non, 0, type);
1473 for (check_again = 0, is = g->items;
1475 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1477 struct symset done = INIT_SYMSET;
1488 ### Completing the analysis.
1490 The exact process of analysis depends on which level was requested. For
1491 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1492 `LR1` we need first, but not follow. For `SLR` we need both.
1494 We don't build the "action" tables that you might expect as the parser
1495 doesn't use them. They will be calculated to some extent if needed for
1498 Once we have built everything we allocate arrays for the two lists:
1499 symbols and itemsets. This allows more efficient access during
1500 reporting. The symbols are grouped as terminals and then non-terminals,
1501 and we record the changeover point in `first_nonterm`.
1503 ###### grammar fields
1504 struct symbol **symtab;
1505 struct itemset **statetab;
1508 ###### grammar_analyse
1510 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1514 int snum = TK_reserved;
1515 for (s = g->syms; s; s = s->next)
1516 if (s->num < 0 && s->type == Terminal) {
1520 g->first_nonterm = snum;
1521 for (s = g->syms; s; s = s->next)
1522 if (s->num < 0 && s->type != Virtual) {
1526 for (s = g->syms; s; s = s->next)
1532 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1533 for (s = g->syms; s; s = s->next)
1534 g->symtab[s->num] = s;
1543 build_itemsets(g, type);
1545 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1546 for (is = g->items; is ; is = is->next)
1547 g->statetab[is->state] = is;
1550 ## Reporting on the Grammar
1552 The purpose of the report is to give the grammar developer insight into
1553 how the grammar parser will work. It is basically a structured dump of
1554 all the tables that have been generated, plus a description of any conflicts.
1556 ###### grammar_report
1557 static int grammar_report(struct grammar *g, enum grammar_type type)
1563 return report_conflicts(g, type);
1566 Firstly we have the complete list of symbols, together with the
1567 "FIRST" set if that was generated. We add a mark to each symbol to
1568 show if it is nullable (`.`).
1572 static void report_symbols(struct grammar *g)
1576 printf("SYMBOLS + FIRST:\n");
1578 printf("SYMBOLS:\n");
1580 for (n = 0; n < g->num_syms; n++) {
1581 struct symbol *s = g->symtab[n];
1585 printf(" %c%3d%c: ",
1586 s->nullable ? '.':' ',
1587 s->num, symtypes[s->type]);
1590 printf(" (%d%s)", s->precedence,
1591 assoc_names[s->assoc]);
1593 if (g->first && s->type == Nonterminal) {
1596 for (j = 0; j < g->first[n].cnt; j++) {
1599 prtxt(g->symtab[g->first[n].syms[j]]->name);
1606 Then we have the follow sets if they were computed.
1608 static void report_follow(struct grammar *g)
1611 printf("FOLLOW:\n");
1612 for (n = 0; n < g->num_syms; n++)
1613 if (g->follow[n].cnt) {
1617 prtxt(g->symtab[n]->name);
1618 for (j = 0; j < g->follow[n].cnt; j++) {
1621 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1627 And finally the item sets. These include the GO TO tables and, for
1628 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1629 it up a bit. First the items, with production number and associativity.
1631 static void report_item(struct grammar *g, int itm)
1633 int p = item_prod(itm);
1634 int dot = item_dot(itm);
1635 struct production *pr = g->productions[p];
1639 prtxt(pr->head->name);
1641 for (i = 0; i < pr->body_size; i++) {
1642 printf(" %s", (dot == i ? ". ": ""));
1643 prtxt(pr->body[i]->name);
1645 if (dot == pr->body_size)
1648 if (pr->precedence && dot == pr->body_size)
1649 printf(" (%d%s)", pr->precedence,
1650 assoc_names[pr->assoc]);
1651 if (dot < pr->body_size &&
1652 pr->body[dot]->precedence) {
1653 struct symbol *s = pr->body[dot];
1654 printf(" [%d%s]", s->precedence,
1655 assoc_names[s->assoc]);
1660 The LA sets which are (possibly) reported with each item:
1662 static void report_la(struct grammar *g, int lanum)
1664 struct symset la = set_find(g, lanum);
1668 printf(" LOOK AHEAD(%d)", lanum);
1669 for (i = 0; i < la.cnt; i++) {
1672 prtxt(g->symtab[la.syms[i]]->name);
1677 Then the go to sets:
1679 static void report_goto(struct grammar *g, struct symset gt)
1684 for (i = 0; i < gt.cnt; i++) {
1686 prtxt(g->symtab[gt.syms[i]]->name);
1687 printf(" -> %d\n", gt.data[i]);
1691 Now we can report all the item sets complete with items, LA sets, and GO TO.
1693 static void report_itemsets(struct grammar *g)
1696 printf("ITEM SETS(%d)\n", g->states);
1697 for (s = 0; s < g->states; s++) {
1699 struct itemset *is = g->statetab[s];
1700 printf(" Itemset %d:%s min prefix=%d",
1701 s, is->starts_line?" (startsline)":"", is->min_prefix);
1703 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1705 for (j = 0; j < is->items.cnt; j++) {
1706 report_item(g, is->items.syms[j]);
1707 if (is->items.data != NO_DATA)
1708 report_la(g, is->items.data[j]);
1710 report_goto(g, is->go_to);
1714 ### Reporting conflicts
1716 Conflict detection varies a lot among different analysis levels. However
1717 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1718 LR1 are also similar as they have FOLLOW or LA sets.
1722 ## conflict functions
1724 static int report_conflicts(struct grammar *g, enum grammar_type type)
1727 printf("Conflicts:\n");
1729 cnt = conflicts_lr0(g, type);
1731 cnt = conflicts_slr(g, type);
1733 printf(" - no conflicts\n");
1737 LR0 conflicts occur in any state which has both a reducible item and
1738 a shiftable item, or two reducible items.
1740 LR05 conflicts only occur if two possible reductions exist,
1741 as shifts always over-ride reductions.
1743 ###### conflict functions
1744 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1748 for (i = 0; i < g->states; i++) {
1749 struct itemset *is = g->statetab[i];
1750 int last_reduce = -1;
1751 int prev_reduce = -1;
1752 int last_shift = -1;
1756 for (j = 0; j < is->items.cnt; j++) {
1757 int itm = is->items.syms[j];
1758 int p = item_prod(itm);
1759 int bp = item_dot(itm);
1760 struct production *pr = g->productions[p];
1762 if (bp == pr->body_size) {
1763 prev_reduce = last_reduce;
1767 if (pr->body[bp]->type == Terminal)
1770 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1771 printf(" State %d has both SHIFT and REDUCE:\n", i);
1772 report_item(g, is->items.syms[last_shift]);
1773 report_item(g, is->items.syms[last_reduce]);
1776 if (prev_reduce >= 0) {
1777 printf(" State %d has 2 (or more) reducible items\n", i);
1778 report_item(g, is->items.syms[prev_reduce]);
1779 report_item(g, is->items.syms[last_reduce]);
1786 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1787 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1788 in the source of the look ahead set.
1790 We build two datasets to reflect the "action" table: one which maps
1791 terminals to items where that terminal could be shifted and another
1792 which maps terminals to items that could be reduced when the terminal
1793 is in look-ahead. We report when we get conflicts between the two.
1795 As a special case, if we find a SHIFT/REDUCE conflict on the NEWLINE
1796 terminal, we ignore it. NEWLINE is handled specially with its own
1797 rules for when to shift and when to reduce. Conflicts are expected,
1798 but handled internally.
1800 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1805 for (i = 0; i < g->states; i++) {
1806 struct itemset *is = g->statetab[i];
1807 struct symset shifts = INIT_DATASET;
1808 struct symset reduce = INIT_DATASET;
1812 /* First collect the shifts */
1813 for (j = 0; j < is->items.cnt; j++) {
1814 unsigned short itm = is->items.syms[j];
1815 int p = item_prod(itm);
1816 int bp = item_dot(itm);
1817 struct production *pr = g->productions[p];
1820 if (bp >= pr->body_size ||
1821 pr->body[bp]->type != Terminal)
1826 if (s->precedence && is->precedence)
1827 /* Precedence resolves this, so no conflict */
1830 if (symset_find(&shifts, s->num) < 0)
1831 symset_add(&shifts, s->num, itm);
1833 /* Now look for reductions and conflicts */
1834 for (j = 0; j < is->items.cnt; j++) {
1835 unsigned short itm = is->items.syms[j];
1836 int p = item_prod(itm);
1837 int bp = item_dot(itm);
1838 struct production *pr = g->productions[p];
1840 if (bp < pr->body_size)
1845 la = g->follow[pr->head->num];
1847 la = set_find(g, is->items.data[j]);
1849 for (k = 0; k < la.cnt; k++) {
1850 int pos = symset_find(&shifts, la.syms[k]);
1851 if (pos >= 0 && la.syms[k] != TK_newline) {
1852 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1854 prtxt(g->symtab[la.syms[k]]->name);
1856 report_item(g, shifts.data[pos]);
1857 report_item(g, itm);
1859 pos = symset_find(&reduce, la.syms[k]);
1861 symset_add(&reduce, la.syms[k], itm);
1864 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1865 prtxt(g->symtab[la.syms[k]]->name);
1867 report_item(g, itm);
1868 report_item(g, reduce.data[pos]);
1872 symset_free(shifts);
1873 symset_free(reduce);
1879 ## Generating the parser
1881 The exported part of the parser is the `parse_XX` function, where the name
1882 `XX` is based on the name of the parser files.
1884 This takes a `code_node`, a partially initialized `token_config`, and an
1885 optional `FILE` to send tracing to. The `token_config` gets the list of
1886 known words added and then is used with the `code_node` to initialize the
1889 `parse_XX` then calls the library function `parser_run` to actually complete
1890 the parse. This needs the `states` table and functions to call the various
1891 pieces of code provided in the grammar file, so they are generated first.
1893 ###### parser_generate
1895 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1896 struct code_node *pre_reduce)
1902 gen_reduce(f, g, file, pre_reduce);
1905 fprintf(f, "#line 0 \"gen_parser\"\n");
1906 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1909 fprintf(f, "\tstruct token_state *tokens;\n");
1910 fprintf(f, "\tconfig->words_marks = known;\n");
1911 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1912 fprintf(f, "\ttokens = token_open(code, config);\n");
1913 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1914 fprintf(f, "\ttoken_close(tokens);\n");
1915 fprintf(f, "\treturn rv;\n");
1916 fprintf(f, "}\n\n");
1919 ### Known words table
1921 The known words table is simply an array of terminal symbols.
1922 The table of nonterminals used for tracing is a similar array.
1926 static void gen_known(FILE *f, struct grammar *g)
1929 fprintf(f, "#line 0 \"gen_known\"\n");
1930 fprintf(f, "static const char *known[] = {\n");
1931 for (i = TK_reserved;
1932 i < g->num_syms && g->symtab[i]->type == Terminal;
1934 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1935 g->symtab[i]->name.txt);
1936 fprintf(f, "};\n\n");
1939 static void gen_non_term(FILE *f, struct grammar *g)
1942 fprintf(f, "#line 0 \"gen_non_term\"\n");
1943 fprintf(f, "static const char *non_term[] = {\n");
1944 for (i = TK_reserved;
1947 if (g->symtab[i]->type == Nonterminal)
1948 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1949 g->symtab[i]->name.txt);
1950 fprintf(f, "};\n\n");
1953 ### States and the goto tables.
1955 For each state we record the goto table and details of the reducible
1956 production if there is one.
1957 Some of the details of the reducible production are stored in the
1958 `do_reduce` function to come later. Here we store the production
1959 number, the body size (useful for stack management), and the resulting
1960 symbol (useful for knowing how to free data later).
1962 The go to table is stored in a simple array of `sym` and corresponding
1965 ###### exported types
1973 const struct lookup * go_to;
1985 static void gen_goto(FILE *f, struct grammar *g)
1988 fprintf(f, "#line 0 \"gen_goto\"\n");
1989 for (i = 0; i < g->states; i++) {
1990 struct symset gt = g->statetab[i]->go_to;
1995 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1997 for (j = 0; j < gt.cnt; j++)
1998 fprintf(f, "\t{ %d, %d },\n",
1999 gt.syms[j], gt.data[j]);
2004 static void gen_states(FILE *f, struct grammar *g)
2007 fprintf(f, "#line 0 \"gen_states\"\n");
2008 fprintf(f, "static const struct state states[] = {\n");
2009 for (i = 0; i < g->states; i++) {
2010 struct itemset *is = g->statetab[i];
2011 int j, prod = -1, prod_len;
2013 for (j = 0; j < is->items.cnt; j++) {
2014 int itm = is->items.syms[j];
2015 int p = item_prod(itm);
2016 int bp = item_dot(itm);
2017 struct production *pr = g->productions[p];
2019 if (bp < pr->body_size)
2021 /* This is what we reduce - choose longest */
2022 if (prod < 0 || prod_len < pr->body_size) {
2024 prod_len = pr->body_size;
2028 fprintf(f, "\t[%d] = { %d, goto_",
2029 i, is->go_to.cnt, i);
2031 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2033 struct production *pr = g->productions[prod];
2034 struct symbol *hd = pr->head;
2035 fprintf(f, "%d, %d, %d, %d, %d, %d, ", prod,
2040 if (hd->struct_name.txt == NULL)
2041 fprintf(f, "0 },\n");
2043 fprintf(f, "sizeof(struct %.*s%s) },\n",
2044 hd->struct_name.len,
2045 hd->struct_name.txt,
2046 hd->isref ? "*" : "");
2048 fprintf(f, "-1, -1, -1, %d, %d, -1 },\n",
2049 is->starts_line, is->min_prefix);
2051 fprintf(f, "};\n\n");
2054 ### The `do_reduce` function and the code
2056 When the parser engine decides to reduce a production, it calls
2057 `do_reduce` which runs the code that was included with the production,
2060 This code needs to be able to store data somewhere. Rather than
2061 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2062 buffer and have `do_reduce` return the size to be saved.
2064 In order for the code to access "global" context, we pass in the
2065 "config" pointer that was passed to the parser function. If the `struct
2066 token_config` is embedded in some larger structure, the reducing code
2067 can access the larger structure using pointer manipulation. Performing
2068 that pointer manipulation, and any other common code or variables for
2069 all reduce actions, can be provided in a code node called "reduce" which
2070 is passed around in `pre_reduce` which you might have already noticed.
2072 The code fragments require translation when written out. Any `$N` needs
2073 to be converted to a reference either to that buffer (if `$0`) or to the
2074 structure returned by a previous reduction. These pointers need to be
2075 cast to the appropriate type for each access. All this is handled in
2078 `gen_code` also allows symbol references to contain a '`<`' as in
2079 '`$<2`'. This is particularly useful for references (or pointers), but
2080 can be used with structures too. The `<` implies that the value is
2081 being moved out, so the object will not be automatically freed. It is
2082 equivalent to assigning `NULL` to the pointer or filling a structure
2085 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2086 and possibly a number. A number by itself (other than zero) selects a
2087 symbol from the body of the production. A sequence of letters selects
2088 the shortest symbol in the body which contains those letters in the
2089 given order. If a number follows the letters, then a later occurrence
2090 of that symbol is chosen. So "`$AB2`" will refer to the structure
2091 attached to the second occurrence of the shortest symbol which contains
2092 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2093 the number given is too large, then the symbol reference is not
2094 transformed, and will cause an error when the code is compiled.
2098 static int textchr(struct text t, char c, int s)
2101 for (i = s; i < t.len; i++)
2107 static int subseq_match(char *seq, int slen, struct text name)
2111 st = textchr(name, *seq, st);
2121 static int choose_sym(char **namep, int len, struct production *p)
2123 char *name = *namep;
2132 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2138 while (len > 0 && (c >= '0' && c <= '9')) {
2141 n = n * 10 + (c - '0');
2145 if (name == *namep || n > p->body_size)
2151 for (i = 0; i < p->body_size; i++) {
2152 if (!subseq_match(nam, namlen, p->body[i]->name))
2154 if (slen == 0 || p->body[i]->name.len < slen) {
2156 slen = p->body[i]->name.len;
2158 if (s >= 0 && p->body[i] != p->body[s] &&
2159 p->body[i]->name.len == p->body[s]->name.len)
2160 /* not unique, so s cannot be used */
2167 for (i = 0; i < p->body_size; i++)
2168 if (p->body[i] == p->body[s]) {
2173 if (n > 0 || i == p->body_size)
2179 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2182 char *used = calloc(1, p->body_size);
2185 fprintf(f, "\t\t\t");
2186 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2200 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2209 fprintf(f, "(*(struct %.*s*%s)ret)",
2210 p->head->struct_name.len,
2211 p->head->struct_name.txt,
2212 p->head->isref ? "*":"");
2213 else if (p->body[n-1]->type == Terminal)
2214 fprintf(f, "(*(struct token *)body[%d])",
2216 else if (p->body[n-1]->struct_name.txt == NULL)
2217 fprintf(f, "$%d", n);
2219 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2220 p->body[n-1]->struct_name.len,
2221 p->body[n-1]->struct_name.txt,
2222 p->body[n-1]->isref ? "*":"", n-1);
2228 for (i = 0; i < p->body_size; i++) {
2229 if (p->body[i]->struct_name.txt &&
2231 // assume this has been copied out
2232 if (p->body[i]->isref)
2233 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2235 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2243 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2244 struct code_node *pre_reduce)
2247 fprintf(f, "#line 1 \"gen_reduce\"\n");
2248 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2250 fprintf(f, "\tint ret_size = 0;\n");
2252 code_node_print(f, pre_reduce, file);
2254 fprintf(f, "#line 4 \"gen_reduce\"\n");
2255 fprintf(f, "\tswitch(prod) {\n");
2256 for (i = 0; i < g->production_count; i++) {
2257 struct production *p = g->productions[i];
2258 fprintf(f, "\tcase %d:\n", i);
2261 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2265 if (p->head->struct_name.txt)
2266 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2267 p->head->struct_name.len,
2268 p->head->struct_name.txt,
2269 p->head->isref ? "*":"");
2271 fprintf(f, "\t\tbreak;\n");
2273 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2278 As each non-terminal can potentially cause a different type of data
2279 structure to be allocated and filled in, we need to be able to free it when
2282 It is particularly important to have fine control over freeing during error
2283 recovery where individual stack frames might need to be freed.
2285 For this, the grammar author is required to define a `free_XX` function for
2286 each structure that is used by a non-terminal. `do_free` will call whichever
2287 is appropriate given a symbol number, and will call `free` (as is
2288 appropriate for tokens) on any terminal symbol.
2292 static void gen_free(FILE *f, struct grammar *g)
2296 fprintf(f, "#line 0 \"gen_free\"\n");
2297 fprintf(f, "static void do_free(short sym, void *asn)\n");
2299 fprintf(f, "\tif (!asn) return;\n");
2300 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2301 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2302 fprintf(f, "\tswitch(sym) {\n");
2304 for (i = 0; i < g->num_syms; i++) {
2305 struct symbol *s = g->symtab[i];
2307 s->type != Nonterminal ||
2308 s->struct_name.txt == NULL)
2311 fprintf(f, "\tcase %d:\n", s->num);
2313 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2315 s->struct_name.txt);
2316 fprintf(f, "\t\tfree(asn);\n");
2318 fprintf(f, "\t\tfree_%.*s(asn);\n",
2320 s->struct_name.txt);
2321 fprintf(f, "\t\tbreak;\n");
2323 fprintf(f, "\t}\n}\n\n");
2326 ## The main routine.
2328 There are three key parts to the "main" function of parsergen: processing
2329 the arguments, loading the grammar file, and dealing with the grammar.
2331 ### Argument processing.
2333 Command line options allow the selection of analysis mode, name of output
2334 file, and whether or not a report should be generated. By default we create
2335 a report only if no code output was requested.
2337 The `parse_XX` function name uses the basename of the output file which
2338 should not have a suffix (`.c`). `.c` is added to the given name for the
2339 code, and `.h` is added for the header (if header text is specified in the
2346 static const struct option long_options[] = {
2347 { "LR0", 0, NULL, '0' },
2348 { "LR05", 0, NULL, '5' },
2349 { "SLR", 0, NULL, 'S' },
2350 { "LALR", 0, NULL, 'L' },
2351 { "LR1", 0, NULL, '1' },
2352 { "tag", 1, NULL, 't' },
2353 { "report", 0, NULL, 'R' },
2354 { "output", 1, NULL, 'o' },
2355 { NULL, 0, NULL, 0 }
2357 const char *options = "05SL1t:Ro:";
2359 ###### process arguments
2361 char *outfile = NULL;
2366 enum grammar_type type = LR05;
2367 while ((opt = getopt_long(argc, argv, options,
2368 long_options, NULL)) != -1) {
2383 outfile = optarg; break;
2385 tag = optarg; break;
2387 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2392 infile = argv[optind++];
2394 fprintf(stderr, "No input file given\n");
2397 if (outfile && report == 1)
2400 if (name && strchr(name, '/'))
2401 name = strrchr(name, '/')+1;
2403 if (optind < argc) {
2404 fprintf(stderr, "Excess command line arguments\n");
2408 ### Loading the grammar file
2410 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2413 Once we have extracted the code (with `mdcode`) we expect to find three
2414 or four sections: header, code, grammar, reduce.
2415 Anything else that is not excluded by the `--tag` option is an error.
2417 "header", "code", and "reduce" are optional, though it is hard to build
2418 a working parser without the first two. "grammar" must be provided.
2422 #include <sys/mman.h>
2427 static void pr_err(char *msg)
2430 fprintf(stderr, "%s\n", msg);
2434 struct section *table;
2438 fd = open(infile, O_RDONLY);
2440 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2441 infile, strerror(errno));
2444 len = lseek(fd, 0, 2);
2445 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2446 table = code_extract(file, file+len, pr_err);
2448 struct code_node *hdr = NULL;
2449 struct code_node *code = NULL;
2450 struct code_node *gram = NULL;
2451 struct code_node *pre_reduce = NULL;
2452 for (s = table; s; s = s->next) {
2453 struct text sec = s->section;
2454 if (tag && !strip_tag(&sec, tag))
2456 if (text_is(sec, "header"))
2458 else if (text_is(sec, "code"))
2460 else if (text_is(sec, "grammar"))
2462 else if (text_is(sec, "reduce"))
2463 pre_reduce = s->code;
2465 fprintf(stderr, "Unknown content section: %.*s\n",
2466 s->section.len, s->section.txt);
2471 ### Processing the input
2473 As we need to append an extension to a filename and then open it for
2474 writing, and we need to do this twice, it helps to have a separate function.
2478 static FILE *open_ext(char *base, char *ext)
2480 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2482 strcat(strcpy(fn, base), ext);
2488 If we can read the grammar, then we analyse and optionally report on it. If
2489 the report finds conflicts we will exit with an error status.
2491 ###### process input
2492 struct grammar *g = NULL;
2494 fprintf(stderr, "No grammar section provided\n");
2497 g = grammar_read(gram);
2499 fprintf(stderr, "Failure to parse grammar\n");
2504 grammar_analyse(g, type);
2506 if (grammar_report(g, type))
2510 If a "header" section is defined, we write it out and include a declaration
2511 for the `parse_XX` function so it can be used from separate code.
2513 if (rv == 0 && hdr && outfile) {
2514 FILE *f = open_ext(outfile, ".h");
2516 code_node_print(f, hdr, infile);
2517 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2521 fprintf(stderr, "Cannot create %s.h\n",
2527 And if all goes well and an output file was provided, we create the `.c`
2528 file with the code section (if any) and the parser tables and function.
2530 if (rv == 0 && outfile) {
2531 FILE *f = open_ext(outfile, ".c");
2534 code_node_print(f, code, infile);
2535 gen_parser(f, g, infile, name, pre_reduce);
2538 fprintf(stderr, "Cannot create %s.c\n",
2544 And that about wraps it up. We need to set the locale so that UTF-8 is
2545 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2547 ###### File: parsergen.mk
2548 parsergen : parsergen.o libscanner.o libmdcode.o
2549 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2556 int main(int argc, char *argv[])
2561 setlocale(LC_ALL,"");
2563 ## process arguments
2570 ## The SHIFT/REDUCE parser
2572 Having analysed the grammar and generated all the tables, we only need
2573 the shift/reduce engine to bring it all together.
2575 ### Goto table lookup
2577 The parser generator has nicely provided us with goto tables sorted by
2578 symbol number. We need a binary search function to find a symbol in the
2581 ###### parser functions
2583 static int search(const struct state *l, int sym)
2586 int hi = l->go_to_cnt;
2590 while (lo + 1 < hi) {
2591 int mid = (lo + hi) / 2;
2592 if (l->go_to[mid].sym <= sym)
2597 if (l->go_to[lo].sym == sym)
2598 return l->go_to[lo].state;
2603 ### Memory allocation
2605 The `scanner` returns tokens in a local variable - we want them in allocated
2606 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2607 make an allocated copy as required.
2609 ###### parser includes
2612 ###### parser functions
2614 static struct token *tok_copy(struct token tk)
2616 struct token *new = malloc(sizeof(*new));
2621 ### The state stack.
2623 The core data structure for the parser is the stack. This tracks all
2624 the symbols that have been recognised or partially recognised.
2626 The stack usually won't grow very large - maybe a few tens of entries.
2627 So we dynamically grow an array as required but never bother to
2628 shrink it down again.
2630 We keep the stack as two separate allocations. One, `asn_stack` stores
2631 the "abstract syntax nodes" which are created by each reduction. When
2632 we call `do_reduce` we need to pass an array of the `asn`s of the body
2633 of the production, and by keeping a separate `asn` stack, we can just
2634 pass a pointer into this stack.
2636 The other allocation stores all other stack fields of which there are
2637 several. The `state` is the most important one and guides the parsing
2638 process. The `sym` is nearly unnecessary as it is implicit in the
2639 state. However when we want to free entries from the `asn_stack`, it
2640 helps to know what type they are so we can call the right freeing
2641 function. The symbol leads us to the right free function through
2644 The `indents` count tracks the line indents within the symbol or
2645 immediately following it.  These are used to allow indent information to
2646 guide parsing and error recovery.
2648 `since_newline` tracks how many stack frames since the last
2649 start-of-line (whether indented or not). So if `since_newline` is
2650 zero, then this symbol is at the start of a line. Similarly
2651 `since_indent` counts the number of states since an indent, it is zero
2652 precisely when `indents` is not zero.
2654 `newline_permitted` keeps track of whether newlines should be ignored
2657 The stack is most properly seen as alternating states and symbols -
2658 states, like the 'DOT' in items, are between symbols. Each frame in
2659 our stack holds a state and the symbol that was before it. The
2660 bottom of stack holds the start state but no symbol, as nothing came
2661 before the beginning. As we need to store some value, `TK_eof` is used
2662 to mark the beginning of the file as well as the end.
2664 ###### parser functions
2669 short newline_permitted;
2673 short since_newline;
2683 Two operations are needed on the stack - shift (which is like push) and pop.
2685 Shift applies not only to terminals but also to non-terminals. When we
2686 reduce a production we will pop off frames corresponding to the body
2687 symbols, then push on a frame for the head of the production. This last
2688 is exactly the same process as shifting in a terminal so we use the same
2689 function for both. In both cases we provide the symbol, the number of
2690 indents the symbol contains (which will be zero for a terminal symbol)
2691 and a flag indicating that the symbol was at (or was reduced from a
2692 symbol which was at) the start of a line. The state is deduced from the
2693 current top-of-stack state and the new symbol.
2695 To simplify other code we arrange for `shift` to fail if there is no `goto`
2696 state for the symbol. This is useful in basic parsing due to our design
2697 that we shift when we can, and reduce when we cannot. So the `shift`
2698 function reports if it could.
2700 `shift` is also used to push state zero onto the stack, so if the
2701 stack is empty, it always chooses zero as the next state.
2703 So `shift` finds the next state. If that succeeds it extends the
2704 allocations if needed and pushes all the information onto the stacks.
2706 Newlines are permitted after a `starts_line` state until an internal
2707 indent. If the new frame has neither a `starts_line` state nor an
2708 indent, newlines are permitted if the previous stack frame permitted
2711 ###### parser functions
2713 static int shift(struct parser *p,
2714 short sym, short indents, short start_of_line,
2716 const struct state states[])
2718 // Push an entry onto the stack
2719 struct frame next = {0};
2720 int newstate = p->tos
2721 ? search(&states[p->stack[p->tos-1].state],
2726 if (p->tos >= p->stack_size) {
2727 p->stack_size += 10;
2728 p->stack = realloc(p->stack, p->stack_size
2729 * sizeof(p->stack[0]));
2730 p->asn_stack = realloc(p->asn_stack, p->stack_size
2731 * sizeof(p->asn_stack[0]));
2734 next.indents = indents;
2735 next.state = newstate;
2736 if (states[newstate].starts_line)
2737 next.newline_permitted = 1;
2739 next.newline_permitted = 0;
2741 next.newline_permitted =
2742 p->stack[p->tos-1].newline_permitted;
2744 next.newline_permitted = 0;
2746 if (!start_of_line) {
2748 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2750 next.since_newline = 1;
2753 next.since_indent = 0;
2755 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2757 next.since_indent = 1;
2759 p->stack[p->tos] = next;
2760 p->asn_stack[p->tos] = asn;
2765 `pop` primarily moves the top of stack (`tos`) back down the required
2766 amount and frees any `asn` entries that need to be freed. It also
2767 collects a summary of the indents and line starts in the symbols that
2768 are being removed. It is called _after_ we reduce a production, just
2769 before we `shift` the nonterminal in.
2771 ###### parser functions
2773 static int pop(struct parser *p, int num,
2774 short *start_of_line,
2775 void(*do_free)(short sym, void *asn))
2782 for (i = 0; i < num; i++) {
2783 sol |= !p->stack[p->tos+i].since_newline;
2784 indents += p->stack[p->tos+i].indents;
2785 do_free(p->stack[p->tos+i].sym,
2786 p->asn_stack[p->tos+i]);
2789 *start_of_line = sol;
2793 ### The heart of the parser.
2795 Now we have the parser. For each token we might shift it, trigger a
2796 reduction, or start error handling. 2D tokens (IN, OUT, EOL) also need
2799 We return whatever `asn` was returned by reducing production zero.
2801 If we can neither shift nor reduce we have an error to handle. We pop
2802 single entries off the stack until we can shift the `TK_error` symbol,
2803 then drop input tokens until we find one we can shift into the new error
2804 state. We need to ensure that something is dropped or shifted after an
2805 error, or we could get into an infinite loop, only shifting in
2806 `TK_error`, then reducing. So we track if there has been a shift since
2807 the last error, and if not the next error always discards one token.
2809 When we find `TK_in` and `TK_out` tokens which report indents we need
2810 to handle them directly as the grammar cannot express what we want to
2813 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2814 record how many indents there are following the previous token.
2816 `TK_out` tokens must be canceled against an indent count
2817 within the stack. If we can reduce some symbols that are all since
2818 the most recent indent, then we do that first. If the minimum prefix
2819 of the current state then extends back before the most recent indent,
2820 that indent can be cancelled. If the minimum prefix is shorter, then
2821 the indent had ended prematurely and we must start error handling, which
2822 is still a work-in-progress.
2824 `TK_newline` tokens are ignored unless the top stack frame records
2825 that they are permitted. In that case they will not be considered for
2826 shifting if it is possible to reduce some symbols that are all since
2827 the most recent start of line. This is how a newline forcibly
2828 terminates any line-like structure - we try to reduce down to at most
2829 one symbol for each line where newlines are allowed.
2830 A consequence of this is that a rule like
2832 ###### Example: newlines - broken
2836 IfStatement -> Newlines if ....
2838 cannot work, as the NEWLINE will never be shifted as the empty string
2839 will be reduced first. Optional sets of newlines need to be included
2840 in the thing that precedes:
2842 ###### Example: newlines - works
2846 IfStatement -> If ....
2848 Here the NEWLINE will be shifted because nothing can be reduced until
2851 When during error handling we discard tokens read in, we want to keep
2852 discarding until we see one that is recognised. If we had a full set
2853 of LR(1) grammar states, this would mean looking in the look-ahead set,
2854 but we don't keep a full look-ahead set. We only record the subset
2855 that leads to SHIFT. We can, however, deduce the look-ahead set by
2856 looking at the SHIFT subsets for all states that we can get to by
2857 reducing zero or more times. So we need a little function which
2858 checks if a given token is in any of these look-ahead sets.
2860 ###### parser includes
2865 static int in_lookahead(struct token *tk, const struct state *states, int state)
2867 while (state >= 0) {
2868 if (search(&states[state], tk->num) >= 0)
2870 if (states[state].reduce_prod < 0)
2872 state = search(&states[state], states[state].reduce_sym);
2877 void *parser_run(struct token_state *tokens,
2878 const struct state states[],
2879 int (*do_reduce)(int, void**, struct token_config*, void*),
2880 void (*do_free)(short, void*),
2881 FILE *trace, const char *non_term[],
2882 struct token_config *config)
2884 struct parser p = { 0 };
2885 struct token *tk = NULL;
2887 int shift_since_err = 1;
2890 shift(&p, TK_eof, 0, 1, NULL, states);
2891 while (!accepted && p.tos > 0) {
2892 struct token *err_tk;
2893 struct frame *tos = &p.stack[p.tos-1];
2895 tk = tok_copy(token_next(tokens));
2896 parser_trace(trace, &p,
2897 tk, states, non_term, config->known_count);
2899 if (tk->num == TK_in) {
2901 tos->since_newline = 0;
2902 tos->since_indent = 0;
2903 if (!states[tos->state].starts_line)
2904 tos->newline_permitted = 0;
2907 parser_trace_action(trace, "Record");
2910 if (tk->num == TK_out) {
2911 if (states[tos->state].reduce_size >= 0 &&
2912 states[tos->state].reduce_size <= tos->since_indent)
2914 if (states[tos->state].min_prefix >= tos->since_indent) {
2916 struct frame *in = tos - tos->since_indent;
2918 if (in->indents == 0) {
2919 /* Reassess since_indent and newline_permitted */
2921 in->since_indent = in[-1].since_indent + 1;
2922 in->newline_permitted = in[-1].newline_permitted;
2924 in->since_indent = 0;
2925 in->newline_permitted = 0;
2927 if (states[in->state].starts_line)
2928 in->newline_permitted = 1;
2931 in->since_indent = in[-1].since_indent + 1;
2932 if (states[in->state].starts_line)
2933 in->newline_permitted = 1;
2935 in->newline_permitted = in[-1].newline_permitted;
2940 parser_trace_action(trace, "Cancel");
2943 // fall through to error handling as both SHIFT and REDUCE
2946 if (tk->num == TK_newline) {
2947 if (!tos->newline_permitted) {
2950 parser_trace_action(trace, "Discard");
2953 if (tos->since_newline > 1 &&
2954 states[tos->state].reduce_size >= 0 &&
2955 states[tos->state].reduce_size <= tos->since_newline)
2958 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2959 shift_since_err = 1;
2961 parser_trace_action(trace, "Shift");
2965 if (states[tos->state].reduce_prod >= 0 &&
2966 states[tos->state].newline_only &&
2967 !(tk->num == TK_newline ||
2968 tk->num == TK_eof ||
2969 tk->num == TK_out ||
2970 (tos->indents == 0 && tos->since_newline == 0))) {
2971 /* Anything other than newline or out or eof
2972 * in an error unless we are already at start
2973 * of line, as this production must end at EOL.
2975 } else if (states[tos->state].reduce_prod >= 0) {
2978 const struct state *nextstate = &states[tos->state];
2979 int prod = nextstate->reduce_prod;
2980 int size = nextstate->reduce_size;
2981 int res_size = nextstate->result_size;
2982 short indents, start_of_line;
2984 body = p.asn_stack + (p.tos - size);
2985 res = res_size ? calloc(1, res_size) : NULL;
2986 res_size = do_reduce(prod, body, config, res);
2987 if (res_size != nextstate->result_size)
2990 indents = pop(&p, size, &start_of_line,
2993 if (!shift(&p, nextstate->reduce_sym,
2994 indents, start_of_line,
2996 if (prod != 0) abort();
3000 parser_trace_action(trace, "Reduce");
3003 /* Error. We walk up the stack until we
3004 * find a state which will accept TK_error.
3005 * We then shift in TK_error and see what state
3006 * that takes us too.
3007 * Then we discard input tokens until
3008 * we find one that is acceptable.
3010 parser_trace_action(trace, "ERROR");
3011 short indents = 0, start_of_line;
3013 err_tk = tok_copy(*tk);
3015 shift(&p, TK_error, 0, 0,
3016 err_tk, states) == 0)
3017 // discard this state
3018 indents += pop(&p, 1, &start_of_line, do_free);
3021 // no state accepted TK_error
3024 if (!shift_since_err) {
3025 /* must discard at least one token to avoid
3028 if (tk->num == TK_eof)
3031 tk = tok_copy(token_next(tokens));
3033 shift_since_err = 0;
3034 tos = &p.stack[p.tos-1];
3035 while (!in_lookahead(tk, states, tos->state) &&
3036 tk->num != TK_eof) {
3038 tk = tok_copy(token_next(tokens));
3039 shift_since_err = 1;
3040 if (tk->num == TK_in)
3042 if (tk->num == TK_out) {
3046 // FIXME update since_indent here
3049 tos->indents += indents;
3052 pop(&p, p.tos, NULL, do_free);
3058 ###### exported functions
3059 void *parser_run(struct token_state *tokens,
3060 const struct state states[],
3061 int (*do_reduce)(int, void**, struct token_config*, void*),
3062 void (*do_free)(short, void*),
3063 FILE *trace, const char *non_term[],
3064 struct token_config *config);
3068 Being able to visualize the parser in action can be invaluable when
3069 debugging the parser code, or trying to understand how the parse of a
3070 particular grammar progresses. The stack contains all the important
3071 state, so just printing out the stack every time around the parse loop
3072 can make it possible to see exactly what is happening.
3074 This doesn't explicitly show each SHIFT and REDUCE action. However they
3075 are easily deduced from the change between consecutive lines, and the
3076 details of each state can be found by cross referencing the states list
3077 in the stack with the "report" that parsergen can generate.
3079 For terminal symbols, we just dump the token. For non-terminals we
3080 print the name of the symbol. The look ahead token is reported at the
3081 end inside square brackets.
3083 ###### parser functions
3085 static char *reserved_words[] = {
3086 [TK_error] = "ERROR",
3089 [TK_newline] = "NEWLINE",
3092 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3094 fprintf(trace, "(%d", f->state);
3095 if (states[f->state].starts_line)
3096 fprintf(trace, "s");
3097 if (f->newline_permitted)
3098 fprintf(trace, "n%d", f->since_newline);
3099 fprintf(trace, ") ");
3102 void parser_trace(FILE *trace, struct parser *p,
3103 struct token *tk, const struct state states[],
3104 const char *non_term[], int knowns)
3109 for (i = 0; i < p->tos; i++) {
3110 struct frame *f = &p->stack[i];
3113 if (sym < TK_reserved &&
3114 reserved_words[sym] != NULL)
3115 fputs(reserved_words[sym], trace);
3116 else if (sym < TK_reserved + knowns) {
3117 struct token *t = p->asn_stack[i];
3118 text_dump(trace, t->txt, 20);
3120 fputs(non_term[sym - TK_reserved - knowns],
3123 fprintf(trace, ".%d", f->indents);
3124 if (f->since_newline == 0)
3128 parser_trace_state(trace, f, states);
3130 fprintf(trace, "[");
3131 if (tk->num < TK_reserved &&
3132 reserved_words[tk->num] != NULL)
3133 fputs(reserved_words[tk->num], trace);
3135 text_dump(trace, tk->txt, 20);
3136 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3139 void parser_trace_action(FILE *trace, char *action)
3142 fprintf(trace, " - %s\n", action);
3147 The obvious example for a parser is a calculator.
3149 As `scanner` provides number parsing function using `libgmp` it is not much
3150 work to perform arbitrary rational number calculations.
3152 This calculator takes one expression, or an equality test, per line.
3153 The results are printed and if any equality test fails, the program
3154 exits with an error.
3156 ###### File: parsergen.mk
3157 calc.c calc.h : parsergen parsergen.mdc
3158 ./parsergen --tag calc -o calc parsergen.mdc
3159 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3160 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3162 ./calc parsergen.mdc
3167 #include "parse_number.h"
3168 // what do we use for a demo-grammar? A calculator of course.
3180 #include <sys/mman.h>
3186 #include "scanner.h"
3191 static void free_number(struct number *n)
3197 static int text_is(struct text t, char *s)
3199 return (strlen(s) == t.len &&
3200 strncmp(s, t.txt, t.len) == 0);
3203 int main(int argc, char *argv[])
3205 int fd = open(argv[1], O_RDONLY);
3206 int len = lseek(fd, 0, 2);
3207 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3208 struct section *table = code_extract(file, file+len, NULL);
3210 struct token_config config = {
3211 .ignored = (1 << TK_line_comment)
3214 .number_chars = ".,_+-",
3218 for (s = table; s; s = s->next)
3219 if (text_is(s->section, "example: input"))
3220 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3222 struct section *t = table->next;
3223 code_free(table->code);
3235 Session -> Session Line
3238 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3239 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3240 gmp_printf(" or as a decimal: %Fg\n", fl);
3244 | Expression = Expression NEWLINE ${
3245 if (mpq_equal($1.val, $3.val))
3246 gmp_printf("Both equal %Qd\n", $1.val);
3248 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3253 | NEWLINE ${ printf("Blank line\n"); }$
3254 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3257 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3258 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3259 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3260 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3261 | Expression // Expression ${ {
3264 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3265 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3266 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3267 mpz_tdiv_q(z0, z1, z2);
3268 mpq_set_z($0.val, z0);
3269 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3271 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3272 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3277 3.1415926535 - 355/113
3279 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3281 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1