1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
9 There are several distinct sections.
11 1. `grammar_read` will read a grammar from a literate-code file and
12 store the productions, symbols, and code fragments.
13 2. `grammar_analyse` will take that grammar and build LR parsing
14 states and look-ahead tables.
15 3. `grammar_report` will present the details of the analysis
16 in a readable format and will report any conflicts.
17 4. `parser_generate` will write out C code files with various
18 tables and with the code fragments arranged in useful places.
19 5. `parser_run` is a library function intended to be linked together
20 with the generated parser tables to complete the implementation of
22 6. Finally `calc` is a test grammar for a simple calculator. The
23 `parsergen` program built from the C code in this file can extract
24 that grammar directly from this file and process it.
26 ###### File: parsergen.c
31 ## forward declarations
42 ###### File: libparser.c
49 ###### File: parsergen.mk
52 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
55 ## Reading the grammar
57 The grammar must be stored in a markdown literate code file as parsed by
58 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
59 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
60 present, will be literally copied into the generated `.c` and `.h`
61 files. The third is required and contains the grammar which is
62 tokenised with "[scanner][]". `reduce` can provide variable
63 declarations and common initialization code for all the reduction code
64 fragments in the grammar.
66 If the `--tag` option is given, then any top level header that doesn't
67 start with the tag is ignored, and the tag is stripped from the rest. So
68 `--tag Foo` means that the four sections expected are `Foo: header`,
69 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
70 to extract the sample calculator from this file.
73 [scanner]: scanner.html
79 ###### parser includes
83 The grammar contains production sets, precedence/associativity
84 declarations, general terminal declarations, and data type declarations.
85 These are all parsed with _ad hoc_ parsing as we don't have a parser
88 The precedence and associativity can be set for each production, and
89 can be inherited from symbols. The data type (either structure or a
90 reference to a structure) is potentially defined for each non-terminal
91 and describes what C structure is used to store information about each
95 enum assoc {Left, Right, Non};
96 char *assoc_names[] = {"Left","Right","Non"};
99 struct text struct_name;
102 unsigned short precedence;
106 unsigned short precedence;
115 The strings reported by `mdcode` and `scanner` are `struct text` which
116 have length rather than being nul terminated. To help with printing and
117 comparing we define `text_is` and `prtxt`, which should possibly go in
118 `mdcode`. `scanner` does provide `text_dump` which is useful for
119 strings which might contain control characters.
121 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
122 because that is what we need to detect tags.
125 static int text_is(struct text t, char *s)
127 return (strlen(s) == t.len &&
128 strncmp(s, t.txt, t.len) == 0);
130 static void prtxt(struct text t)
132 printf("%.*s", t.len, t.txt);
135 static int strip_tag(struct text *t, char *tag)
137 int skip = strlen(tag) + 1;
138 if (skip >= t->len ||
139 strncmp(t->txt, tag, skip-1) != 0 ||
140 t->txt[skip-1] != ':')
142 while (skip < t->len && t->txt[skip] == ' ')
151 Productions are comprised primarily of symbols - terminal and
152 non-terminal. We do not make a syntactic distinction between the two,
153 though non-terminals must be identifiers. Non-terminal symbols are
154 those which appear in the head of a production, terminal symbols are
155 those which don't. There are also "virtual" symbols used for precedence
156 marking discussed later, and sometimes we won't know what type a symbol
159 To help with code safety it is possible to declare the terminal symbols.
160 If this is done, then any symbol used in a production that does not
161 appear in the head of any production and is not declared as a terminal
162 is treated as an error.
164 ###### forward declarations
165 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
166 char *symtypes = "UVTN";
169 ###### grammar fields
170 int terminals_declared;
172 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
173 table of known symbols and the resulting parser will report them as
174 `TK_reserved + N`. A small set of identifiers are reserved for the
175 different token types that `scanner` can report.
179 static struct { int num; char *name; } reserved_words[] = {
180 { TK_error, "ERROR" },
181 { TK_number, "NUMBER" },
182 { TK_ident, "IDENTIFIER" },
184 { TK_string, "STRING" },
185 { TK_multi_string, "MULTI_STRING" },
188 { TK_newline, "NEWLINE" },
194 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
195 recognised. The former is automatically expected at the end of the text
196 being parsed. The latter are always ignored by the parser.
198 All of these symbols are stored in a simple symbol table. We use the
199 `struct text` from `mdcode` to store the name and link them together
200 into a sorted list using an insertion sort.
202 We don't have separate `find` and `insert` functions as any symbol we
203 find needs to be remembered. We simply expect `find` to always return a
204 symbol, but its type might be `Unknown`.
213 ###### grammar fields
218 static struct symbol *sym_find(struct grammar *g, struct text s)
220 struct symbol **l = &g->syms;
225 (cmp = text_cmp((*l)->name, s)) < 0)
229 n = calloc(1, sizeof(*n));
238 static void symbols_init(struct grammar *g)
240 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
242 for (i = 0; i < entries; i++) {
245 t.txt = reserved_words[i].name;
246 t.len = strlen(t.txt);
249 s->num = reserved_words[i].num;
253 ### Data types and precedence.
255 Data type specification, precedence specification, and declaration of
256 terminals are all introduced by a dollar sign at the start of the line.
257 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
258 precedence, if it is `TERM` then the line declares terminals without
259 precedence, otherwise it specifies a data type.
261 The data type name is simply stored and applied to the head of all
262 subsequent productions. It must be the name of a structure optionally
263 preceded by an asterisk which means a reference or "pointer". So
264 `$expression` maps to `struct expression` and `$*statement` maps to
265 `struct statement *`.
267 Any productions given before the first data type declaration will have
268 no data type associated with them and can carry no information. In
269 order to allow other non-terminals to have no type, the data type
270 `$void` can be given. This does *not* mean that `struct void` will be
271 used, but rather that no type will be associated with subsequent
274 The precedence line must contain a list of symbols, either terminal
275 symbols or virtual symbols. It can only contain symbols that have not
276 been seen yet, so precedence declaration must precede the productions
279 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
280 carry precedence information but are not included in the grammar. A
281 production can declare that it inherits the precedence of a given
284 This use for `$$` precludes it from being used as a symbol in the
285 described language. Two other symbols: `${` and `}$` are also
288 Each new precedence line introduces a new precedence level and
289 declares how it associates. This level is stored in each symbol
290 listed and may be inherited by any production which uses the symbol. A
291 production inherits from the last symbol which has a precedence.
293 The symbols on the first precedence line have the lowest precedence.
294 Subsequent lines introduce symbols with higher precedence and so bind
297 Note that a declaration line (or "dollar line") can start with either of
298 two different marks: "$" or "$*". The `dollar_line()` function defined
299 here is told which was found in the `isref` argument.
301 ###### grammar fields
302 struct text current_type;
307 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
308 static const char *known[] = { "$$", "${", "}$" };
311 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
313 struct token t = token_next(ts);
319 if (t.num != TK_ident) {
320 err = "type or assoc expected after '$'";
323 if (text_is(t.txt, "LEFT"))
325 else if (text_is(t.txt, "RIGHT"))
327 else if (text_is(t.txt, "NON"))
329 else if (text_is(t.txt, "TERM")) {
331 g->terminals_declared = 1;
333 g->current_type = t.txt;
334 g->type_isref = isref;
335 if (text_is(t.txt, "void"))
336 g->current_type.txt = NULL;
338 if (t.num != TK_newline) {
339 err = "Extra tokens after type name";
346 err = "$* cannot be followed by a precedence";
350 // This is a precedence or TERM line, need some symbols.
354 while (t.num != TK_newline) {
355 enum symtype type = Terminal;
357 if (t.num == TK_virtual) {
360 if (t.num != TK_ident) {
361 err = "$$ must be followed by a word";
365 err = "Virtual symbols not permitted on $TERM line";
368 } else if (t.num != TK_ident &&
370 err = "Illegal token in precedence line";
373 s = sym_find(g, t.txt);
374 if (s->type != Unknown) {
375 err = "Symbols in precedence/TERM line must not already be known.";
380 s->precedence = g->prec_levels;
387 err = "No symbols given on precedence/TERM line";
391 while (t.num != TK_newline && t.num != TK_eof)
398 A production either starts with an identifier which is the head
399 non-terminal, or a vertical bar (`|`) in which case this production
400 uses the same head as the previous one. The identifier must be
401 followed by a `->` mark. All productions for a given non-terminal must
402 be grouped together with the `nonterminal ->` given only once.
404 After this a (possibly empty) sequence of identifiers and marks form
405 the body of the production. A virtual symbol may be given after the
406 body by preceding it with `$$`. If a virtual symbol is given, the
407 precedence of the production is that for the virtual symbol. If none
408 is given, the precedence is inherited from the last symbol in the
409 production which has a precedence specified.
411 After the optional precedence may come the `${` mark. This indicates
412 the start of a code fragment. If present, this must be on the same
413 line as the start of the production.
415 All of the text from the `${` through to the matching `}$` is
416 collected and forms the code-fragment for the production. It must all
417 be in one `code_node` of the literate code. The `}$` must be
418 at the end of a line.
420 Text in the code fragment will undergo substitutions where `$N` or
421 `$<N`, for some numeric `N` (or non-numeric indicator as described
422 later), will be replaced with a variable holding the parse information
423 for the particular symbol in the production. `$0` is the head of the
424 production, `$1` is the first symbol of the body, etc. The type of `$N`
425 for a terminal symbol is `struct token`. For a non-terminal, it is
426 whatever has been declared for that symbol. The `<` may be included and
427 means that the value (usually a reference) is being moved out, so it
428 will not automatically be freed. The effect of using '<' is that the
429 variable is cleared to all-zeros.
431 While building productions we will need to add to an array which needs to
435 static void array_add(void *varray, int *cnt, void *new)
437 void ***array = varray;
440 current = ((*cnt-1) | (step-1))+1;
441 if (*cnt == current) {
444 *array = realloc(*array, current * sizeof(void*));
446 (*array)[*cnt] = new;
450 Collecting the code fragment simply involves reading tokens until we
451 hit the end token `}$`, and noting the character position of the start and
455 static struct text collect_code(struct token_state *state,
460 code.txt = start.txt.txt + start.txt.len;
462 t = token_next(state);
463 while (t.node == start.node &&
464 t.num != TK_close && t.num != TK_error &&
466 if (t.num == TK_close && t.node == start.node)
467 code.len = t.txt.txt - code.txt;
473 Now we have all the bits we need to parse a full production.
478 ###### grammar fields
479 struct production **productions;
480 int production_count;
482 ###### production fields
484 struct symbol **body;
490 int first_production;
493 static char *parse_production(struct grammar *g,
495 struct token_state *state)
497 /* Head has already been parsed. */
500 struct production p, *pp;
502 memset(&p, 0, sizeof(p));
504 tk = token_next(state);
505 while (tk.num == TK_ident || tk.num == TK_mark) {
506 struct symbol *bs = sym_find(g, tk.txt);
507 if (bs->type == Unknown) {
508 if (!g->terminals_declared)
511 if (bs->type == Virtual) {
512 err = "Virtual symbol not permitted in production";
515 if (bs->precedence) {
516 p.precedence = bs->precedence;
519 array_add(&p.body, &p.body_size, bs);
520 tk = token_next(state);
522 if (tk.num == TK_virtual) {
524 tk = token_next(state);
525 if (tk.num != TK_ident) {
526 err = "word required after $$";
529 vs = sym_find(g, tk.txt);
530 if (vs->num == TK_newline)
532 else if (vs->num == TK_out)
534 else if (vs->precedence == 0) {
535 err = "symbol after $$ must have precedence";
538 p.precedence = vs->precedence;
541 tk = token_next(state);
543 if (tk.num == TK_open) {
544 p.code_line = tk.line;
545 p.code = collect_code(state, tk);
546 if (p.code.txt == NULL) {
547 err = "code fragment not closed properly";
550 tk = token_next(state);
553 if (tk.num != TK_newline && tk.num != TK_eof) {
554 err = "stray tokens at end of line";
557 pp = malloc(sizeof(*pp));
559 array_add(&g->productions, &g->production_count, pp);
562 while (tk.num != TK_newline && tk.num != TK_eof)
563 tk = token_next(state);
568 With the ability to parse productions and dollar-lines, we have nearly all
569 that we need to parse a grammar from a `code_node`.
571 The head of the first production will effectively be the `start` symbol
572 of the grammar. However it won't _actually_ be so. Processing the
573 grammar is greatly simplified if the real start symbol only has a single
574 production, and expects `$eof` as the final terminal. So when we find
575 the first explicit production we insert an extra implicit production as
576 production zero which looks like
578 ###### Example: production 0
581 where `START` is the first non-terminal given.
583 ###### create production zero
584 struct production *p = calloc(1,sizeof(*p));
585 struct text start = {"$start",6};
586 struct text eof = {"$eof",4};
587 struct text code = {"$0 = $<1;", 9};
588 p->head = sym_find(g, start);
589 p->head->type = Nonterminal;
590 p->head->struct_name = g->current_type;
591 p->head->isref = g->type_isref;
592 if (g->current_type.txt)
594 array_add(&p->body, &p->body_size, head);
595 array_add(&p->body, &p->body_size, sym_find(g, eof));
596 p->head->first_production = g->production_count;
597 array_add(&g->productions, &g->production_count, p);
599 Now we are ready to read in the grammar. We ignore comments
600 and strings so that the marks which introduce them can be
601 used as terminals in the grammar. We don't ignore numbers
602 even though we don't allow them as that causes the scanner
603 to produce errors that the parser is better positioned to handle.
604 We also ignore indentation, but we expect and use newlines.
606 To allow comments in the grammar, we explicitly check for "//" as the
607 first token on the line and those lines are skipped. "//" can still be
608 used as a terminal anywhere that a terminal is expected.
611 static struct grammar *grammar_read(struct code_node *code)
613 struct token_config conf = {
616 .words_marks = known,
617 .known_count = sizeof(known)/sizeof(known[0]),
619 .ignored = (1 << TK_line_comment)
620 | (1 << TK_block_comment)
623 | (1 << TK_multi_string)
628 struct token_state *state = token_open(code, &conf);
630 struct symbol *head = NULL;
634 g = calloc(1, sizeof(*g));
637 for (tk = token_next(state); tk.num != TK_eof;
638 tk = token_next(state)) {
639 if (tk.num == TK_newline)
641 if (tk.num == TK_ident) {
643 head = sym_find(g, tk.txt);
644 if (head->type == Nonterminal)
645 err = "This non-terminal has already be used.";
646 else if (head->type == Virtual)
647 err = "Virtual symbol not permitted in head of production";
649 head->type = Nonterminal;
650 head->struct_name = g->current_type;
651 head->isref = g->type_isref;
652 if (g->production_count == 0) {
653 ## create production zero
655 head->first_production = g->production_count;
656 tk = token_next(state);
657 if (tk.num == TK_mark &&
658 text_is(tk.txt, "->"))
659 err = parse_production(g, head, state);
661 err = "'->' missing in production";
663 } else if (tk.num == TK_mark
664 && text_is(tk.txt, "|")) {
665 // another production for same non-term
667 err = parse_production(g, head, state);
669 err = "First production must have a head";
670 } else if (tk.num == TK_mark
671 && text_is(tk.txt, "$")) {
672 err = dollar_line(state, g, 0);
673 } else if (tk.num == TK_mark
674 && text_is(tk.txt, "$*")) {
675 err = dollar_line(state, g, 1);
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "//")) {
678 while (tk.num != TK_newline &&
680 tk = token_next(state);
682 err = "Unrecognised token at start of line.";
688 if (g->terminals_declared) {
691 for (s = g->syms; s; s = s->next) {
692 if (s->type != Unknown)
695 fprintf(stderr, "Token %.*s not declared\n",
696 s->name.len, s->name.txt);
699 free(g); // FIXME free content
705 fprintf(stderr, "Error at line %d: %s\n",
708 free(g); // FIXME free content
712 ## Analysing the grammar
714 The central task in analysing the grammar is to determine a set of
715 states to drive the parsing process. This will require finding various
716 sets of symbols and of "items". Some of these sets will need to attach
717 information to each element in the set, so they are more like maps.
719 Each "item" may have a 'look-ahead' or `LA` set associated with it.
720 Many of these will be the same as each other. To avoid memory wastage,
721 and to simplify some comparisons of sets, these sets will be stored in a
722 list of unique sets, each assigned a number.
724 Once we have the data structures in hand to manage these sets and lists,
725 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
726 then create the item sets which define the various states.
730 Though we don't only store symbols in these sets, they are the main
731 things we store, so they are called symbol sets or "symsets".
733 A symset has a size, an array of shorts, and an optional array of data
734 values, which are also shorts. If the array of data is not present, we
735 store an impossible pointer, as `NULL` is used to indicate that no
736 memory has been allocated yet.
741 unsigned short *syms, *data;
743 #define NO_DATA ((unsigned short *)1)
744 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
745 const struct symset INIT_DATASET = { 0, NULL, NULL };
747 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
748 using an insertion sort. We don't explicitly record the amount of
749 allocated space as it can be derived directly from the current `cnt`
750 using `((cnt - 1) | 7) + 1`.
753 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
756 int current = ((s->cnt-1) | 7) + 1;
757 if (current == s->cnt) {
759 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
760 if (s->data != NO_DATA)
761 s->data = realloc(s->data, sizeof(*s->data) * current);
764 while (i > 0 && s->syms[i-1] > key) {
765 s->syms[i] = s->syms[i-1];
766 if (s->data != NO_DATA)
767 s->data[i] = s->data[i-1];
771 if (s->data != NO_DATA)
776 Finding a symbol (or item) in a `symset` uses a simple binary search.
777 We return the index where the value was found (so data can be accessed),
778 or `-1` to indicate failure.
780 static int symset_find(struct symset *ss, unsigned short key)
787 while (lo + 1 < hi) {
788 int mid = (lo + hi) / 2;
789 if (ss->syms[mid] <= key)
794 if (ss->syms[lo] == key)
799 We will often want to form the union of two symsets. When we do, we
800 will often want to know if anything changed (as that might mean there is
801 more work to do). So `symset_union` reports whether anything was added
802 to the first set. We use a slow quadratic approach as these sets are
803 rarely large. If profiling shows this to be a problem it can be
806 static int symset_union(struct symset *a, struct symset *b)
810 for (i = 0; i < b->cnt; i++)
811 if (symset_find(a, b->syms[i]) < 0) {
812 unsigned short data = 0;
813 if (b->data != NO_DATA)
815 symset_add(a, b->syms[i], data);
821 And of course we must be able to free a symset.
823 static void symset_free(struct symset ss)
826 if (ss.data != NO_DATA)
832 Some symsets are simply stored somewhere appropriate in the `grammar`
833 data structure, others need a bit of indirection. This applies
834 particularly to "LA" sets. These will be paired with "items" in an
835 "itemset". As itemsets will be stored in a symset, the "LA" set needs
836 to be stored in the `data` field, so we need a mapping from a small
837 (short) number to an LA `symset`.
839 As mentioned earlier this involves creating a list of unique symsets.
841 For now, we just use a linear list sorted by insertion. A skiplist
842 would be more efficient and may be added later.
849 struct setlist *next;
852 ###### grammar fields
853 struct setlist *sets;
858 static int ss_cmp(struct symset a, struct symset b)
861 int diff = a.cnt - b.cnt;
864 for (i = 0; i < a.cnt; i++) {
865 diff = (int)a.syms[i] - (int)b.syms[i];
872 static int save_set(struct grammar *g, struct symset ss)
874 struct setlist **sl = &g->sets;
878 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
885 s = malloc(sizeof(*s));
894 Finding a set by number is currently performed by a simple linear
895 search. If this turns out to hurt performance, we can store the sets in
896 a dynamic array like the productions.
898 static struct symset set_find(struct grammar *g, int num)
900 struct setlist *sl = g->sets;
901 while (sl && sl->num != num)
906 ### Setting `nullable`
908 We set `nullable` on the head symbol for any production for which all
909 the body symbols (if any) are nullable. As this is a recursive
910 definition, any change in the `nullable` setting means that we need to
911 re-evaluate where it needs to be set.
913 We simply loop around performing the same calculations until no more
920 static void set_nullable(struct grammar *g)
923 while (check_again) {
926 for (p = 0; p < g->production_count; p++) {
927 struct production *pr = g->productions[p];
930 if (pr->head->nullable)
932 for (s = 0; s < pr->body_size; s++)
933 if (! pr->body[s]->nullable)
935 if (s == pr->body_size) {
936 pr->head->nullable = 1;
943 ### Setting `line_like`
945 In order to be able to ignore newline tokens when not relevant, but
946 still include them in the parse when needed, we will need to know
947 which states can start a "line-like" section of code. We ignore
948 newlines when there is an indent since the most recent start of a
951 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
952 If a symbol cannot derive a NEWLINE, then it is only part of a line -
953 so is word-like. If it can derive a NEWLINE, then we consider it to
956 Clearly the `TK_newline` token can derive a NEWLINE. Any symbol which
957 is the head of a production that contains a line_like symbol is also a
958 line-like symbol. We use a new field `line_like` to record this
959 attribute of symbols, and compute it in a repetitive manner similar to
966 static void set_line_like(struct grammar *g)
969 g->symtab[TK_newline]->line_like = 1;
970 while (check_again) {
973 for (p = 0; p < g->production_count; p++) {
974 struct production *pr = g->productions[p];
977 if (pr->head->line_like)
980 for (s = 0 ; s < pr->body_size; s++) {
981 if (pr->body[s]->line_like) {
982 pr->head->line_like = 1;
991 ### Building the `first` sets
993 When calculating what can follow a particular non-terminal, we will need
994 to know what the "first" terminal in any subsequent non-terminal might
995 be. So we calculate the `first` set for every non-terminal and store
996 them in an array. We don't bother recording the "first" set for
997 terminals as they are trivial.
999 As the "first" for one symbol might depend on the "first" of another, we
1000 repeat the calculations until no changes happen, just like with
1001 `set_nullable`. This makes use of the fact that `symset_union` reports
1002 if any change happens.
1004 The core of this, which finds the "first" of part of a production body,
1005 will be reused for computing the "follow" sets, so we split that out
1006 into a separate function, `add_first`, which also reports if it got all
1007 the way to the end of the production without finding a non-nullable
1010 ###### grammar fields
1011 struct symset *first;
1015 static int add_first(struct production *pr, int start,
1016 struct symset *target, struct grammar *g,
1021 for (s = start; s < pr->body_size; s++) {
1022 struct symbol *bs = pr->body[s];
1023 if (bs->type == Terminal) {
1024 if (symset_find(target, bs->num) < 0) {
1025 symset_add(target, bs->num, 0);
1029 } else if (symset_union(target, &g->first[bs->num]))
1035 *to_end = (s == pr->body_size);
1039 static void build_first(struct grammar *g)
1041 int check_again = 1;
1043 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1044 for (s = 0; s < g->num_syms; s++)
1045 g->first[s] = INIT_SYMSET;
1047 while (check_again) {
1050 for (p = 0; p < g->production_count; p++) {
1051 struct production *pr = g->productions[p];
1052 struct symset *head = &g->first[pr->head->num];
1054 if (add_first(pr, 0, head, g, NULL))
1060 ### Building the `follow` sets.
1062 There are two different situations when we will want to generate
1063 "follow" sets. If we are doing an SLR analysis, we want to generate a
1064 single "follow" set for each non-terminal in the grammar. That is what
1065 is happening here. If we are doing an LALR or LR analysis we will want
1066 to generate a separate "LA" set for each item. We do that later in
1069 There are two parts to generating a "follow" set. Firstly we look at
1070 every place that any non-terminal appears in the body of any production,
1071 and we find the set of possible "first" symbols after there. This is
1072 done using `add_first` above and only needs to be done once as the
1073 "first" sets are now stable and will not change.
1075 ###### grammar fields
1076 struct symset *follow;
1080 for (p = 0; p < g->production_count; p++) {
1081 struct production *pr = g->productions[p];
1084 for (b = 0; b < pr->body_size - 1; b++) {
1085 struct symbol *bs = pr->body[b];
1086 if (bs->type == Terminal)
1088 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1092 The second part is to add the "follow" set of the head of a production
1093 to the "follow" sets of the final symbol in the production, and any
1094 other symbol which is followed only by `nullable` symbols. As this
1095 depends on "follow" itself we need to repeatedly perform the process
1096 until no further changes happen.
1098 Rather than 2 nested loops, one that repeats the whole process until
1099 there is no change, and one that iterates over the set of productions, we
1100 combine these two functions into a single loop.
1104 for (check_again = 0, p = 0;
1105 p < g->production_count;
1106 p < g->production_count-1
1107 ? p++ : check_again ? (check_again = 0, p = 0)
1109 struct production *pr = g->productions[p];
1112 for (b = pr->body_size - 1; b >= 0; b--) {
1113 struct symbol *bs = pr->body[b];
1114 if (bs->type == Terminal)
1116 if (symset_union(&g->follow[bs->num],
1117 &g->follow[pr->head->num]))
1124 We now just need to create and initialise the `follow` list to get a
1128 static void build_follow(struct grammar *g)
1130 int s, check_again, p;
1131 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1132 for (s = 0; s < g->num_syms; s++)
1133 g->follow[s] = INIT_SYMSET;
1137 ### Building itemsets and states
1139 There are three different levels of detail that can be chosen for
1140 building the itemsets and states for the LR grammar. They are:
1142 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1143 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1145 2. LALR(1) where we build look-ahead sets with each item and merge
1146 the LA sets when we find two paths to the same "kernel" set of items.
1147 3. LR(1) where different look-ahead for any item in the set means
1148 a different item set must be created.
1150 ###### forward declarations
1151 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1153 We need to be able to look through existing states to see if a newly
1154 generated state already exists. For now we use a simple sorted linked
1157 An item is a pair of numbers: the production number and the position of
1158 "DOT", which is an index into the body of the production. As the
1159 numbers are not enormous we can combine them into a single "short" and
1160 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1161 production number (so 2000 productions with maximum of 31 symbols in the
1164 Comparing the itemsets will be a little different to comparing symsets
1165 as we want to do the lookup after generating the "kernel" of an
1166 itemset, so we need to ignore the offset=zero items which are added during
1169 To facilitate this, we modify the "DOT" number so that "0" sorts to
1170 the end of the list in the symset, and then only compare items before
1174 static inline unsigned short item_num(int production, int dot)
1176 return production | ((31-dot) << 11);
1178 static inline int item_prod(unsigned short item)
1180 return item & 0x7ff;
1182 static inline int item_dot(unsigned short item)
1184 return (31-(item >> 11)) & 0x1f;
1187 For LR(1) analysis we need to compare not just the itemset in a state
1188 but also the LA sets. As we assign each unique LA set a number, we
1189 can just compare the symset and the data values together.
1192 static int itemset_cmp(struct symset a, struct symset b,
1193 enum grammar_type type)
1199 i < a.cnt && i < b.cnt &&
1200 item_dot(a.syms[i]) > 0 &&
1201 item_dot(b.syms[i]) > 0;
1203 int diff = a.syms[i] - b.syms[i];
1207 diff = a.data[i] - b.data[i];
1212 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1216 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1222 if (type < LR1 || av == -1)
1225 a.data[i] - b.data[i];
1228 It will be helpful to know if an itemset has been "completed" or not,
1229 particularly for LALR where itemsets get merged, at which point they
1230 need to be considered for completion again. So a `completed` flag is
1233 For correct handling of `TK_newline` when parsing, we will need to
1234 know which states (itemsets) can occur at the start of a line, so we
1235 will record a `starts_line` flag too whenever DOT is at the start of a
1238 Finally, for handling `TK_out` we need to know whether productions in the
1239 current state started *before* the most recent indent. A state
1240 doesn't usually keep details of individual productions, so we need to
1241 add one extra detail. `min_prefix` is the smallest non-zero number of
1242 symbols *before* DOT in any production in an itemset. This will allow
1243 us to determine if the most recent indent is sufficiently recent
1244 to cancel it against a `TK_out`. If it was seen longer ago than the
1245 `min_prefix`, and if the current state cannot be reduced, then the
1246 indented section must have ended in the middle of a syntactic unit, so
1247 an error must be signaled.
1249 And now we can build the list of itemsets. The lookup routine returns
1250 both a success flag and a pointer to where in the list an insert should
1251 happen, so we don't need to search a second time.
1255 struct itemset *next;
1257 struct symset items;
1258 struct symset go_to;
1260 unsigned short precedence;
1266 ###### grammar fields
1267 struct itemset *items;
1271 static int itemset_find(struct grammar *g, struct itemset ***where,
1272 struct symset kernel, enum grammar_type type)
1274 struct itemset **ip;
1276 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1277 struct itemset *i = *ip;
1279 diff = itemset_cmp(i->items, kernel, type);
1292 Adding an itemset may require merging the LA sets if LALR analysis is
1293 happening. If any new LA set adds any symbols that weren't in the old
1294 LA set, we clear the `completed` flag so that the dependants of this
1295 itemset will be recalculated and their LA sets updated.
1297 `add_itemset` must consume the symsets it is passed, either by adding
1298 them to a data structure, or freeing them.
1300 static int add_itemset(struct grammar *g, struct symset ss,
1301 enum assoc assoc, unsigned short precedence,
1302 enum grammar_type type)
1304 struct itemset **where, *is;
1306 int found = itemset_find(g, &where, ss, type);
1308 is = calloc(1, sizeof(*is));
1309 is->state = g->states;
1313 is->precedence = precedence;
1315 is->go_to = INIT_DATASET;
1324 for (i = 0; i < ss.cnt; i++) {
1325 struct symset temp = INIT_SYMSET, s;
1326 if (ss.data[i] == is->items.data[i])
1328 s = set_find(g, is->items.data[i]);
1329 symset_union(&temp, &s);
1330 s = set_find(g, ss.data[i]);
1331 if (symset_union(&temp, &s)) {
1332 is->items.data[i] = save_set(g, temp);
1343 To build all the itemsets, we first insert the initial itemset made from
1344 production zero, complete each itemset, and then generate new itemsets
1345 from old until no new ones can be made.
1347 Completing an itemset means finding all the items where "DOT" is
1348 followed by a nonterminal and adding "DOT=0" items for every production
1349 from that non-terminal - providing each item hasn't already been added.
1350 When we add such an item it might get inserted before the current
1351 one, so to make sure we process it we reset the loop counter to the
1354 If LA sets are needed, the LA set for each new item is found using
1355 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1356 may be supplemented by the LA set for the item which produced the new
1359 We also collect a set of all symbols which follow "DOT" (in `done`) as
1360 this is used in the next stage. If any of these symbols are flagged as
1361 `line_like`, then this state must be a `starts_line` state so now is a
1362 good time to record that.
1364 When itemsets are created we assign a precedence to the itemset from the
1365 complete item, if there is one. We ignore the possibility of there
1366 being two and don't (currently) handle precedence in such grammars.
1367 When completing a grammar we ignore any item where DOT is followed by a
1368 terminal with a precedence lower than that for the itemset. Unless the
1369 terminal has right associativity, we also ignore items where the
1370 terminal has the same precedence. The result is that unwanted items are
1371 still in the itemset, but the terminal doesn't get into the go to set,
1372 so the item is ineffective.
1374 ###### complete itemset
1375 for (i = 0; i < is->items.cnt; i++) {
1376 int p = item_prod(is->items.syms[i]);
1377 int bs = item_dot(is->items.syms[i]);
1378 struct production *pr = g->productions[p];
1381 struct symset LA = INIT_SYMSET;
1382 unsigned short sn = 0;
1383 struct symset LAnl = INIT_SYMSET;
1384 unsigned short snnl = 0;
1386 if (is->min_prefix == 0 ||
1387 (bs > 0 && bs < is->min_prefix))
1388 is->min_prefix = bs;
1389 if (bs == pr->body_size)
1392 if (s->precedence && is->precedence &&
1393 is->precedence > s->precedence)
1394 /* This terminal has a low precedence and
1395 * shouldn't be shifted
1398 if (s->precedence && is->precedence &&
1399 is->precedence == s->precedence && s->assoc != Right)
1400 /* This terminal has a matching precedence and is
1401 * not Right-associative, so we mustn't shift it.
1404 if (symset_find(&done, s->num) < 0) {
1405 symset_add(&done, s->num, 0);
1407 if (s->type != Nonterminal)
1410 is->starts_line = 1;
1415 add_first(pr, bs+1, &LA, g, &to_end);
1417 struct symset ss = set_find(g, is->items.data[i]);
1418 symset_union(&LA, &ss);
1420 sn = save_set(g, LA);
1421 LA = set_find(g, sn);
1422 if (symset_find(&LA, TK_newline))
1423 symset_add(&LAnl, TK_newline, 0);
1424 snnl = save_set(g, LAnl);
1425 LAnl = set_find(g, snnl);
1428 /* Add productions for this symbol */
1429 for (p2 = s->first_production;
1430 p2 < g->production_count &&
1431 g->productions[p2]->head == s;
1433 int itm = item_num(p2, 0);
1434 int pos = symset_find(&is->items, itm);
1436 if (g->productions[p2]->line_like)
1437 symset_add(&is->items, itm, snnl);
1439 symset_add(&is->items, itm, sn);
1440 /* Will have re-ordered, so start
1441 * from beginning again */
1443 } else if (type >= LALR) {
1444 struct symset ss = set_find(g, is->items.data[pos]);
1445 struct symset tmp = INIT_SYMSET;
1446 struct symset *la = &LA;
1448 if (g->productions[p2]->line_like)
1450 symset_union(&tmp, &ss);
1451 if (symset_union(&tmp, la)) {
1452 is->items.data[pos] = save_set(g, tmp);
1460 For each symbol we found (and placed in `done`) we collect all the items
1461 for which this symbol is next, and create a new itemset with "DOT"
1462 advanced over the symbol. This is then added to the collection of
1463 itemsets (or merged with a pre-existing itemset).
1465 ###### derive itemsets
1466 // Now we have a completed itemset, so we need to
1467 // compute all the 'next' itemsets and create them
1468 // if they don't exist.
1469 for (i = 0; i < done.cnt; i++) {
1471 unsigned short state;
1472 struct symbol *sym = g->symtab[done.syms[i]];
1473 enum assoc assoc = Non;
1474 unsigned short precedence = 0;
1475 struct symset newitemset = INIT_SYMSET;
1477 newitemset = INIT_DATASET;
1479 for (j = 0; j < is->items.cnt; j++) {
1480 int itm = is->items.syms[j];
1481 int p = item_prod(itm);
1482 int bp = item_dot(itm);
1483 struct production *pr = g->productions[p];
1484 unsigned short la = 0;
1487 if (bp == pr->body_size)
1489 if (pr->body[bp] != sym)
1492 la = is->items.data[j];
1493 pos = symset_find(&newitemset, pr->head->num);
1494 if (bp + 1 == pr->body_size &&
1495 pr->precedence > 0 &&
1496 pr->precedence > precedence) {
1497 // new itemset is reducible and has a precedence.
1498 precedence = pr->precedence;
1502 symset_add(&newitemset, item_num(p, bp+1), la);
1503 else if (type >= LALR) {
1504 // Need to merge la set.
1505 int la2 = newitemset.data[pos];
1507 struct symset ss = set_find(g, la2);
1508 struct symset LA = INIT_SYMSET;
1509 symset_union(&LA, &ss);
1510 ss = set_find(g, la);
1511 if (symset_union(&LA, &ss))
1512 newitemset.data[pos] = save_set(g, LA);
1518 state = add_itemset(g, newitemset, assoc, precedence, type);
1519 if (symset_find(&is->go_to, done.syms[i]) < 0)
1520 symset_add(&is->go_to, done.syms[i], state);
1523 All that is left is to create the initial itemset from production zero, and
1524 with `TK_eof` as the LA set.
1527 static void build_itemsets(struct grammar *g, enum grammar_type type)
1529 struct symset first = INIT_SYMSET;
1532 unsigned short la = 0;
1534 // LA set just has eof
1535 struct symset eof = INIT_SYMSET;
1536 symset_add(&eof, TK_eof, 0);
1537 la = save_set(g, eof);
1538 first = INIT_DATASET;
1540 // production 0, offset 0 (with no data)
1541 symset_add(&first, item_num(0, 0), la);
1542 add_itemset(g, first, Non, 0, type);
1543 for (check_again = 0, is = g->items;
1545 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1547 struct symset done = INIT_SYMSET;
1558 ### Completing the analysis.
1560 The exact process of analysis depends on which level was requested. For
1561 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1562 `LR1` we need first, but not follow. For `SLR` we need both.
1564 We don't build the "action" tables that you might expect as the parser
1565 doesn't use them. They will be calculated to some extent if needed for
1568 Once we have built everything we allocate arrays for the two lists:
1569 symbols and itemsets. This allows more efficient access during
1570 reporting. The symbols are grouped as terminals and then non-terminals,
1571 and we record the changeover point in `first_nonterm`.
1573 ###### grammar fields
1574 struct symbol **symtab;
1575 struct itemset **statetab;
1578 ###### grammar_analyse
1580 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1584 int snum = TK_reserved;
1585 for (s = g->syms; s; s = s->next)
1586 if (s->num < 0 && s->type == Terminal) {
1590 g->first_nonterm = snum;
1591 for (s = g->syms; s; s = s->next)
1592 if (s->num < 0 && s->type != Virtual) {
1596 for (s = g->syms; s; s = s->next)
1602 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1603 for (s = g->syms; s; s = s->next)
1604 g->symtab[s->num] = s;
1614 build_itemsets(g, type);
1616 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1617 for (is = g->items; is ; is = is->next)
1618 g->statetab[is->state] = is;
1621 ## Reporting on the Grammar
1623 The purpose of the report is to give the grammar developer insight into
1624 how the grammar parser will work. It is basically a structured dump of
1625 all the tables that have been generated, plus a description of any conflicts.
1627 ###### grammar_report
1628 static int grammar_report(struct grammar *g, enum grammar_type type)
1634 return report_conflicts(g, type);
1637 Firstly we have the complete list of symbols, together with the
1638 "FIRST" set if that was generated. We add a mark to each symbol to
1639 show if it is considered to be "line-like" (`<`), or if it is nullable (`.`).
1643 static void report_symbols(struct grammar *g)
1647 printf("SYMBOLS + FIRST:\n");
1649 printf("SYMBOLS:\n");
1651 for (n = 0; n < g->num_syms; n++) {
1652 struct symbol *s = g->symtab[n];
1656 printf(" %c%c%3d%c: ",
1657 s->nullable ? '.':' ',
1658 s->line_like ? '<':' ',
1659 s->num, symtypes[s->type]);
1662 printf(" (%d%s)", s->precedence,
1663 assoc_names[s->assoc]);
1665 if (g->first && s->type == Nonterminal) {
1668 for (j = 0; j < g->first[n].cnt; j++) {
1671 prtxt(g->symtab[g->first[n].syms[j]]->name);
1678 Then we have the follow sets if they were computed.
1680 static void report_follow(struct grammar *g)
1683 printf("FOLLOW:\n");
1684 for (n = 0; n < g->num_syms; n++)
1685 if (g->follow[n].cnt) {
1689 prtxt(g->symtab[n]->name);
1690 for (j = 0; j < g->follow[n].cnt; j++) {
1693 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1699 And finally the item sets. These include the GO TO tables and, for
1700 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1701 it up a bit. First the items, with production number and associativity.
1703 static void report_item(struct grammar *g, int itm)
1705 int p = item_prod(itm);
1706 int dot = item_dot(itm);
1707 struct production *pr = g->productions[p];
1711 prtxt(pr->head->name);
1713 for (i = 0; i < pr->body_size; i++) {
1714 printf(" %s", (dot == i ? ". ": ""));
1715 prtxt(pr->body[i]->name);
1717 if (dot == pr->body_size)
1720 if (pr->precedence && dot == pr->body_size)
1721 printf(" (%d%s)", pr->precedence,
1722 assoc_names[pr->assoc]);
1723 if (dot < pr->body_size &&
1724 pr->body[dot]->precedence) {
1725 struct symbol *s = pr->body[dot];
1726 printf(" [%d%s]", s->precedence,
1727 assoc_names[s->assoc]);
1729 if (pr->line_like == 1)
1730 printf(" $$NEWLINE");
1731 else if (pr->line_like)
1736 The LA sets which are (possibly) reported with each item:
1738 static void report_la(struct grammar *g, int lanum)
1740 struct symset la = set_find(g, lanum);
1744 printf(" LOOK AHEAD(%d)", lanum);
1745 for (i = 0; i < la.cnt; i++) {
1748 prtxt(g->symtab[la.syms[i]]->name);
1753 Then the go to sets:
1755 static void report_goto(struct grammar *g, struct symset gt)
1760 for (i = 0; i < gt.cnt; i++) {
1762 prtxt(g->symtab[gt.syms[i]]->name);
1763 printf(" -> %d\n", gt.data[i]);
1767 Now we can report all the item sets complete with items, LA sets, and GO TO.
1769 static void report_itemsets(struct grammar *g)
1772 printf("ITEM SETS(%d)\n", g->states);
1773 for (s = 0; s < g->states; s++) {
1775 struct itemset *is = g->statetab[s];
1776 printf(" Itemset %d:%s min prefix=%d",
1777 s, is->starts_line?" (startsline)":"", is->min_prefix);
1779 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1781 for (j = 0; j < is->items.cnt; j++) {
1782 report_item(g, is->items.syms[j]);
1783 if (is->items.data != NO_DATA)
1784 report_la(g, is->items.data[j]);
1786 report_goto(g, is->go_to);
1790 ### Reporting conflicts
1792 Conflict detection varies a lot among different analysis levels. However
1793 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1794 LR1 are also similar as they have FOLLOW or LA sets.
1798 ## conflict functions
1800 static int report_conflicts(struct grammar *g, enum grammar_type type)
1803 printf("Conflicts:\n");
1805 cnt = conflicts_lr0(g, type);
1807 cnt = conflicts_slr(g, type);
1809 printf(" - no conflicts\n");
1813 LR0 conflicts are any state which has both a reducible item and
1814 a shiftable item, or two reducible items.
1816 LR05 conflicts only occur if two possible reductions exist,
1817 as shifts always over-ride reductions.
1819 ###### conflict functions
1820 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1824 for (i = 0; i < g->states; i++) {
1825 struct itemset *is = g->statetab[i];
1826 int last_reduce = -1;
1827 int prev_reduce = -1;
1828 int last_shift = -1;
1832 for (j = 0; j < is->items.cnt; j++) {
1833 int itm = is->items.syms[j];
1834 int p = item_prod(itm);
1835 int bp = item_dot(itm);
1836 struct production *pr = g->productions[p];
1838 if (bp == pr->body_size) {
1839 prev_reduce = last_reduce;
1843 if (pr->body[bp]->type == Terminal)
1846 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1847 printf(" State %d has both SHIFT and REDUCE:\n", i);
1848 report_item(g, is->items.syms[last_shift]);
1849 report_item(g, is->items.syms[last_reduce]);
1852 if (prev_reduce >= 0) {
1853 printf(" State %d has 2 (or more) reducible items\n", i);
1854 report_item(g, is->items.syms[prev_reduce]);
1855 report_item(g, is->items.syms[last_reduce]);
1862 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1863 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1864 in the source of the look ahead set.
1866 We build two datasets to reflect the "action" table: one which maps
1867 terminals to items where that terminal could be shifted and another
1868 which maps terminals to items that could be reduced when the terminal
1869 is in look-ahead. We report when we get conflicts between the two.
1871 As a special case, if we find a SHIFT/REDUCE conflict on the NEWLINE
1872 terminal, we ignore it. NEWLINE is handled specially with its own
1873 rules for when to shift and when to reduce. Conflicts are expected,
1874 but handled internally.
1876 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1881 for (i = 0; i < g->states; i++) {
1882 struct itemset *is = g->statetab[i];
1883 struct symset shifts = INIT_DATASET;
1884 struct symset reduce = INIT_DATASET;
1888 /* First collect the shifts */
1889 for (j = 0; j < is->items.cnt; j++) {
1890 unsigned short itm = is->items.syms[j];
1891 int p = item_prod(itm);
1892 int bp = item_dot(itm);
1893 struct production *pr = g->productions[p];
1896 if (bp >= pr->body_size ||
1897 pr->body[bp]->type != Terminal)
1902 if (s->precedence && is->precedence)
1903 /* Precedence resolves this, so no conflict */
1906 if (symset_find(&shifts, s->num) < 0)
1907 symset_add(&shifts, s->num, itm);
1909 /* Now look for reductions and conflicts */
1910 for (j = 0; j < is->items.cnt; j++) {
1911 unsigned short itm = is->items.syms[j];
1912 int p = item_prod(itm);
1913 int bp = item_dot(itm);
1914 struct production *pr = g->productions[p];
1916 if (bp < pr->body_size)
1921 la = g->follow[pr->head->num];
1923 la = set_find(g, is->items.data[j]);
1925 for (k = 0; k < la.cnt; k++) {
1926 int pos = symset_find(&shifts, la.syms[k]);
1927 if (pos >= 0 && la.syms[k] != TK_newline) {
1928 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1930 prtxt(g->symtab[la.syms[k]]->name);
1932 report_item(g, shifts.data[pos]);
1933 report_item(g, itm);
1935 pos = symset_find(&reduce, la.syms[k]);
1937 symset_add(&reduce, la.syms[k], itm);
1940 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1941 prtxt(g->symtab[la.syms[k]]->name);
1943 report_item(g, itm);
1944 report_item(g, reduce.data[pos]);
1948 symset_free(shifts);
1949 symset_free(reduce);
1955 ## Generating the parser
1957 The exported part of the parser is the `parse_XX` function, where the name
1958 `XX` is based on the name of the parser files.
1960 This takes a `code_node`, a partially initialized `token_config`, and an
1961 optional `FILE` to send tracing to. The `token_config` gets the list of
1962 known words added and then is used with the `code_node` to initialize the
1965 `parse_XX` then calls the library function `parser_run` to actually complete
1966 the parse. This needs the `states` table and functions to call the various
1967 pieces of code provided in the grammar file, so they are generated first.
1969 ###### parser_generate
1971 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1972 struct code_node *pre_reduce)
1978 gen_reduce(f, g, file, pre_reduce);
1981 fprintf(f, "#line 0 \"gen_parser\"\n");
1982 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1985 fprintf(f, "\tstruct token_state *tokens;\n");
1986 fprintf(f, "\tconfig->words_marks = known;\n");
1987 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1988 fprintf(f, "\ttokens = token_open(code, config);\n");
1989 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1990 fprintf(f, "\ttoken_close(tokens);\n");
1991 fprintf(f, "\treturn rv;\n");
1992 fprintf(f, "}\n\n");
1995 ### Known words table
1997 The known words table is simply an array of terminal symbols.
1998 The table of nonterminals used for tracing is a similar array.
2002 static void gen_known(FILE *f, struct grammar *g)
2005 fprintf(f, "#line 0 \"gen_known\"\n");
2006 fprintf(f, "static const char *known[] = {\n");
2007 for (i = TK_reserved;
2008 i < g->num_syms && g->symtab[i]->type == Terminal;
2010 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2011 g->symtab[i]->name.txt);
2012 fprintf(f, "};\n\n");
2015 static void gen_non_term(FILE *f, struct grammar *g)
2018 fprintf(f, "#line 0 \"gen_non_term\"\n");
2019 fprintf(f, "static const char *non_term[] = {\n");
2020 for (i = TK_reserved;
2023 if (g->symtab[i]->type == Nonterminal)
2024 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2025 g->symtab[i]->name.txt);
2026 fprintf(f, "};\n\n");
2029 ### States and the goto tables.
2031 For each state we record the goto table and details of the reducible
2032 production if there is one.
2033 Some of the details of the reducible production are stored in the
2034 `do_reduce` function to come later. Here we store the production
2035 number, the body size (useful for stack management), and the resulting
2036 symbol (useful for knowing how to free data later).
2038 The go to table is stored in a simple array of `sym` and corresponding
2041 ###### exported types
2049 const struct lookup * go_to;
2060 static void gen_goto(FILE *f, struct grammar *g)
2063 fprintf(f, "#line 0 \"gen_goto\"\n");
2064 for (i = 0; i < g->states; i++) {
2066 fprintf(f, "static const struct lookup goto_%d[] = {\n",
2068 struct symset gt = g->statetab[i]->go_to;
2069 for (j = 0; j < gt.cnt; j++)
2070 fprintf(f, "\t{ %d, %d },\n",
2071 gt.syms[j], gt.data[j]);
2076 static void gen_states(FILE *f, struct grammar *g)
2079 fprintf(f, "#line 0 \"gen_states\"\n");
2080 fprintf(f, "static const struct state states[] = {\n");
2081 for (i = 0; i < g->states; i++) {
2082 struct itemset *is = g->statetab[i];
2083 int j, prod = -1, prod_len;
2085 for (j = 0; j < is->items.cnt; j++) {
2086 int itm = is->items.syms[j];
2087 int p = item_prod(itm);
2088 int bp = item_dot(itm);
2089 struct production *pr = g->productions[p];
2091 if (bp < pr->body_size)
2093 /* This is what we reduce - choose longest */
2094 if (prod < 0 || prod_len < pr->body_size) {
2096 prod_len = pr->body_size;
2101 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2102 i, is->go_to.cnt, i, prod,
2103 g->productions[prod]->body_size,
2104 g->productions[prod]->head->num,
2106 g->productions[prod]->line_like,
2109 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2110 i, is->go_to.cnt, i,
2111 is->starts_line, is->min_prefix);
2113 fprintf(f, "};\n\n");
2116 ### The `do_reduce` function and the code
2118 When the parser engine decides to reduce a production, it calls
2119 `do_reduce` which runs the code that was included with the production,
2122 This code needs to be able to store data somewhere. Rather than
2123 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2124 buffer and have `do_reduce` return the size to be saved.
2126 In order for the code to access "global" context, we pass in the
2127 "config" pointer that was passed to the parser function. If the `struct
2128 token_config` is embedded in some larger structure, the reducing code
2129 can access the larger structure using pointer manipulation. Performing
2130 that pointer manipulation, and any other common code or variables for
2131 all reduce actions, can be provided in code node called "reduce" which
2132 is passed around in `pre_reduce` which you might have already noticed.
2134 The code fragments require translation when written out. Any `$N` needs
2135 to be converted to a reference either to that buffer (if `$0`) or to the
2136 structure returned by a previous reduction. These pointers need to be
2137 cast to the appropriate type for each access. All this is handled in
2140 `gen_code` also allows symbol references to contain a '`<`' as in
2141 '`$<2`'. This is particularly useful for references (or pointers), but
2142 can be used with structures too. The `<` implies that the value is
2143 being moved out, so the object will not be automatically freed. It is
2144 equivalent to assigning `NULL` to the pointer or filling a structure
2147 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2148 and possibly a number. A number by itself (other than zero) selects a
2149 symbol from the body of the production. A sequence of letters selects
2150 the shortest symbol in the body which contains those letters in the
2151 given order. If a number follows the letters, then a later occurrence
2152 of that symbol is chosen. So "`$AB2`" will refer to the structure
2153 attached to the second occurrence of the shortest symbol which contains
2154 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2155 the number given is too large, then the symbol reference is not
2156 transformed, and will cause an error when the code is compiled.
2160 static int textchr(struct text t, char c, int s)
2163 for (i = s; i < t.len; i++)
2169 static int subseq_match(char *seq, int slen, struct text name)
2173 st = textchr(name, *seq, st);
2183 static int choose_sym(char **namep, int len, struct production *p)
2185 char *name = *namep;
2194 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2200 while (len > 0 && (c >= '0' && c <= '9')) {
2203 n = n * 10 + (c - '0');
2207 if (name == *namep || n > p->body_size)
2213 for (i = 0; i < p->body_size; i++) {
2214 if (!subseq_match(nam, namlen, p->body[i]->name))
2216 if (slen == 0 || p->body[i]->name.len < slen) {
2218 slen = p->body[i]->name.len;
2220 if (s >= 0 && p->body[i] != p->body[s] &&
2221 p->body[i]->name.len == p->body[s]->name.len)
2222 /* not unique, so s cannot be used */
2229 for (i = 0; i < p->body_size; i++)
2230 if (p->body[i] == p->body[s]) {
2235 if (n > 0 || i == p->body_size)
2241 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2244 char *used = calloc(1, p->body_size);
2247 fprintf(f, "\t\t\t");
2248 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2262 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2271 fprintf(f, "(*(struct %.*s*%s)ret)",
2272 p->head->struct_name.len,
2273 p->head->struct_name.txt,
2274 p->head->isref ? "*":"");
2275 else if (p->body[n-1]->type == Terminal)
2276 fprintf(f, "(*(struct token *)body[%d])",
2278 else if (p->body[n-1]->struct_name.txt == NULL)
2279 fprintf(f, "$%d", n);
2281 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2282 p->body[n-1]->struct_name.len,
2283 p->body[n-1]->struct_name.txt,
2284 p->body[n-1]->isref ? "*":"", n-1);
2290 for (i = 0; i < p->body_size; i++) {
2291 if (p->body[i]->struct_name.txt &&
2293 // assume this has been copied out
2294 if (p->body[i]->isref)
2295 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2297 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2305 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2306 struct code_node *code)
2309 fprintf(f, "#line 1 \"gen_reduce\"\n");
2310 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2312 fprintf(f, "\tint ret_size = 0;\n");
2314 code_node_print(f, code, file);
2316 fprintf(f, "#line 4 \"gen_reduce\"\n");
2317 fprintf(f, "\tswitch(prod) {\n");
2318 for (i = 0; i < g->production_count; i++) {
2319 struct production *p = g->productions[i];
2320 fprintf(f, "\tcase %d:\n", i);
2323 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2327 if (p->head->struct_name.txt)
2328 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2329 p->head->struct_name.len,
2330 p->head->struct_name.txt,
2331 p->head->isref ? "*":"");
2333 fprintf(f, "\t\tbreak;\n");
2335 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2340 As each non-terminal can potentially cause a different type of data
2341 structure to be allocated and filled in, we need to be able to free it when
2344 It is particularly important to have fine control over freeing during error
2345 recovery where individual stack frames might need to be freed.
2347 For this, the grammar author is required to define a `free_XX` function for
2348 each structure that is used by a non-terminal. `do_free` will call whichever
2349 is appropriate given a symbol number, and will call `free` (as is
2350 appropriate for tokens) on any terminal symbol.
2354 static void gen_free(FILE *f, struct grammar *g)
2358 fprintf(f, "#line 0 \"gen_free\"\n");
2359 fprintf(f, "static void do_free(short sym, void *asn)\n");
2361 fprintf(f, "\tif (!asn) return;\n");
2362 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2363 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2364 fprintf(f, "\tswitch(sym) {\n");
2366 for (i = 0; i < g->num_syms; i++) {
2367 struct symbol *s = g->symtab[i];
2369 s->type != Nonterminal ||
2370 s->struct_name.txt == NULL)
2373 fprintf(f, "\tcase %d:\n", s->num);
2375 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2377 s->struct_name.txt);
2378 fprintf(f, "\t\tfree(asn);\n");
2380 fprintf(f, "\t\tfree_%.*s(asn);\n",
2382 s->struct_name.txt);
2383 fprintf(f, "\t\tbreak;\n");
2385 fprintf(f, "\t}\n}\n\n");
2388 ## The main routine.
2390 There are three key parts to the "main" function of parsergen: processing
2391 the arguments, loading the grammar file, and dealing with the grammar.
2393 ### Argument processing.
2395 Command line options allow the selection of analysis mode, name of output
2396 file, and whether or not a report should be generated. By default we create
2397 a report only if no code output was requested.
2399 The `parse_XX` function name uses the basename of the output file which
2400 should not have a suffix (`.c`). `.c` is added to the given name for the
2401 code, and `.h` is added for the header (if header text is specified in the
2408 static const struct option long_options[] = {
2409 { "LR0", 0, NULL, '0' },
2410 { "LR05", 0, NULL, '5' },
2411 { "SLR", 0, NULL, 'S' },
2412 { "LALR", 0, NULL, 'L' },
2413 { "LR1", 0, NULL, '1' },
2414 { "tag", 1, NULL, 't' },
2415 { "report", 0, NULL, 'R' },
2416 { "output", 1, NULL, 'o' },
2417 { NULL, 0, NULL, 0 }
2419 const char *options = "05SL1t:Ro:";
2421 ###### process arguments
2423 char *outfile = NULL;
2428 enum grammar_type type = LR05;
2429 while ((opt = getopt_long(argc, argv, options,
2430 long_options, NULL)) != -1) {
2445 outfile = optarg; break;
2447 tag = optarg; break;
2449 fprintf(stderr, "Usage: parsergen ...\n");
2454 infile = argv[optind++];
2456 fprintf(stderr, "No input file given\n");
2459 if (outfile && report == 1)
2462 if (name && strchr(name, '/'))
2463 name = strrchr(name, '/')+1;
2465 if (optind < argc) {
2466 fprintf(stderr, "Excess command line arguments\n");
2470 ### Loading the grammar file
2472 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2475 Once we have extracted the code (with `mdcode`) we expect to find three
2476 or four sections: header, code, grammar, reduce.
2477 Anything else that is not excluded by the `--tag` option is an error.
2479 "header", "code", and "reduce" are optional, though it is hard to build
2480 a working parser without the first two. "grammar" must be provided.
2484 #include <sys/mman.h>
2489 static void pr_err(char *msg)
2492 fprintf(stderr, "%s\n", msg);
2496 struct section *table;
2500 fd = open(infile, O_RDONLY);
2502 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2503 infile, strerror(errno));
2506 len = lseek(fd, 0, 2);
2507 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2508 table = code_extract(file, file+len, pr_err);
2510 struct code_node *hdr = NULL;
2511 struct code_node *code = NULL;
2512 struct code_node *gram = NULL;
2513 struct code_node *pre_reduce = NULL;
2514 for (s = table; s; s = s->next) {
2515 struct text sec = s->section;
2516 if (tag && !strip_tag(&sec, tag))
2518 if (text_is(sec, "header"))
2520 else if (text_is(sec, "code"))
2522 else if (text_is(sec, "grammar"))
2524 else if (text_is(sec, "reduce"))
2525 pre_reduce = s->code;
2527 fprintf(stderr, "Unknown content section: %.*s\n",
2528 s->section.len, s->section.txt);
2533 ### Processing the input
2535 As we need to append an extension to a filename and then open it for
2536 writing, and we need to do this twice, it helps to have a separate function.
2540 static FILE *open_ext(char *base, char *ext)
2542 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2544 strcat(strcpy(fn, base), ext);
2550 If we can read the grammar, then we analyse and optionally report on it. If
2551 the report finds conflicts we will exit with an error status.
2553 ###### process input
2554 struct grammar *g = NULL;
2556 fprintf(stderr, "No grammar section provided\n");
2559 g = grammar_read(gram);
2561 fprintf(stderr, "Failure to parse grammar\n");
2566 grammar_analyse(g, type);
2568 if (grammar_report(g, type))
2572 If a "headers" section is defined, we write it out and include a declaration
2573 for the `parse_XX` function so it can be used from separate code.
2575 if (rv == 0 && hdr && outfile) {
2576 FILE *f = open_ext(outfile, ".h");
2578 code_node_print(f, hdr, infile);
2579 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2583 fprintf(stderr, "Cannot create %s.h\n",
2589 And if all goes well and an output file was provided, we create the `.c`
2590 file with the code section (if any) and the parser tables and function.
	/* Write the .c file: the optional "code" section followed by the
	 * generated tables and parse function. */
2592	if (rv == 0 && outfile) {
2593	FILE *f = open_ext(outfile, ".c");
2596	code_node_print(f, code, infile);
2597	gen_parser(f, g, infile, name, pre_reduce);
2600	fprintf(stderr, "Cannot create %s.c\n",
2606 And that about wraps it up. We need to set the locale so that UTF-8 is
2607 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2609 ###### File: parsergen.mk
2610 parsergen : parsergen.o libscanner.o libmdcode.o
2611 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2618	int main(int argc, char *argv[])
	/* Use the environment locale so UTF-8 input is handled properly. */
2623	setlocale(LC_ALL,"");
2625	## process arguments
2632 ## The SHIFT/REDUCE parser
2634 Having analysed the grammar and generated all the tables, we only need
2635 the shift/reduce engine to bring it all together.
2637 ### Goto table lookup
2639 The parser generator has nicely provided us with goto tables sorted by
2640 symbol number. We need a binary search function to find a symbol in the
2643 ###### parser functions
	/* Binary-search the goto table of state 'l' for 'sym'.
	 * The table is sorted by symbol number; on a hit the target
	 * state is returned. */
2645	static int search(const struct state *l, int sym)
2648	int hi = l->go_to_cnt;
2652	while (lo + 1 < hi) {
2653	int mid = (lo + hi) / 2;
2654	if (l->go_to[mid].sym <= sym)
	/* Loop ends with lo the only candidate; confirm the match. */
2659	if (l->go_to[lo].sym == sym)
2660	return l->go_to[lo].state;
2665 ### Memory allocation
2667 The `scanner` returns tokens in a local variable - we want them in allocated
2668 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2669 a reduce is in a large buffer. Both of these require some allocation and
2670 copying, hence `memdup` and `tok_copy`.
2672 ###### parser includes
2675 ###### parser functions
	/* Copy 'len' bytes of 'm' into freshly allocated memory, so the
	 * result can live on the asn_stack. Caller owns the result. */
2677	void *memdup(void *m, int len)
2683	memcpy(ret, m, len);
	/* Move a by-value token from the scanner into allocated memory.
	 * Caller owns the result. */
2687	static struct token *tok_copy(struct token tk)
2689	struct token *new = malloc(sizeof(*new));
2694 ### The state stack.
2696 The core data structure for the parser is the stack. This tracks all
2697 the symbols that have been recognised or partially recognised.
2699 The stack usually won't grow very large - maybe a few tens of entries.
2700 So we dynamically grow an array as required but never bother to
2701 shrink it down again.
2703 We keep the stack as two separate allocations. One, `asn_stack` stores
2704 the "abstract syntax nodes" which are created by each reduction. When
2705 we call `do_reduce` we need to pass an array of the `asn`s of the body
2706 of the production, and by keeping a separate `asn` stack, we can just
2707 pass a pointer into this stack.
2709 The other allocation stores all other stack fields of which there are
2710 several. The `state` is the most important one and guides the parsing
2711 process. The `sym` is nearly unnecessary as it is implicit in the
2712 state. However when we want to free entries from the `asn_stack`, it
2713 helps to know what type they are so we can call the right freeing
2714 function. The symbol leads us to the right free function through
2717	The `indents` count tracks the line indents within the symbol or
2718	immediately following it.  These are used to allow indent information to
2719 guide parsing and error recovery.
2721 `since_newline` tracks how many stack frames since the last
2722 start-of-line (whether indented or not). So if `since_newline` is
2723 zero, then this symbol is at the start of a line. Similarly
2724 `since_indent` counts the number of states since an indent, it is zero
2725 precisely when `indents` is not zero.
2727 `newline_permitted` keeps track of whether newlines should be ignored
2730 The stack is most properly seen as alternating states and symbols -
2731 states, like the 'DOT' in items, are between symbols. Each frame in
2732 our stack holds a state and the symbol that was before it. The
2733 bottom of stack holds the start state but no symbol, as nothing came
2734 before the beginning. As we need to store some value, `TK_eof` is used
2735 to mark the beginning of the file as well as the end.
2737 ###### parser functions
	/* non-zero when a NEWLINE token may be shifted in this frame */
2742	short newline_permitted;
	/* frames since the last start-of-line; 0 means this symbol
	 * starts a line */
2746	short since_newline;
2756 Two operations are needed on the stack - shift (which is like push) and pop.
2758 Shift applies not only to terminals but also to non-terminals. When we
2759 reduce a production we will pop off frames corresponding to the body
2760 symbols, then push on a frame for the head of the production. This last
2761 is exactly the same process as shifting in a terminal so we use the same
2762 function for both. In both cases we provide the symbol, the number of
2763 indents the symbol contains (which will be zero for a terminal symbol)
2764	and a flag indicating that the symbol was at (or was reduced from a
2765 symbol which was at) the start of a line. The state is deduced from the
2766 current top-of-stack state and the new symbol.
2768 To simplify other code we arrange for `shift` to fail if there is no `goto`
2769 state for the symbol. This is useful in basic parsing due to our design
2770 that we shift when we can, and reduce when we cannot. So the `shift`
2771 function reports if it could.
2773 `shift` is also used to push state zero onto the stack, so if the
2774 stack is empty, it always chooses zero as the next state.
2776 So `shift` finds the next state. If that succeeds it extends the
2777 allocations if needed and pushes all the information onto the stacks.
2779 Newlines are permitted after a `starts_line` state until an internal
2780 indent. If the new frame has neither a `starts_line` state nor an
2781 indent, newlines are permitted if the previous stack frame permitted
2784 ###### parser functions
	/* Push symbol 'sym' (terminal or nonterminal) onto the stack.
	 * Returns failure when the current state has no goto entry for
	 * 'sym'; an empty stack always shifts to state zero. */
2786	static int shift(struct parser *p,
2787	short sym, short indents, short start_of_line,
2789	const struct state states[])
2791	// Push an entry onto the stack
2792	struct frame next = {0};
2793	int newstate = p->tos
2794	? search(&states[p->stack[p->tos-1].state],
	/* Grow both stacks together; they are always the same size. */
2799	if (p->tos >= p->stack_size) {
2800	p->stack_size += 10;
2801	p->stack = realloc(p->stack, p->stack_size
2802	* sizeof(p->stack[0]));
2803	p->asn_stack = realloc(p->asn_stack, p->stack_size
2804	* sizeof(p->asn_stack[0]));
2807	next.indents = indents;
2808	next.state = newstate;
	/* newline_permitted: set after a starts_line state, cleared by
	 * an internal indent, otherwise inherited from the frame below. */
2809	if (states[newstate].starts_line)
2810	next.newline_permitted = 1;
2812	next.newline_permitted = 0;
2814	next.newline_permitted =
2815	p->stack[p->tos-1].newline_permitted;
2817	next.newline_permitted = 0;
2819	if (!start_of_line) {
2821	next.since_newline = p->stack[p->tos-1].since_newline + 1;
2823	next.since_newline = 1;
	/* since_indent is zero exactly when this frame carries indents. */
2826	next.since_indent = 0;
2828	next.since_indent = p->stack[p->tos-1].since_indent + 1;
2830	next.since_indent = 1;
2832	p->stack[p->tos] = next;
2833	p->asn_stack[p->tos] = asn;
2838 `pop` primarily moves the top of stack (`tos`) back down the required
2839 amount and frees any `asn` entries that need to be freed. It also
2840 collects a summary of the indents and line starts in the symbols that
2841 are being removed. It is called _after_ we reduce a production, just
2842 before we `shift` the nonterminal in.
2844 ###### parser functions
	/* Remove 'num' frames from the stack, freeing each asn via
	 * 'do_free'. Accumulates the indents of the removed frames and
	 * records (via *start_of_line) whether any removed symbol was at
	 * the start of a line. Called after a reduce, before shifting
	 * the head nonterminal. */
2846	static int pop(struct parser *p, int num,
2847	short *start_of_line,
2848	void(*do_free)(short sym, void *asn))
2855	for (i = 0; i < num; i++) {
	/* since_newline == 0 marks a frame at start of line */
2856	sol |= !p->stack[p->tos+i].since_newline;
2857	indents += p->stack[p->tos+i].indents;
2858	do_free(p->stack[p->tos+i].sym,
2859	p->asn_stack[p->tos+i]);
2862	*start_of_line = sol;
2866 ### The heart of the parser.
2868 Now we have the parser. For each token we might shift it, trigger a
2869 reduction, or start error handling. 2D tokens (IN, OUT, EOL) also need
2872 We return whatever `asn` was returned by reducing production zero.
2874 If we can neither shift nor reduce we have an error to handle. We pop
2875 single entries off the stack until we can shift the `TK_error` symbol,
2876 then drop input tokens until we find one we can shift into the new error
2877 state. We need to ensure that something is dropped or shifted after an
2878 error, or we could get into an infinite loop, only shifting in
2879 `TK_error`, then reducing. So we track if there has been a shift since
2880 the last error, and if not the next error always discards one token.
2882 When we find `TK_in` and `TK_out` tokens which report indents we need
2883 to handle them directly as the grammar cannot express what we want to
2886 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2887 record how many indents there are following the previous token.
2889 `TK_out` tokens must be canceled against an indent count
2890 within the stack. If we can reduce some symbols that are all since
2891 the most recent indent, then we do that first. If the minimum prefix
2892 of the current state then extends back before the most recent indent,
2893	that indent can be cancelled.  If the minimum prefix is shorter than
2894 the indent had ended prematurely and we must start error handling, which
2895 is still a work-in-progress.
2897 `TK_newline` tokens are ignored unless the top stack frame records
2898 that they are permitted. In that case they will not be considered for
2899 shifting if it is possible to reduce some symbols that are all since
2900 the most recent start of line. This is how a newline forcibly
2901 terminates any line-like structure - we try to reduce down to at most
2902 one symbol for each line where newlines are allowed.
2903 A consequence of this is that a rule like
2905 ###### Example: newlines - broken
2909 IfStatement -> Newlines if ....
2911 cannot work, as the NEWLINE will never be shifted as the empty string
2912	will be reduced first.  Optional sets of newlines need to be included
2913	in the thing that precedes:
2915 ###### Example: newlines - works
2919 IfStatement -> If ....
2921 Here the NEWLINE will be shifted because nothing can be reduced until
2924 When during error handling we discard tokens read in, we want to keep
2925 discarding until we see one that is recognised. If we had a full set
2926 of LR(1) grammar states, this would mean looking in the look-ahead set,
2927 but we don't keep a full look-ahead set. We only record the subset
2928 that leads to SHIFT. We can, however, deduce the look-ahead set by
2929 looking at the SHIFT subsets for all states that we can get to by
2930 reducing zero or more times. So we need a little function which
2931 checks if a given token is in any of these look-ahead sets.
2933 ###### parser includes
	/* Deduce whether token 'tk' is in the look-ahead set of 'state':
	 * check the SHIFT entries of every state reachable by zero or
	 * more reductions from 'state'. Used while discarding tokens in
	 * error recovery. */
2938	static int in_lookahead(struct token *tk, const struct state *states, int state)
2940	while (state >= 0) {
2941	if (search(&states[state], tk->num) >= 0)
	/* No forced reduction here means no further states to follow. */
2943	if (states[state].reduce_prod < 0)
2945	state = search(&states[state], states[state].reduce_sym);
	/* The shift/reduce engine. Reads tokens, shifting when possible
	 * and reducing when not; 2D tokens (IN, OUT, NEWLINE) are handled
	 * specially. Returns the asn produced by reducing production 0. */
2950	void *parser_run(struct token_state *tokens,
2951	const struct state states[],
2952	int (*do_reduce)(int, void**, struct token_config*, void*),
2953	void (*do_free)(short, void*),
2954	FILE *trace, const char *non_term[],
2955	struct token_config *config)
2957	struct parser p = { 0 };
2958	struct token *tk = NULL;
	/* tracks whether anything was shifted since the last error, so
	 * error handling always makes progress */
2960	int shift_since_err = 1;
	/* TK_eof marks the bottom of the stack as nothing precedes it */
2963	shift(&p, TK_eof, 0, 1, NULL, states);
2964	while (!accepted && p.tos > 0) {
2965	struct token *err_tk;
2966	struct frame *tos = &p.stack[p.tos-1];
2968	tk = tok_copy(token_next(tokens));
2969	parser_trace(trace, &p,
2970	tk, states, non_term, config->known_count);
	/* TK_in: record the indent in the top frame directly. */
2972	if (tk->num == TK_in) {
2974	tos->since_newline = 0;
2975	tos->since_indent = 0;
2976	if (!states[tos->state].starts_line)
2977	tos->newline_permitted = 0;
2980	parser_trace_action(trace, "Record");
	/* TK_out: cancel against an indent recorded in the stack,
	 * reducing first while everything is since that indent. */
2983	if (tk->num == TK_out) {
2984	if (states[tos->state].reduce_size >= 0 &&
2985	states[tos->state].reduce_size <= tos->since_indent)
2987	if (states[tos->state].min_prefix >= tos->since_indent) {
2989	struct frame *in = tos - tos->since_indent;
2991	if (in->indents == 0) {
2992	/* Reassess since_indent and newline_permitted */
2994	in->since_indent = in[-1].since_indent + 1;
2995	in->newline_permitted = in[-1].newline_permitted;
2997	in->since_indent = 0;
2998	in->newline_permitted = 0;
3000	if (states[in->state].starts_line)
3001	in->newline_permitted = 1;
3004	in->since_indent = in[-1].since_indent + 1;
3005	if (states[in->state].starts_line)
3006	in->newline_permitted = 1;
3008	in->newline_permitted = in[-1].newline_permitted;
3013	parser_trace_action(trace, "Cancel");
3016	// fall through to error handling as both SHIFT and REDUCE
	/* TK_newline: only meaningful where the top frame permits it;
	 * otherwise discarded. */
3019	if (tk->num == TK_newline) {
3020	if (!tos->newline_permitted) {
3023	parser_trace_action(trace, "Discard");
	/* Prefer reducing the line-so-far before shifting NEWLINE. */
3026	if (tos->since_newline > 1 &&
3027	states[tos->state].reduce_size >= 0 &&
3028	states[tos->state].reduce_size <= tos->since_newline)
3031	if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3032	shift_since_err = 1;
3034	parser_trace_action(trace, "Shift");
3038	if (states[tos->state].reduce_prod >= 0 &&
3039	states[tos->state].newline_only &&
3040	!(tk->num == TK_newline ||
3041	tk->num == TK_eof ||
3042	tk->num == TK_out ||
3043	(tos->indents == 0 && tos->since_newline == 0))) {
3044	/* Anything other than newline or out or eof
3045	* is an error unless we are already at start
3046	* of line, as this production must end at EOL.
3048	} else if (states[tos->state].reduce_prod >= 0) {
3051	const struct state *nextstate = &states[tos->state];
3052	int prod = nextstate->reduce_prod;
3053	int size = nextstate->reduce_size;
	/* The reduce result is built in this buffer, then copied to
	 * allocated memory for the asn_stack. */
3055	static char buf[16*1024];
3056	short indents, start_of_line;
3058	body = p.asn_stack + (p.tos - size);
3060	bufsize = do_reduce(prod, body, config, buf);
3062	indents = pop(&p, size, &start_of_line,
3064	res = memdup(buf, bufsize);
3065	memset(buf, 0, bufsize);
	/* Shifting the head nonterminal can only fail for the start
	 * production, which signals acceptance. */
3066	if (!shift(&p, nextstate->reduce_sym,
3067	indents, start_of_line,
3069	if (prod != 0) abort();
3073	parser_trace_action(trace, "Reduce");
3076	/* Error. We walk up the stack until we
3077	* find a state which will accept TK_error.
3078	* We then shift in TK_error and see what state
3079	* that takes us to.
3080	* Then we discard input tokens until
3081	* we find one that is acceptable.
3083	parser_trace_action(trace, "ERROR");
3084	short indents = 0, start_of_line;
3086	err_tk = tok_copy(*tk);
3088	shift(&p, TK_error, 0, 0,
3089	err_tk, states) == 0)
3090	// discard this state
3091	indents += pop(&p, 1, &start_of_line, do_free);
3094	// no state accepted TK_error
3097	if (!shift_since_err) {
3098	/* must discard at least one token to avoid
3101	if (tk->num == TK_eof)
3104	tk = tok_copy(token_next(tokens));
3106	shift_since_err = 0;
3107	tos = &p.stack[p.tos-1];
	/* Skip input until a token in some reachable look-ahead set. */
3108	while (!in_lookahead(tk, states, tos->state) &&
3109	tk->num != TK_eof) {
3111	tk = tok_copy(token_next(tokens));
3112	shift_since_err = 1;
3113	if (tk->num == TK_in)
3115	if (tk->num == TK_out) {
3119	// FIXME update since_indent here
3122	tos->indents += indents;
	/* Unwind whatever remains of the stack before returning. */
3125	pop(&p, p.tos, NULL, do_free);
3131 ###### exported functions
3132 void *parser_run(struct token_state *tokens,
3133 const struct state states[],
3134 int (*do_reduce)(int, void**, struct token_config*, void*),
3135 void (*do_free)(short, void*),
3136 FILE *trace, const char *non_term[],
3137 struct token_config *config);
3141 Being able to visualize the parser in action can be invaluable when
3142 debugging the parser code, or trying to understand how the parse of a
3143 particular grammar progresses. The stack contains all the important
3144 state, so just printing out the stack every time around the parse loop
3145 can make it possible to see exactly what is happening.
3147 This doesn't explicitly show each SHIFT and REDUCE action. However they
3148 are easily deduced from the change between consecutive lines, and the
3149 details of each state can be found by cross referencing the states list
3150 in the stack with the "report" that parsergen can generate.
3152 For terminal symbols, we just dump the token. For non-terminals we
3153 print the name of the symbol. The look ahead token is reported at the
3154 end inside square brackets.
3156 ###### parser functions
	/* Printable names for reserved token numbers, used by tracing. */
3158	static char *reserved_words[] = {
3159	[TK_error] = "ERROR",
3162	[TK_newline] = "NEWLINE",
	/* Print one frame's state: "(Ns nM)" - 's' if the state starts a
	 * line, "nM" with the since_newline count if newlines are
	 * currently permitted. */
3165	static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3167	fprintf(trace, "(%d", f->state);
3168	if (states[f->state].starts_line)
3169	fprintf(trace, "s");
3170	if (f->newline_permitted)
3171	fprintf(trace, "n%d", f->since_newline);
3172	fprintf(trace, ") ");
	/* Dump the whole stack, one symbol+state per frame, then the
	 * look-ahead token in square brackets. Terminals are shown as
	 * their token text, nonterminals by name. */
3175	void parser_trace(FILE *trace, struct parser *p,
3176	struct token *tk, const struct state states[],
3177	const char *non_term[], int knowns)
3182	for (i = 0; i < p->tos; i++) {
3183	struct frame *f = &p->stack[i];
	/* Symbol number ranges: reserved words, then known terminals,
	 * then nonterminals. */
3186	if (sym < TK_reserved &&
3187	reserved_words[sym] != NULL)
3188	fputs(reserved_words[sym], trace);
3189	else if (sym < TK_reserved + knowns) {
3190	struct token *t = p->asn_stack[i];
3191	text_dump(trace, t->txt, 20);
3193	fputs(non_term[sym - TK_reserved - knowns],
	/* ".N" shows the indents carried by this frame. */
3196	fprintf(trace, ".%d", f->indents);
3197	if (f->since_newline == 0)
3201	parser_trace_state(trace, f, states);
	/* Finally the look-ahead token with its line:col position. */
3203	fprintf(trace, "[");
3204	if (tk->num < TK_reserved &&
3205	reserved_words[tk->num] != NULL)
3206	fputs(reserved_words[tk->num], trace);
3208	text_dump(trace, tk->txt, 20);
3209	fprintf(trace, ":%d:%d]", tk->line, tk->col);
	/* Append the name of the action just taken to the trace line. */
3212	void parser_trace_action(FILE *trace, char *action)
3215	fprintf(trace, " - %s\n", action);
3220 The obvious example for a parser is a calculator.
3222 As `scanner` provides number parsing function using `libgmp` it is not much
3223 work to perform arbitrary rational number calculations.
3225 This calculator takes one expression, or an equality test, per line.
3226 The results are printed and if any equality test fails, the program
3227 exits with an error.
3229 ###### File: parsergen.mk
3230 calc.c calc.h : parsergen parsergen.mdc
3231 ./parsergen --tag calc -o calc parsergen.mdc
3232 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3233 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3235 ./calc parsergen.mdc
3240 #include "parse_number.h"
3241 // what do we use for a demo-grammar? A calculator of course.
3253 #include <sys/mman.h>
3259 #include "scanner.h"
	/* Free function for the calculator's number values. */
3264	static void free_number(struct number *n)
	/* True when struct text 't' exactly matches the NUL-terminated
	 * string 's' (same length and same bytes). */
3270	static int text_is(struct text t, char *s)
3272	return (strlen(s) == t.len &&
3273	strncmp(s, t.txt, t.len) == 0);
	/* Calculator demo: mmap the literate file named in argv[1],
	 * extract its sections, and parse any "example: input" section. */
3276	int main(int argc, char *argv[])
3278	int fd = open(argv[1], O_RDONLY);
	/* lseek(..., 2) is SEEK_END: the file length for mmap. */
3279	int len = lseek(fd, 0, 2);
3280	char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3281	struct section *table = code_extract(file, file+len, NULL);
3283	struct token_config config = {
3284	.ignored = (1 << TK_line_comment)
3287	.number_chars = ".,_+-",
	/* An extra argument enables parse tracing to stderr. */
3291	for (s = table; s; s = s->next)
3292	if (text_is(s->section, "example: input"))
3293	parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3295	struct section *t = table->next;
3296	code_free(table->code);
3308 Session -> Session Line
3311 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3312 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3313 gmp_printf(" or as a decimal: %Fg\n", fl);
3317 | Expression = Expression NEWLINE ${
3318 if (mpq_equal($1.val, $3.val))
3319 gmp_printf("Both equal %Qd\n", $1.val);
3321 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3326 | NEWLINE ${ printf("Blank line\n"); }$
3327 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3330 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3331 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3332 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3333 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3334 | Expression // Expression ${ {
3337 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3338 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3339 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3340 mpz_tdiv_q(z0, z1, z2);
3341 mpq_set_z($0.val, z0);
3342 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3344 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3345 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3350 3.1415926535 - 355/113
3352 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3354 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1