1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
9 There are several distinct sections.
11 1. `grammar_read` will read a grammar from a literate-code file and
12 store the productions, symbols, and code fragments.
13 2. `grammar_analyse` will take that grammar and build LR parsing
14 states and look-ahead tables.
15 3. `grammar_report` will present the details of the analysis
16 in a readable format and will report any conflicts.
17 4. `parser_generate` will write out C code files with various
18 tables and with the code fragments arranged in useful places.
19 5. `parser_run` is a library function intended to be linked together
20 with the generated parser tables to complete the implementation of
22 6. Finally `calc` is a test grammar for a simple calculator. The
23 `parsergen` program built from the C code in this file can extract
24 that grammar directly from this file and process it.
26 ###### File: parsergen.c
31 ## forward declarations
42 ###### File: libparser.c
49 ###### File: parsergen.mk
52 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
55 ## Reading the grammar
57 The grammar must be stored in a markdown literate code file as parsed by
58 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
59 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
60 present, will be literally copied into the generated `.c` and `.h`
61 files. The third is required and contains the grammar which is
62 tokenised with "[scanner][]". `reduce` can provide variable
63 declarations and common initialization code for all the reduction code
64 fragments in the grammar.
66 If the `--tag` option is given, then any top level header that doesn't
67 start with the tag is ignored, and the tag is stripped from the rest. So
68 `--tag Foo` means that the four sections expected are `Foo: header`,
69 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
70 to extract the sample calculator from this file.
73 [scanner]: scanner.html
79 ###### parser includes
83 The grammar contains production sets, precedence/associativity
84 declarations, general terminal declarations, and data type declarations.
85 These are all parsed with _ad hoc_ parsing as we don't have a parser
88 The precedence and associativity can be set for each production, and
89 can be inherited from symbols. The data type (either structure or a
90 reference to a structure) is potentially defined for each non-terminal
91 and describes what C structure is used to store information about each
95 enum assoc {Left, Right, Non};
96 char *assoc_names[] = {"Left","Right","Non"};
99 struct text struct_name;
102 unsigned short precedence;
106 unsigned short precedence;
115 The strings reported by `mdcode` and `scanner` are `struct text` which
116 have length rather than being nul terminated. To help with printing and
117 comparing we define `text_is` and `prtxt`, which should possibly go in
118 `mdcode`. `scanner` does provide `text_dump` which is useful for
119 strings which might contain control characters.
121 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
122 because that is what we need to detect tags.
// Test whether the length-counted text `t` equals the NUL-terminated
// string `s`: the lengths must match and all bytes must compare equal.
125 static int text_is(struct text t, char *s)
127 return (strlen(s) == t.len &&
128 strncmp(s, t.txt, t.len) == 0);
// Print the length-counted text `t` to stdout (no trailing newline).
130 static void prtxt(struct text t)
132 printf("%.*s", t.len, t.txt);
// Like strncmp(), plus a test for a ':' after the tag: when `t` starts
// with "tag:", advance past the tag, the colon, and any following
// spaces.  NOTE(review): the return statements are elided in this view,
// so the exact match/mismatch return values cannot be confirmed here.
135 static int strip_tag(struct text *t, char *tag)
137 int skip = strlen(tag) + 1;
138 if (skip >= t->len ||
139 strncmp(t->txt, tag, skip-1) != 0 ||
140 t->txt[skip-1] != ':')
142 while (skip < t->len && t->txt[skip] == ' ')
151 Productions are comprised primarily of symbols - terminal and
152 non-terminal. We do not make a syntactic distinction between the two,
153 though non-terminals must be identifiers. Non-terminal symbols are
154 those which appear in the head of a production, terminal symbols are
155 those which don't. There are also "virtual" symbols used for precedence
156 marking discussed later, and sometimes we won't know what type a symbol
159 To help with code safety it is possible to declare the terminal symbols.
160 If this is done, then any symbol used in a production that does not
161 appear in the head of any production and is not declared as a terminal
162 is treated as an error.
164 ###### forward declarations
165 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
166 char *symtypes = "UVTN";
169 ###### grammar fields
170 int terminals_declared;
172 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
173 table of known symbols and the resulting parser will report them as
174 `TK_reserved + N`. A small set of identifiers are reserved for the
175 different token types that `scanner` can report.
// Identifiers reserved for the token types that `scanner` can report,
// paired with the scanner's token numbers (some entries elided here).
179 static struct { int num; char *name; } reserved_words[] = {
180 { TK_error, "ERROR" },
181 { TK_number, "NUMBER" },
182 { TK_ident, "IDENTIFIER" },
184 { TK_string, "STRING" },
185 { TK_multi_string, "MULTI_STRING" },
188 { TK_newline, "NEWLINE" },
194 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
195 recognised. The former is automatically expected at the end of the text
196 being parsed. The latter are always ignored by the parser.
198 All of these symbols are stored in a simple symbol table. We use the
199 `struct text` from `mdcode` to store the name and link them together
200 into a sorted list using an insertion sort.
202 We don't have separate `find` and `insert` functions as any symbol we
203 find needs to be remembered. We simply expect `find` to always return a
204 symbol, but its type might be `Unknown`.
213 ###### grammar fields
// Find-or-insert: walk the sorted symbol list looking for `s`; when it
// is absent a fresh zeroed symbol is inserted, so a symbol is always
// returned (its type may still be Unknown).
218 static struct symbol *sym_find(struct grammar *g, struct text s)
220 struct symbol **l = &g->syms;
225 (cmp = text_cmp((*l)->name, s)) < 0)
229 n = calloc(1, sizeof(*n));
// Pre-load the symbol table with each reserved token name, recording
// the scanner's token number for it.
238 static void symbols_init(struct grammar *g)
240 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
242 for (i = 0; i < entries; i++) {
245 t.txt = reserved_words[i].name;
246 t.len = strlen(t.txt);
249 s->num = reserved_words[i].num;
253 ### Data types and precedence.
255 Data type specification, precedence specification, and declaration of
256 terminals are all introduced by a dollar sign at the start of the line.
257 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
258 precedence, if it is `TERM` then the line declares terminals without
259 precedence, otherwise it specifies a data type.
261 The data type name is simply stored and applied to the head of all
262 subsequent productions. It must be the name of a structure optionally
263 preceded by an asterisk which means a reference or "pointer". So
264 `$expression` maps to `struct expression` and `$*statement` maps to
265 `struct statement *`.
267 Any productions given before the first data type declaration will have
268 no data type associated with them and can carry no information. In
269 order to allow other non-terminals to have no type, the data type
270 `$void` can be given. This does *not* mean that `struct void` will be
271 used, but rather that no type will be associated with subsequent
274 The precedence line must contain a list of symbols, either terminal
275 symbols or virtual symbols. It can only contain symbols that have not
276 been seen yet, so precedence declaration must precede the productions
279 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
280 carry precedence information but are not included in the grammar. A
281 production can declare that it inherits the precedence of a given
284 This use for `$$` precludes it from being used as a symbol in the
285 described language. Two other symbols: `${` and `}$` are also
288 Each new precedence line introduces a new precedence level and
289 declares how it associates. This level is stored in each symbol
290 listed and may be inherited by any production which uses the symbol. A
291 production inherits from the last symbol which has a precedence.
293 The symbols on the first precedence line have the lowest precedence.
294 Subsequent lines introduce symbols with higher precedence and so bind
297 Note that a declaration line (or "dollar line") can start with either of
298 two different marks: "$" or "$*". The `dollar_line()` function defined
299 here is told which was found in the `isref` argument.
301 ###### grammar fields
302 struct text current_type;
307 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
308 static const char *known[] = { "$$", "${", "}$" };
// Parse one "dollar line": an assoc/precedence declaration (LEFT,
// RIGHT, NON), a $TERM terminal declaration, or a data-type
// declaration applied to the heads of subsequent productions.
// `isref` records whether the line began with "$*" (pointer type).
// Returns an error-message string on failure; NOTE(review): the
// success return and several branches are elided in this view.
311 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
313 struct token t = token_next(ts);
319 if (t.num != TK_ident) {
320 err = "type or assoc expected after '$'";
323 if (text_is(t.txt, "LEFT"))
325 else if (text_is(t.txt, "RIGHT"))
327 else if (text_is(t.txt, "NON"))
329 else if (text_is(t.txt, "TERM")) {
331 g->terminals_declared = 1;
333 g->current_type = t.txt;
334 g->type_isref = isref;
335 if (text_is(t.txt, "void"))
336 g->current_type.txt = NULL;
338 if (t.num != TK_newline) {
339 err = "Extra tokens after type name";
346 err = "$* cannot be followed by a precedence";
350 // This is a precedence or TERM line, need some symbols.
354 while (t.num != TK_newline) {
355 enum symtype type = Terminal;
357 if (t.num == TK_virtual) {
360 if (t.num != TK_ident) {
361 err = "$$ must be followed by a word";
365 err = "Virtual symbols not permitted on $TERM line";
368 } else if (t.num != TK_ident &&
370 err = "Illegal token in precedence line";
373 s = sym_find(g, t.txt);
374 if (s->type != Unknown) {
375 err = "Symbols in precedence/TERM line must not already be known.";
380 s->precedence = g->prec_levels;
387 err = "No symbols given on precedence/TERM line";
391 while (t.num != TK_newline && t.num != TK_eof)
398 A production either starts with an identifier which is the head
399 non-terminal, or a vertical bar (`|`) in which case this production
400 uses the same head as the previous one. The identifier must be
401 followed by a `->` mark. All productions for a given non-terminal must
402 be grouped together with the `nonterminal ->` given only once.
404 After this a (possibly empty) sequence of identifiers and marks form
405 the body of the production. A virtual symbol may be given after the
406 body by preceding it with `$$`. If a virtual symbol is given, the
407 precedence of the production is that for the virtual symbol. If none
408 is given, the precedence is inherited from the last symbol in the
409 production which has a precedence specified.
411 After the optional precedence may come the `${` mark. This indicates
412 the start of a code fragment. If present, this must be on the same
413 line as the start of the production.
415 All of the text from the `${` through to the matching `}$` is
416 collected and forms the code-fragment for the production. It must all
417 be in one `code_node` of the literate code. The `}$` must be
418 at the end of a line.
420 Text in the code fragment will undergo substitutions where `$N` or
421 `$<N`, for some numeric `N` (or non-numeric indicator as described
422 later), will be replaced with a variable holding the parse information
423 for the particular symbol in the production. `$0` is the head of the
424 production, `$1` is the first symbol of the body, etc. The type of `$N`
425 for a terminal symbol is `struct token`. For a non-terminal, it is
426 whatever has been declared for that symbol. The `<` may be included and
427 means that the value (usually a reference) is being moved out, so it
428 will not automatically be freed. The effect of using '<' is that the
429 variable is cleared to all-zeros.
431 While building productions we will need to add to an array which needs to
// Append `new` to a growable pointer array: capacity is derived from
// *cnt rounded up to the step size, so a realloc is needed exactly when
// *cnt reaches that boundary.
435 static void array_add(void *varray, int *cnt, void *new)
437 void ***array = varray;
440 current = ((*cnt-1) | (step-1))+1;
441 if (*cnt == current) {
444 *array = realloc(*array, current * sizeof(void*));
446 (*array)[*cnt] = new;
450 Collecting the code fragment simply involves reading tokens until we
451 hit the end token `}$`, and noting the character position of the start and
// Collect the text between the `${` token in `start` and the matching
// `}$`, which must lie in the same code_node; only the start pointer
// and the length are recorded.  Per the surrounding text, an unclosed
// fragment is reported by a NULL `.txt` in the returned struct.
455 static struct text collect_code(struct token_state *state,
460 code.txt = start.txt.txt + start.txt.len;
462 t = token_next(state);
463 while (t.node == start.node &&
464 t.num != TK_close && t.num != TK_error &&
466 if (t.num == TK_close && t.node == start.node)
467 code.len = t.txt.txt - code.txt;
473 Now we have all the bits we need to parse a full production.
478 ###### grammar fields
479 struct production **productions;
480 int production_count;
482 ###### production fields
484 struct symbol **body;
490 int first_production;
// Parse the remainder of one production (the head and "->" have
// already been consumed): the body symbols, an optional `$$ virtual`
// precedence marker, and an optional `${ ... }$` code fragment.
// Returns an error string on failure, after consuming the rest of the
// line; NOTE(review): the success return is elided in this view.
493 static char *parse_production(struct grammar *g,
495 struct token_state *state)
497 /* Head has already been parsed. */
500 struct production p, *pp;
502 memset(&p, 0, sizeof(p));
504 tk = token_next(state);
505 while (tk.num == TK_ident || tk.num == TK_mark) {
506 struct symbol *bs = sym_find(g, tk.txt);
507 if (bs->type == Unknown) {
508 if (!g->terminals_declared)
511 if (bs->type == Virtual) {
512 err = "Virtual symbol not permitted in production";
515 if (bs->precedence) {
516 p.precedence = bs->precedence;
519 array_add(&p.body, &p.body_size, bs);
520 tk = token_next(state);
522 if (tk.num == TK_virtual) {
524 tk = token_next(state);
525 if (tk.num != TK_ident) {
526 err = "word required after $$";
529 vs = sym_find(g, tk.txt);
530 if (vs->num == TK_newline)
532 else if (vs->num == TK_out)
534 else if (vs->precedence == 0) {
535 err = "symbol after $$ must have precedence";
538 p.precedence = vs->precedence;
541 tk = token_next(state);
543 if (tk.num == TK_open) {
544 p.code_line = tk.line;
545 p.code = collect_code(state, tk);
546 if (p.code.txt == NULL) {
547 err = "code fragment not closed properly";
550 tk = token_next(state);
553 if (tk.num != TK_newline && tk.num != TK_eof) {
554 err = "stray tokens at end of line";
557 pp = malloc(sizeof(*pp));
559 array_add(&g->productions, &g->production_count, pp);
562 while (tk.num != TK_newline && tk.num != TK_eof)
563 tk = token_next(state);
568 With the ability to parse productions and dollar-lines, we have nearly all
569 that we need to parse a grammar from a `code_node`.
571 The head of the first production will effectively be the `start` symbol
572 of the grammar. However it won't _actually_ be so. Processing the
573 grammar is greatly simplified if the real start symbol only has a single
574 production, and expects `$eof` as the final terminal. So when we find
575 the first explicit production we insert an extra implicit production as
576 production zero which looks like
578 ###### Example: production 0
581 where `START` is the first non-terminal given.
583 ###### create production zero
// Implicit production zero:  $start -> START $eof  with the code
// fragment "$0 = $<1;", so the real start symbol always has exactly
// one production ending in $eof (see the surrounding explanation).
// `head` here is the first explicit non-terminal, supplied by the
// fragment's splice point in grammar_read().
584 struct production *p = calloc(1,sizeof(*p));
585 struct text start = {"$start",6};
586 struct text eof = {"$eof",4};
587 struct text code = {"$0 = $<1;", 9};
588 p->head = sym_find(g, start);
589 p->head->type = Nonterminal;
590 p->head->struct_name = g->current_type;
591 p->head->isref = g->type_isref;
592 if (g->current_type.txt)
594 array_add(&p->body, &p->body_size, head);
595 array_add(&p->body, &p->body_size, sym_find(g, eof));
596 p->head->first_production = g->production_count;
597 array_add(&g->productions, &g->production_count, p);
599 Now we are ready to read in the grammar. We ignore comments
600 and strings so that the marks which introduce them can be
601 used as terminals in the grammar. We don't ignore numbers
602 even though we don't allow them as that causes the scanner
603 to produce errors that the parser is better positioned to handle.
604 We also ignore indentation, but we expect and use newlines.
606 To allow comments in the grammar, we explicitly check for "//" as the
607 first token on the line and those lines are skipped. "//" can still be
608 used as a terminal anywhere that a terminal is expected.
// Top-level grammar reader: tokenise the code node, dispatching each
// line to parse_production() (head or '|'), dollar_line() ('$'/'$*'),
// or skipping "//" comment lines; production zero is inserted when the
// first head is seen, and when $TERM was used every terminal must have
// been declared.  On error the grammar is freed (content leak noted by
// the FIXME); NOTE(review): the success and failure returns themselves
// are elided in this view — presumably the grammar or NULL.
611 static struct grammar *grammar_read(struct code_node *code)
613 struct token_config conf = {
616 .words_marks = known,
617 .known_count = sizeof(known)/sizeof(known[0]),
619 .ignored = (1 << TK_line_comment)
620 | (1 << TK_block_comment)
623 | (1 << TK_multi_string)
628 struct token_state *state = token_open(code, &conf);
630 struct symbol *head = NULL;
634 g = calloc(1, sizeof(*g));
637 for (tk = token_next(state); tk.num != TK_eof;
638 tk = token_next(state)) {
639 if (tk.num == TK_newline)
641 if (tk.num == TK_ident) {
643 head = sym_find(g, tk.txt);
644 if (head->type == Nonterminal)
645 err = "This non-terminal has already be used.";
646 else if (head->type == Virtual)
647 err = "Virtual symbol not permitted in head of production";
649 head->type = Nonterminal;
650 head->struct_name = g->current_type;
651 head->isref = g->type_isref;
652 if (g->production_count == 0) {
653 ## create production zero
655 head->first_production = g->production_count;
656 tk = token_next(state);
657 if (tk.num == TK_mark &&
658 text_is(tk.txt, "->"))
659 err = parse_production(g, head, state);
661 err = "'->' missing in production";
663 } else if (tk.num == TK_mark
664 && text_is(tk.txt, "|")) {
665 // another production for same non-term
667 err = parse_production(g, head, state);
669 err = "First production must have a head";
670 } else if (tk.num == TK_mark
671 && text_is(tk.txt, "$")) {
672 err = dollar_line(state, g, 0);
673 } else if (tk.num == TK_mark
674 && text_is(tk.txt, "$*")) {
675 err = dollar_line(state, g, 1);
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "//")) {
678 while (tk.num != TK_newline &&
680 tk = token_next(state);
682 err = "Unrecognised token at start of line.";
688 if (g->terminals_declared) {
691 for (s = g->syms; s; s = s->next) {
692 if (s->type != Unknown)
695 fprintf(stderr, "Token %.*s not declared\n",
696 s->name.len, s->name.txt);
699 free(g); // FIXME free content
705 fprintf(stderr, "Error at line %d: %s\n",
708 free(g); // FIXME free content
712 ## Analysing the grammar
714 The central task in analysing the grammar is to determine a set of
715 states to drive the parsing process. This will require finding various
716 sets of symbols and of "items". Some of these sets will need to attach
717 information to each element in the set, so they are more like maps.
719 Each "item" may have a 'look-ahead' or `LA` set associated with it.
720 Many of these will be the same as each other. To avoid memory wastage,
721 and to simplify some comparisons of sets, these sets will be stored in a
722 list of unique sets, each assigned a number.
724 Once we have the data structures in hand to manage these sets and lists,
725 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
726 then create the item sets which define the various states.
730 Though we don't only store symbols in these sets, they are the main
731 things we store, so they are called symbol sets or "symsets".
733 A symset has a size, an array of shorts, and an optional array of data
734 values, which are also shorts. If the array of data is not present, we
735 store an impossible pointer, as `NULL` is used to indicate that no
736 memory has been allocated yet.
741 unsigned short *syms, *data;
743 #define NO_DATA ((unsigned short *)1)
744 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
745 const struct symset INIT_DATASET = { 0, NULL, NULL };
747 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
748 using an insertion sort. We don't explicitly record the amount of
749 allocated space as it can be derived directly from the current `cnt`
750 using `((cnt - 1) | 7) + 1`.
// Insert `key` (and `val`, when this set carries data) keeping `syms`
// sorted via insertion shift.  Capacity is always ((cnt-1)|7)+1, so a
// realloc is needed exactly when cnt sits on that boundary and no
// explicit capacity field is stored.
753 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
756 int current = ((s->cnt-1) | 7) + 1;
757 if (current == s->cnt) {
759 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
760 if (s->data != NO_DATA)
761 s->data = realloc(s->data, sizeof(*s->data) * current);
764 while (i > 0 && s->syms[i-1] > key) {
765 s->syms[i] = s->syms[i-1];
766 if (s->data != NO_DATA)
767 s->data[i] = s->data[i-1];
771 if (s->data != NO_DATA)
776 Finding a symbol (or item) in a `symset` uses a simple binary search.
777 We return the index where the value was found (so data can be accessed),
778 or `-1` to indicate failure.
// Binary search for `key`; returns its index (so data[] can be read at
// the same position) or -1 when the key is absent.
780 static int symset_find(struct symset *ss, unsigned short key)
787 while (lo + 1 < hi) {
788 int mid = (lo + hi) / 2;
789 if (ss->syms[mid] <= key)
794 if (ss->syms[lo] == key)
799 We will often want to form the union of two symsets. When we do, we
800 will often want to know if anything changed (as that might mean there is
801 more work to do). So `symset_union` reports whether anything was added
802 to the first set. We use a slow quadratic approach as these sets are
803 rarely large. If profiling shows this to be a problem it can be
// Merge `b` into `a` (deliberately quadratic — these sets are small);
// returns whether anything new was added so fixed-point loops know to
// keep iterating.
806 static int symset_union(struct symset *a, struct symset *b)
810 for (i = 0; i < b->cnt; i++)
811 if (symset_find(a, b->syms[i]) < 0) {
812 unsigned short data = 0;
813 if (b->data != NO_DATA)
815 symset_add(a, b->syms[i], data);
821 And of course we must be able to free a symset.
// Release a symset's arrays; data is freed only when the set really
// carries one (NO_DATA is a sentinel, not an allocation).
823 static void symset_free(struct symset ss)
826 if (ss.data != NO_DATA)
832 Some symsets are simply stored somewhere appropriate in the `grammar`
833 data structure, others need a bit of indirection. This applies
834 particularly to "LA" sets. These will be paired with "items" in an
835 "itemset". As itemsets will be stored in a symset, the "LA" set needs
836 to be stored in the `data` field, so we need a mapping from a small
837 (short) number to an LA `symset`.
839 As mentioned earlier this involves creating a list of unique symsets.
841 For now, we just use a linear list sorted by insertion. A skiplist
842 would be more efficient and may be added later.
849 struct setlist *next;
852 ###### grammar fields
853 struct setlist *sets;
// Order two symsets: first by element count, then element-wise by
// symbol value; used to keep the unique-set list sorted.
858 static int ss_cmp(struct symset a, struct symset b)
861 int diff = a.cnt - b.cnt;
864 for (i = 0; i < a.cnt; i++) {
865 diff = (int)a.syms[i] - (int)b.syms[i];
// Intern `ss` in the sorted list of unique sets (linear insertion
// scan); per the surrounding text each unique set is assigned a number,
// which is what callers store in a symset's data[].
872 static int save_set(struct grammar *g, struct symset ss)
874 struct setlist **sl = &g->sets;
878 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
885 s = malloc(sizeof(*s));
894 Finding a set by number is currently performed by a simple linear
895 search. If this turns out to hurt performance, we can store the sets in
896 a dynamic array like the productions.
// Map a saved-set number back to its symset by a simple linear search
// (noted above as a possible performance target).
898 static struct symset set_find(struct grammar *g, int num)
900 struct setlist *sl = g->sets;
901 while (sl && sl->num != num)
906 ### Setting `nullable`
908 We set `nullable` on the head symbol for any production for which all
909 the body symbols (if any) are nullable. As this is a recursive
910 definition, any change in the `nullable` setting means that we need to
911 re-evaluate where it needs to be set.
913 We simply loop around performing the same calculations until no more
// Mark a head `nullable` when every symbol in some production body is
// nullable; since the definition is recursive, repeat the sweep until
// a pass makes no change.
920 static void set_nullable(struct grammar *g)
923 while (check_again) {
926 for (p = 0; p < g->production_count; p++) {
927 struct production *pr = g->productions[p];
930 if (pr->head->nullable)
932 for (s = 0; s < pr->body_size; s++)
933 if (! pr->body[s]->nullable)
935 if (s == pr->body_size) {
936 pr->head->nullable = 1;
943 ### Setting `line_like`
945 In order to be able to ignore newline tokens when not relevant, but
946 still include them in the parse when needed, we will need to know
947 which states can start a "line-like" section of code. We ignore
948 newlines when there is an indent since the most recent start of a
951 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
952 If a symbol cannot derive a NEWLINE, then it is only part of a line -
953 so is word-like. If it can derive a NEWLINE, then we consider it to
956 Clearly the `TK_newline` token can derive a NEWLINE. Any symbol which
957 is the head of a production that contains a line_like symbol is also a
958 line-like symbol. We use a new field `line_like` to record this
959 attribute of symbols, and compute it in a repetitive manner similar to
// Propagate `line_like`: TK_newline can derive a NEWLINE, and so can
// the head of any production whose body contains a line-like symbol;
// iterate to a fixed point as for set_nullable().
966 static void set_line_like(struct grammar *g)
969 g->symtab[TK_newline]->line_like = 1;
970 while (check_again) {
973 for (p = 0; p < g->production_count; p++) {
974 struct production *pr = g->productions[p];
977 if (pr->head->line_like)
980 for (s = 0 ; s < pr->body_size; s++) {
981 if (pr->body[s]->line_like) {
982 pr->head->line_like = 1;
991 ### Building the `first` sets
993 When calculating what can follow a particular non-terminal, we will need
994 to know what the "first" terminal in any subsequent non-terminal might
995 be. So we calculate the `first` set for every non-terminal and store
996 them in an array. We don't bother recording the "first" set for
997 terminals as they are trivial.
999 As the "first" for one symbol might depend on the "first" of another, we
1000 repeat the calculations until no changes happen, just like with
1001 `set_nullable`. This makes use of the fact that `symset_union` reports
1002 if any change happens.
1004 The core of this, which finds the "first" of part of a production body,
1005 will be reused for computing the "follow" sets, so we split that out
1006 into a separate function, `add_first`, which also reports if it got all
1007 the way to the end of the production without finding a non-nullable
1010 ###### grammar fields
1011 struct symset *first;
// Accumulate into `target` the possible "first" terminals of the body
// of `pr` from position `start` onward.  Per the surrounding text the
// scan stops at the first non-nullable symbol, and `*to_end` (when
// requested) reports whether everything from `start` on was nullable.
// NOTE(review): the return value appears to report whether `target`
// changed (build_first keys off it) — break/return lines are elided.
1015 static int add_first(struct production *pr, int start,
1016 struct symset *target, struct grammar *g,
1021 for (s = start; s < pr->body_size; s++) {
1022 struct symbol *bs = pr->body[s];
1023 if (bs->type == Terminal) {
1024 if (symset_find(target, bs->num) < 0) {
1025 symset_add(target, bs->num, 0);
1029 } else if (symset_union(target, &g->first[bs->num]))
1035 *to_end = (s == pr->body_size);
// Compute the FIRST set of every symbol: initialise all sets empty,
// then repeatedly fold each production's body into its head's set
// until a full pass reports no change.
1039 static void build_first(struct grammar *g)
1041 int check_again = 1;
1043 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1044 for (s = 0; s < g->num_syms; s++)
1045 g->first[s] = INIT_SYMSET;
1047 while (check_again) {
1050 for (p = 0; p < g->production_count; p++) {
1051 struct production *pr = g->productions[p];
1052 struct symset *head = &g->first[pr->head->num];
1054 if (add_first(pr, 0, head, g, NULL))
1060 ### Building the `follow` sets.
1062 There are two different situations when we will want to generate
1063 "follow" sets. If we are doing an SLR analysis, we want to generate a
1064 single "follow" set for each non-terminal in the grammar. That is what
1065 is happening here. If we are doing an LALR or LR analysis we will want
1066 to generate a separate "LA" set for each item. We do that later in
1069 There are two parts to generating a "follow" set. Firstly we look at
1070 every place that any non-terminal appears in the body of any production,
1071 and we find the set of possible "first" symbols after there. This is
1072 done using `add_first` above and only needs to be done once as the
1073 "first" sets are now stable and will not change.
1075 ###### grammar fields
1076 struct symset *follow;
// Follow phase one (one-shot, since FIRST sets are now stable): for
// every non-terminal occurrence in any body, add the FIRST of what can
// come after it; terminal positions contribute nothing here.
1080 for (p = 0; p < g->production_count; p++) {
1081 struct production *pr = g->productions[p];
1084 for (b = 0; b < pr->body_size - 1; b++) {
1085 struct symbol *bs = pr->body[b];
1086 if (bs->type == Terminal)
1088 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1092 The second part is to add the "follow" set of the head of a production
1093 to the "follow" sets of the final symbol in the production, and any
1094 other symbol which is followed only by `nullable` symbols. As this
1095 depends on "follow" itself we need to repeatedly perform the process
1096 until no further changes happen.
1098 Rather than 2 nested loops, one that repeats the whole process until
1099 there is no change, and one that iterates over the set of productions, we
1100 combine these two functions into a single loop.
// Follow phase two: add the head's follow set to the last body symbol
// and to any symbol followed only by nullables.  The unusual update
// expression folds the "repeat until stable" loop into this one: while
// before the last production, advance p; on reaching the last one,
// restart from 0 (clearing check_again) if anything changed this
// sweep, so the loop ends only after a change-free pass.
1104 for (check_again = 0, p = 0;
1105 p < g->production_count;
1106 p < g->production_count-1
1107 ? p++ : check_again ? (check_again = 0, p = 0)
1109 struct production *pr = g->productions[p];
1112 for (b = pr->body_size - 1; b >= 0; b--) {
1113 struct symbol *bs = pr->body[b];
1114 if (bs->type == Terminal)
1116 if (symset_union(&g->follow[bs->num],
1117 &g->follow[pr->head->num]))
1124 We now just need to create and initialise the `follow` list to get a
// Allocate and zero-initialise the per-symbol `follow` sets; the two
// computation phases are spliced in from the literate fragments that
// this function's chapter defines.
1128 static void build_follow(struct grammar *g)
1130 int s, check_again, p;
1131 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1132 for (s = 0; s < g->num_syms; s++)
1133 g->follow[s] = INIT_SYMSET;
1137 ### Building itemsets and states
1139 There are three different levels of detail that can be chosen for
1140 building the itemsets and states for the LR grammar. They are:
1142 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1143 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1145 2. LALR(1) where we build look-ahead sets with each item and merge
1146 the LA sets when we find two paths to the same "kernel" set of items.
1147 3. LR(1) where different look-ahead for any item in the set means
1148 a different item set must be created.
1150 ###### forward declarations
1151 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1153 We need to be able to look through existing states to see if a newly
1154 generated state already exists. For now we use a simple sorted linked
1157 An item is a pair of numbers: the production number and the position of
1158 "DOT", which is an index into the body of the production. As the
1159 numbers are not enormous we can combine them into a single "short" and
1160 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1161 production number (so 2000 productions with maximum of 31 symbols in the
1164 Comparing the itemsets will be a little different to comparing symsets
1165 as we want to do the lookup after generating the "kernel" of an
1166 itemset, so we need to ignore the offset=zero items which are added during
1169 To facilitate this, we modify the "DOT" number so that "0" sorts to
1170 the end of the list in the symset, and then only compare items before
// Pack/unpack an item into one short: the low 11 bits hold the
// production number (max 2047) and the top 5 bits hold 31-dot, so
// that DOT==0 items sort to the end of a symset (kernel items first).
1174 static inline unsigned short item_num(int production, int dot)
1176 return production | ((31-dot) << 11);
1178 static inline int item_prod(unsigned short item)
1180 return item & 0x7ff;
1182 static inline int item_dot(unsigned short item)
1184 return (31-(item >> 11)) & 0x1f;
1187 For LR(1) analysis we need to compare not just the itemset in a state
1188 but also the LA sets. As we assign each unique LA set a number, we
1189 can just compare the symset and the data values together.
// Compare two itemsets considering only "kernel" items (DOT > 0, which
// sort first); for LR(1) analysis the LA-set numbers held in data[]
// also take part in the ordering, otherwise they are ignored.
1192 static int itemset_cmp(struct symset a, struct symset b,
1193 enum grammar_type type)
1199 i < a.cnt && i < b.cnt &&
1200 item_dot(a.syms[i]) > 0 &&
1201 item_dot(b.syms[i]) > 0;
1203 int diff = a.syms[i] - b.syms[i];
1207 diff = a.data[i] - b.data[i];
1212 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1216 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1222 if (type < LR1 || av == -1)
1225 a.data[i] - b.data[i];
1228 It will be helpful to know if an itemset has been "completed" or not,
1229 particularly for LALR where itemsets get merged, at which point they
1230 need to be considered for completion again. So a `completed` flag is
1233 For correct handling of `TK_newline` when parsing, we will need to
1234 know which states (itemsets) can occur at the start of a line, so we
1235 will record a `starts_line` flag too whenever DOT is at the start of a
1238 Finally, for handling `TK_out` we need to know whether productions in the
1239 current state started *before* the most recent indent. A state
1240 doesn't usually keep details of individual productions, so we need to
1241 add one extra detail. `min_prefix` is the smallest non-zero number of
1242 symbols *before* DOT in any production in an itemset. This will allow
1243 us to determine if the most recent indent is sufficiently recent
1244 to cancel it against a `TK_out`. If it was seen longer ago than the
1245 `min_prefix`, and if the current state cannot be reduced, then the
1246 indented section must have ended in the middle of a syntactic unit, so
1247 an error must be signaled.
1249 And now we can build the list of itemsets. The lookup routine returns
1250 both a success flag and a pointer to where in the list an insert should
1251 happen, so we don't need to search a second time.
1255 struct itemset *next;
1257 struct symset items;
1258 struct symset go_to;
1260 unsigned short precedence;
1266 ###### grammar fields
1267 struct itemset *items;
/* Search the ordered itemset list for one whose items match `kernel`.
 * On return *where points at the link where a matching itemset was
 * found, or where a new one should be inserted, so the caller need
 * not search a second time. */
1271 static int itemset_find(struct grammar *g, struct itemset ***where,
1272 struct symset kernel, enum grammar_type type)
1274 struct itemset **ip;
1276 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1277 struct itemset *i = *ip;
1279 diff = itemset_cmp(i->items, kernel, type);
1292 Adding an itemset may require merging the LA sets if LALR analysis is
1293 happening. If any new LA set adds any symbols that weren't in the old
1294 LA set, we clear the `completed` flag so that the dependants of this
1295 itemset will be recalculated and their LA sets updated.
1297 `add_itemset` must consume the symsets it is passed, either by adding
1298 them to a data structure, or freeing them.
/* Add an itemset (or merge with an existing one) and return its state
 * number.  Consumes the symset `ss` it is passed. */
1300 static int add_itemset(struct grammar *g, struct symset ss,
1301 enum assoc assoc, unsigned short precedence,
1302 enum grammar_type type)
1304 struct itemset **where, *is;
1306 int found = itemset_find(g, &where, ss, type);
/* Not found: allocate a new itemset with the next state number. */
1308 is = calloc(1, sizeof(*is));
1309 is->state = g->states;
1313 is->precedence = precedence;
1315 is->go_to = INIT_DATASET;
/* Found (LALR): merge each incoming LA set into the stored one;
 * growing any LA set means dependants must be recalculated. */
1324 for (i = 0; i < ss.cnt; i++) {
1325 struct symset temp = INIT_SYMSET, s;
1326 if (ss.data[i] == is->items.data[i])
1328 s = set_find(g, is->items.data[i]);
1329 symset_union(&temp, &s);
1330 s = set_find(g, ss.data[i]);
1331 if (symset_union(&temp, &s)) {
1332 is->items.data[i] = save_set(g, temp);
1343 To build all the itemsets, we first insert the initial itemset made from
1344 production zero, complete each itemset, and then generate new itemsets
1345 from old until no new ones can be made.
1347 Completing an itemset means finding all the items where "DOT" is
1348 followed by a nonterminal and adding "DOT=0" items for every production
1349 from that non-terminal - providing each item hasn't already been added.
1350 When we add such an item it might get inserted before the current
1351 one, so to make sure we process it we reset the loop counter to the
1354 If LA sets are needed, the LA set for each new item is found using
1355 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1356 may be supplemented by the LA set for the item which produced the new
1359 We also collect a set of all symbols which follow "DOT" (in `done`) as
1360 this is used in the next stage. If any of these symbols are flagged as
1361 `line_like`, then this state must be a `starts_line` state so now is a
1362 good time to record that.
1364 When itemsets are created we assign a precedence to the itemset from the
1365 complete item, if there is one. We ignore the possibility of there
1366 being two and don't (currently) handle precedence in such grammars.
1367 When completing a grammar we ignore any item where DOT is followed by a
1368 terminal with a precedence lower than that for the itemset. Unless the
1369 terminal has right associativity, we also ignore items where the
1370 terminal has the same precedence. The result is that unwanted items are
1371 still in the itemset, but the terminal doesn't get into the go to set,
1372 so the item is ineffective.
1374 ###### complete itemset
/* For every item in this set, add DOT=0 items for each production of
 * the nonterminal after DOT, collect post-DOT symbols in `done`, and
 * track min_prefix / starts_line as described above. */
1375 for (i = 0; i < is->items.cnt; i++) {
1376 int p = item_prod(is->items.syms[i]);
1377 int bs = item_dot(is->items.syms[i]);
1378 struct production *pr = g->productions[p];
1381 struct symset LA = INIT_SYMSET;
1382 unsigned short sn = 0;
1383 struct symset LAnl = INIT_SYMSET;
1384 unsigned short snnl = 0;
/* min_prefix: smallest non-zero number of symbols before DOT */
1386 if (is->min_prefix == 0 ||
1387 (bs > 0 && bs < is->min_prefix))
1388 is->min_prefix = bs;
1389 if (bs == pr->body_size)
1392 if (s->precedence && is->precedence &&
1393 is->precedence > s->precedence)
1394 /* This terminal has a low precedence and
1395 * shouldn't be shifted
1398 if (s->precedence && is->precedence &&
1399 is->precedence == s->precedence && s->assoc != Right)
1400 /* This terminal has a matching precedence and is
1401 * not Right-associative, so we mustn't shift it.
1404 if (symset_find(&done, s->num) < 0) {
1405 symset_add(&done, s->num, 0);
1407 if (s->type != Nonterminal)
1410 is->starts_line = 1;
/* LA for derived items: FIRST of what follows DOT, supplemented by
 * this item's own LA (merge condition elided in this extract). */
1415 add_first(pr, bs+1, &LA, g, &to_end);
1417 struct symset ss = set_find(g, is->items.data[i]);
1418 symset_union(&LA, &ss);
1420 sn = save_set(g, LA);
1421 LA = set_find(g, sn);
/* LAnl keeps only TK_newline, used for line_like productions */
1422 if (symset_find(&LA, TK_newline))
1423 symset_add(&LAnl, TK_newline, 0);
1424 snnl = save_set(g, LAnl);
1425 LAnl = set_find(g, snnl);
1428 /* Add productions for this symbol */
1429 for (p2 = s->first_production;
1430 p2 < g->production_count &&
1431 g->productions[p2]->head == s;
1433 int itm = item_num(p2, 0);
1434 int pos = symset_find(&is->items, itm);
1436 if (g->productions[p2]->line_like)
1437 symset_add(&is->items, itm, snnl);
1439 symset_add(&is->items, itm, sn);
1440 /* Will have re-ordered, so start
1441 * from beginning again */
1443 } else if (type >= LALR) {
/* item already present: merge the new LA into its stored LA set */
1444 struct symset ss = set_find(g, is->items.data[pos]);
1445 struct symset tmp = INIT_SYMSET;
1446 struct symset *la = &LA;
1448 if (g->productions[p2]->line_like)
1450 symset_union(&tmp, &ss);
1451 if (symset_union(&tmp, la)) {
1452 is->items.data[pos] = save_set(g, tmp);
1460 For each symbol we found (and placed in `done`) we collect all the items
1461 for which this symbol is next, and create a new itemset with "DOT"
1462 advanced over the symbol. This is then added to the collection of
1463 itemsets (or merged with a pre-existing itemset).
1465 ###### derive itemsets
1466 // Now we have a completed itemset, so we need to
1467 // compute all the 'next' itemsets and create them
1468 // if they don't exist.
1469 for (i = 0; i < done.cnt; i++) {
1471 unsigned short state;
1472 struct symbol *sym = g->symtab[done.syms[i]];
1473 enum assoc assoc = Non;
1474 unsigned short precedence = 0;
1475 struct symset newitemset = INIT_SYMSET;
1477 newitemset = INIT_DATASET;
/* collect every item whose next symbol is `sym`, with DOT advanced */
1479 for (j = 0; j < is->items.cnt; j++) {
1480 int itm = is->items.syms[j];
1481 int p = item_prod(itm);
1482 int bp = item_dot(itm);
1483 struct production *pr = g->productions[p];
1484 unsigned short la = 0;
/* skip completed items and items not expecting `sym` next */
1487 if (bp == pr->body_size)
1489 if (pr->body[bp] != sym)
1492 la = is->items.data[j];
1493 pos = symset_find(&newitemset, pr->head->num);
1494 if (bp + 1 == pr->body_size &&
1495 pr->precedence > 0 &&
1496 pr->precedence > precedence) {
1497 // new itemset is reducible and has a precedence.
1498 precedence = pr->precedence;
1502 symset_add(&newitemset, item_num(p, bp+1), la);
1503 else if (type >= LALR) {
1504 // Need to merge la set.
1505 int la2 = newitemset.data[pos];
1507 struct symset ss = set_find(g, la2);
1508 struct symset LA = INIT_SYMSET;
1509 symset_union(&LA, &ss);
1510 ss = set_find(g, la);
1511 if (symset_union(&LA, &ss))
1512 newitemset.data[pos] = save_set(g, LA);
/* add (or merge) the derived itemset and record the transition */
1518 state = add_itemset(g, newitemset, assoc, precedence, type);
1519 if (symset_find(&is->go_to, done.syms[i]) < 0)
1520 symset_add(&is->go_to, done.syms[i], state);
1523 All that is left is to create the initial itemset from production zero, and
1524 with `TK_eof` as the LA set.
/* Seed the itemset list with production 0 (LA = {TK_eof}), then keep
 * completing and deriving itemsets until no itemset needs revisiting. */
1527 static void build_itemsets(struct grammar *g, enum grammar_type type)
1529 struct symset first = INIT_SYMSET;
1532 unsigned short la = 0;
1534 // LA set just has eof
1535 struct symset eof = INIT_SYMSET;
1536 symset_add(&eof, TK_eof, 0);
1537 la = save_set(g, eof);
1538 first = INIT_DATASET;
1540 // production 0, offset 0 (with no data)
1541 symset_add(&first, item_num(0, 0), la);
1542 add_itemset(g, first, Non, 0, type);
/* restart from the list head whenever check_again was set (an LALR
 * merge grew some LA set and cleared a "completed" flag) */
1543 for (check_again = 0, is = g->items;
1545 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1547 struct symset done = INIT_SYMSET;
1558 ### Completing the analysis.
1560 The exact process of analysis depends on which level was requested. For
1561 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1562 `LR1` we need first, but not follow. For `SLR` we need both.
1564 We don't build the "action" tables that you might expect as the parser
1565 doesn't use them. They will be calculated to some extent if needed for
1568 Once we have built everything we allocate arrays for the two lists:
1569 symbols and itemsets. This allows more efficient access during
1570 reporting. The symbols are grouped as terminals and then non-terminals,
1571 and we record the changeover point in `first_nonterm`.
1573 ###### grammar fields
1574 struct symbol **symtab;
1575 struct itemset **statetab;
1578 ###### grammar_analyse
/* Number the symbols (terminals first, then remaining non-virtual
 * symbols, with the changeover recorded in first_nonterm), build the
 * symtab array, run the itemset construction, and build statetab for
 * efficient access during reporting. */
1580 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1584 int snum = TK_reserved;
1585 for (s = g->syms; s; s = s->next)
1586 if (s->num < 0 && s->type == Terminal) {
1590 g->first_nonterm = snum;
1591 for (s = g->syms; s; s = s->next)
1592 if (s->num < 0 && s->type != Virtual) {
1596 for (s = g->syms; s; s = s->next)
/* symtab: symbol number -> symbol, for O(1) lookup */
1602 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1603 for (s = g->syms; s; s = s->next)
1604 g->symtab[s->num] = s;
1614 build_itemsets(g, type);
/* statetab: state number -> itemset */
1616 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1617 for (is = g->items; is ; is = is->next)
1618 g->statetab[is->state] = is;
1621 ## Reporting on the Grammar
1623 The purpose of the report is to give the grammar developer insight into
1624 how the grammar parser will work. It is basically a structured dump of
1625 all the tables that have been generated, plus a description of any conflicts.
1627 ###### grammar_report
/* Dump the analysis tables, returning the conflict count so the
 * caller can fail the run if the grammar has conflicts. */
1628 static int grammar_report(struct grammar *g, enum grammar_type type)
1634 return report_conflicts(g, type);
1637 Firstly we have the complete list of symbols, together with the
1638 "FIRST" set if that was generated. We add a mark to each symbol to
1639 show if it is considered to be "line-like" (`<`), or if it is nullable (`.`).
/* List every symbol with its flags: '.' nullable, '<' line-like,
 * then number and type, precedence/associativity if set, and the
 * FIRST set for nonterminals when it was computed. */
1643 static void report_symbols(struct grammar *g)
1647 printf("SYMBOLS + FIRST:\n");
1649 printf("SYMBOLS:\n");
1651 for (n = 0; n < g->num_syms; n++) {
1652 struct symbol *s = g->symtab[n];
1656 printf(" %c%c%3d%c: ",
1657 s->nullable ? '.':' ',
1658 s->line_like ? '<':' ',
1659 s->num, symtypes[s->type]);
1662 printf(" (%d%s)", s->precedence,
1663 assoc_names[s->assoc]);
1665 if (g->first && s->type == Nonterminal) {
1668 for (j = 0; j < g->first[n].cnt; j++) {
1671 prtxt(g->symtab[g->first[n].syms[j]]->name);
1678 Then we have the follow sets if they were computed.
/* Print the FOLLOW set of each symbol that has a non-empty one
 * (only computed for SLR analysis). */
1680 static void report_follow(struct grammar *g)
1683 printf("FOLLOW:\n");
1684 for (n = 0; n < g->num_syms; n++)
1685 if (g->follow[n].cnt) {
1689 prtxt(g->symtab[n]->name);
1690 for (j = 0; j < g->follow[n].cnt; j++) {
1693 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1699 And finally the item sets. These include the GO TO tables and, for
1700 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1701 it up a bit. First the items, with production number and associativity.
/* Print one item: "head -> body" with ". " marking DOT, plus the
 * production's precedence (if reducible), the next symbol's
 * precedence in brackets, and any line-like annotation. */
1703 static void report_item(struct grammar *g, int itm)
1705 int p = item_prod(itm);
1706 int dot = item_dot(itm);
1707 struct production *pr = g->productions[p];
1711 prtxt(pr->head->name);
1713 for (i = 0; i < pr->body_size; i++) {
1714 printf(" %s", (dot == i ? ". ": ""));
1715 prtxt(pr->body[i]->name);
1717 if (dot == pr->body_size)
1720 if (pr->precedence && dot == pr->body_size)
1721 printf(" (%d%s)", pr->precedence,
1722 assoc_names[pr->assoc]);
1723 if (dot < pr->body_size &&
1724 pr->body[dot]->precedence) {
1725 struct symbol *s = pr->body[dot];
1726 printf(" [%d%s]", s->precedence,
1727 assoc_names[s->assoc]);
1729 if (pr->line_like == 1)
1730 printf(" $$NEWLINE");
1731 else if (pr->line_like)
1736 The LA sets which are (possibly) reported with each item:
/* Print the look-ahead set identified by the saved set number. */
1738 static void report_la(struct grammar *g, int lanum)
1740 struct symset la = set_find(g, lanum);
1744 printf(" LOOK AHEAD(%d)", lanum);
1745 for (i = 0; i < la.cnt; i++) {
1748 prtxt(g->symtab[la.syms[i]]->name);
1753 Then the go to sets:
/* Print each "symbol -> state" entry of a state's go to table. */
1755 static void report_goto(struct grammar *g, struct symset gt)
1760 for (i = 0; i < gt.cnt; i++) {
1762 prtxt(g->symtab[gt.syms[i]]->name);
1763 printf(" -> %d\n", gt.data[i]);
1767 Now we can report all the item sets complete with items, LA sets, and GO TO.
/* Print every itemset: its flags, each item (with LA set when the
 * symset carries data), and the go to table. */
1769 static void report_itemsets(struct grammar *g)
1772 printf("ITEM SETS(%d)\n", g->states);
1773 for (s = 0; s < g->states; s++) {
1775 struct itemset *is = g->statetab[s];
1776 printf(" Itemset %d:%s min prefix=%d",
1777 s, is->starts_line?" (startsline)":"", is->min_prefix);
1779 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1781 for (j = 0; j < is->items.cnt; j++) {
1782 report_item(g, is->items.syms[j]);
/* LA sets only exist when the itemset carries per-item data */
1783 if (is->items.data != NO_DATA)
1784 report_la(g, is->items.data[j]);
1786 report_goto(g, is->go_to);
1790 ### Reporting conflicts
1792 Conflict detection varies a lot among different analysis levels. However
1793 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1794 LR1 are also similar as they have FOLLOW or LA sets.
1798 ## conflict functions
/* Dispatch to the right conflict detector for the analysis level
 * and return the number of conflicts found. */
1800 static int report_conflicts(struct grammar *g, enum grammar_type type)
1803 printf("Conflicts:\n");
1805 cnt = conflicts_lr0(g, type);
1807 cnt = conflicts_slr(g, type);
1809 printf(" - no conflicts\n");
1813 LR0 conflicts occur in any state which has both a reducible item and
1814 a shiftable item, or two reducible items.
1816 LR05 conflicts only occur if two possible reductions exist,
1817 as shifts always over-ride reductions.
1819 ###### conflict functions
/* LR0/LR05 conflict detection: for LR0, a state with both a reducible
 * and a shiftable item conflicts; for both levels, two reducible
 * items in one state conflict. */
1820 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1824 for (i = 0; i < g->states; i++) {
1825 struct itemset *is = g->statetab[i];
1826 int last_reduce = -1;
1827 int prev_reduce = -1;
1828 int last_shift = -1;
1832 for (j = 0; j < is->items.cnt; j++) {
1833 int itm = is->items.syms[j];
1834 int p = item_prod(itm);
1835 int bp = item_dot(itm);
1836 struct production *pr = g->productions[p];
/* DOT at the end of the body => a reducible item */
1838 if (bp == pr->body_size) {
1839 prev_reduce = last_reduce;
/* next symbol is a terminal => a shiftable item */
1843 if (pr->body[bp]->type == Terminal)
1846 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1847 printf(" State %d has both SHIFT and REDUCE:\n", i);
1848 report_item(g, is->items.syms[last_shift]);
1849 report_item(g, is->items.syms[last_reduce]);
1852 if (prev_reduce >= 0) {
1853 printf(" State %d has 2 (or more) reducible items\n", i);
1854 report_item(g, is->items.syms[prev_reduce]);
1855 report_item(g, is->items.syms[last_reduce]);
1862 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1863 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1864 in the source of the look ahead set.
1866 We build two datasets to reflect the "action" table: one which maps
1867 terminals to items where that terminal could be shifted and another
1868 which maps terminals to items that could be reduced when the terminal
1869 is in look-ahead. We report when we get conflicts between the two.
1871 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1872 terminal, we ignore it. NEWLINE is handled specially with its own
1873 rules for when to shift and when to reduce. Conflicts are expected,
1874 but handled internally.
/* SLR/LALR/LR1 conflict detection: build per-state "shifts" and
 * "reduce" maps from terminals to items and report overlaps.
 * SHIFT/REDUCE on TK_newline is deliberately ignored (handled by the
 * parser's own newline rules, see above). */
1876 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1881 for (i = 0; i < g->states; i++) {
1882 struct itemset *is = g->statetab[i];
1883 struct symset shifts = INIT_DATASET;
1884 struct symset reduce = INIT_DATASET;
1888 /* First collect the shifts */
1889 for (j = 0; j < is->items.cnt; j++) {
1890 unsigned short itm = is->items.syms[j];
1891 int p = item_prod(itm);
1892 int bp = item_dot(itm);
1893 struct production *pr = g->productions[p];
1896 if (bp >= pr->body_size ||
1897 pr->body[bp]->type != Terminal)
1902 if (s->precedence && is->precedence)
1903 /* Precedence resolves this, so no conflict */
1906 if (symset_find(&shifts, s->num) < 0)
1907 symset_add(&shifts, s->num, itm);
1909 /* Now look for reductions and conflicts */
1910 for (j = 0; j < is->items.cnt; j++) {
1911 unsigned short itm = is->items.syms[j];
1912 int p = item_prod(itm);
1913 int bp = item_dot(itm);
1914 struct production *pr = g->productions[p];
1916 if (bp < pr->body_size)
/* SLR takes FOLLOW(head); LALR/LR1 take the item's saved LA set */
1921 la = g->follow[pr->head->num];
1923 la = set_find(g, is->items.data[j]);
1925 for (k = 0; k < la.cnt; k++) {
1926 int pos = symset_find(&shifts, la.syms[k]);
1927 if (pos >= 0 && la.syms[k] != TK_newline) {
1928 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1930 prtxt(g->symtab[la.syms[k]]->name);
1932 report_item(g, shifts.data[pos]);
1933 report_item(g, itm);
1935 pos = symset_find(&reduce, la.syms[k]);
1937 symset_add(&reduce, la.syms[k], itm);
1940 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1941 prtxt(g->symtab[la.syms[k]]->name);
1943 report_item(g, itm);
1944 report_item(g, reduce.data[pos]);
1948 symset_free(shifts);
1949 symset_free(reduce);
1955 ## Generating the parser
1957 The exported part of the parser is the `parse_XX` function, where the name
1958 `XX` is based on the name of the parser files.
1960 This takes a `code_node`, a partially initialized `token_config`, and an
1961 optional `FILE` to send tracing to. The `token_config` gets the list of
1962 known words added and then is used with the `code_node` to initialize the
1965 `parse_XX` then calls the library function `parser_run` to actually complete
1966 the parse. This needs the `states` table and functions to call the various
1967 pieces of code provided in the grammar file, so they are generated first.
1969 ###### parser_generate
/* Emit the tables and helper functions, then the exported parse_XX()
 * wrapper which registers the known words, opens the token stream,
 * and hands everything to parser_run(). */
1971 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1972 struct code_node *pre_reduce)
1978 gen_reduce(f, g, file, pre_reduce);
1981 fprintf(f, "#line 0 \"gen_parser\"\n");
1982 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1985 fprintf(f, "\tstruct token_state *tokens;\n");
1986 fprintf(f, "\tconfig->words_marks = known;\n");
1987 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1988 fprintf(f, "\ttokens = token_open(code, config);\n");
1989 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1990 fprintf(f, "\ttoken_close(tokens);\n");
1991 fprintf(f, "\treturn rv;\n");
1992 fprintf(f, "}\n\n");
1995 ### Known words table
1997 The known words table is simply an array of terminal symbols.
1998 The table of nonterminals used for tracing is a similar array.
/* Emit the known[] array of terminal names; terminals are numbered
 * contiguously from TK_reserved, so the loop stops at the first
 * non-terminal. */
2002 static void gen_known(FILE *f, struct grammar *g)
2005 fprintf(f, "#line 0 \"gen_known\"\n");
2006 fprintf(f, "static const char *known[] = {\n");
2007 for (i = TK_reserved;
2008 i < g->num_syms && g->symtab[i]->type == Terminal;
2010 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2011 g->symtab[i]->name.txt);
2012 fprintf(f, "};\n\n");
/* Emit the non_term[] array of nonterminal names, used for tracing. */
2015 static void gen_non_term(FILE *f, struct grammar *g)
2018 fprintf(f, "#line 0 \"gen_non_term\"\n");
2019 fprintf(f, "static const char *non_term[] = {\n");
2020 for (i = TK_reserved;
2023 if (g->symtab[i]->type == Nonterminal)
2024 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2025 g->symtab[i]->name.txt);
2026 fprintf(f, "};\n\n");
2029 ### States and the goto tables.
2031 For each state we record the goto table and details of the reducible
2032 production if there is one.
2033 Some of the details of the reducible production are stored in the
2034 `do_reduce` function to come later. Here we store the production
2035 number, the body size (useful for stack management), and the resulting
2036 symbol (useful for knowing how to free data later).
2038 The go to table is stored in a simple array of `sym` and corresponding
2041 ###### exported types
2049 const struct lookup * go_to;
/* Emit one goto_N[] lookup array per state, each entry pairing a
 * symbol number with the destination state. */
2060 static void gen_goto(FILE *f, struct grammar *g)
2063 fprintf(f, "#line 0 \"gen_goto\"\n");
2064 for (i = 0; i < g->states; i++) {
2066 fprintf(f, "static const struct lookup goto_%d[] = {\n",
2068 struct symset gt = g->statetab[i]->go_to;
2069 for (j = 0; j < gt.cnt; j++)
2070 fprintf(f, "\t{ %d, %d },\n",
2071 gt.syms[j], gt.data[j]);
/* Emit the states[] table: per state, the go to array and details of
 * the reducible production (longest body wins), or -1 placeholders
 * when the state has no reduction. */
2076 static void gen_states(FILE *f, struct grammar *g)
2079 fprintf(f, "#line 0 \"gen_states\"\n");
2080 fprintf(f, "static const struct state states[] = {\n");
2081 for (i = 0; i < g->states; i++) {
2082 struct itemset *is = g->statetab[i];
2083 int j, prod = -1, prod_len;
2085 for (j = 0; j < is->items.cnt; j++) {
2086 int itm = is->items.syms[j];
2087 int p = item_prod(itm);
2088 int bp = item_dot(itm);
2089 struct production *pr = g->productions[p];
2091 if (bp < pr->body_size)
2093 /* This is what we reduce - choose longest */
2094 if (prod < 0 || prod_len < pr->body_size) {
2096 prod_len = pr->body_size;
2101 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2102 i, is->go_to.cnt, i, prod,
2103 g->productions[prod]->body_size,
2104 g->productions[prod]->head->num,
2106 g->productions[prod]->line_like,
/* no reducible item: -1 sentinels for production details */
2109 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2110 i, is->go_to.cnt, i,
2111 is->starts_line, is->min_prefix);
2113 fprintf(f, "};\n\n");
2116 ### The `do_reduce` function and the code
2118 When the parser engine decides to reduce a production, it calls
2119 `do_reduce` which runs the code that was included with the production,
2122 This code needs to be able to store data somewhere. Rather than
2123 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2124 buffer and have `do_reduce` return the size to be saved.
2126 In order for the code to access "global" context, we pass in the
2127 "config" pointer that was passed to the parser function. If the `struct
2128 token_config` is embedded in some larger structure, the reducing code
2129 can access the larger structure using pointer manipulation. Performing
2130 that pointer manipulation, and any other common code or variables for
2131 all reduce actions, can be provided in code node called "reduce" which
2132 is passed around in `pre_reduce` which you might have already noticed.
2134 The code fragments require translation when written out. Any `$N` needs
2135 to be converted to a reference either to that buffer (if `$0`) or to the
2136 structure returned by a previous reduction. These pointers need to be
2137 cast to the appropriate type for each access. All this is handled in
2140 `gen_code` also allows symbol references to contain a '`<`' as in
2141 '`$<2`'. This is particularly useful for references (or pointers), but
2142 can be used with structures too. The `<` implies that the value is
2143 being moved out, so the object will not be automatically freed. It is
2144 equivalent to assigning `NULL` to the pointer or filling a structure
2147 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2148 and possibly a number. A number by itself (other than zero) selects a
2149 symbol from the body of the production. A sequence of letters selects
2150 the shortest symbol in the body which contains those letters in the
2151 given order. If a number follows the letters, then a later occurrence
2152 of that symbol is chosen. So "`$AB2`" will refer to the structure
2153 attached to the second occurrence of the shortest symbol which contains
2154 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2155 the number given is too large, then the symbol reference is not
2156 transformed, and will cause an error when the code is compiled.
/* Find char `c` in text `t`, starting at offset `s`; used by
 * subseq_match below. */
2160 static int textchr(struct text t, char c, int s)
2163 for (i = s; i < t.len; i++)
/* Check whether `name` contains the chars of `seq` in order. */
2169 static int subseq_match(char *seq, int slen, struct text name)
2173 st = textchr(name, *seq, st);
/* Resolve a $-reference (letters selecting the shortest matching body
 * symbol, optionally followed by an occurrence number) to a body
 * position, advancing *namep past the consumed reference. */
2183 static int choose_sym(char **namep, int len, struct production *p)
2185 char *name = *namep;
/* consume leading letters of the reference */
2194 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) ) {
2200 while (len > 0 && (c >= '0' && c <= '9')) {
2203 n = n * 10 + (c - '0');
/* pick the shortest body symbol whose name contains the letters
 * as a subsequence; ties make the choice unusable */
2213 for (i = 0; i < p->body_size; i++) {
2214 if (!subseq_match(nam, namlen, p->body[i]->name))
2216 if (slen == 0 || p->body[i]->name.len < slen)
2218 if (s >= 0 && p->body[i] != p->body[s] &&
2219 p->body[i]->name.len == p->body[s]->name.len)
2220 /* not unique, so s cannot be used */
/* locate the requested occurrence of the chosen symbol */
2227 for (i = 0; i < p->body_size; i++)
2228 if (p->body[i] == p->body[s]) {
/* Translate a production's code fragment: rewrite $N / $<ref as casts
 * into `ret` ($0) or the appropriate body[] slot, tracking `used`
 * slots (moved out with $<) so they can be cleared afterwards. */
2239 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2242 char *used = calloc(1, p->body_size);
2245 fprintf(f, "\t\t\t");
2246 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2260 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
/* $0: the result object, typed by the head's struct name */
2269 fprintf(f, "(*(struct %.*s*%s)ret)",
2270 p->head->struct_name.len,
2271 p->head->struct_name.txt,
2272 p->head->isref ? "*":"");
2273 else if (p->body[n-1]->type == Terminal)
2274 fprintf(f, "(*(struct token *)body[%d])",
/* untyped symbol: leave the reference untransformed */
2276 else if (p->body[n-1]->struct_name.txt == NULL)
2277 fprintf(f, "$%d", n);
2279 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2280 p->body[n-1]->struct_name.len,
2281 p->body[n-1]->struct_name.txt,
2282 p->body[n-1]->isref ? "*":"", n-1);
/* clear any slot whose value was moved out with $< */
2288 for (i = 0; i < p->body_size; i++) {
2289 if (p->body[i]->struct_name.txt &&
2291 // assume this has been copied out
2292 if (p->body[i]->isref)
2293 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2295 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
/* Emit do_reduce(): the shared "reduce" code node first, then one
 * switch case per production running its translated code fragment and
 * setting ret_size to the size of the result structure. */
2303 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2304 struct code_node *code)
2307 fprintf(f, "#line 1 \"gen_reduce\"\n");
2308 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2310 fprintf(f, "\tint ret_size = 0;\n");
2312 code_node_print(f, code, file);
2314 fprintf(f, "#line 4 \"gen_reduce\"\n");
2315 fprintf(f, "\tswitch(prod) {\n");
2316 for (i = 0; i < g->production_count; i++) {
2317 struct production *p = g->productions[i];
2318 fprintf(f, "\tcase %d:\n", i);
/* point compiler diagnostics at the grammar file */
2321 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2325 if (p->head->struct_name.txt)
2326 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2327 p->head->struct_name.len,
2328 p->head->struct_name.txt,
2329 p->head->isref ? "*":"");
2331 fprintf(f, "\t\tbreak;\n");
2333 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2338 As each non-terminal can potentially cause a different type of data
2339 structure to be allocated and filled in, we need to be able to free it when
2342 It is particularly important to have fine control over freeing during error
2343 recovery where individual stack frames might need to be freed.
2345 For this, the grammar author is required to define a `free_XX` function for
2346 each structure that is used by a non-terminal. `do_free` will call whichever
2347 is appropriate given a symbol number, and will call `free` (as is
2348 appropriate for tokens) on any terminal symbol.
/* Emit do_free(): terminals (sym < first_nonterm) are plain tokens and
 * just free()d; nonterminals with a struct_name dispatch to the
 * author-provided free_XX(), dereferencing first for isref symbols. */
2352 static void gen_free(FILE *f, struct grammar *g)
2356 fprintf(f, "#line 0 \"gen_free\"\n");
2357 fprintf(f, "static void do_free(short sym, void *asn)\n");
2359 fprintf(f, "\tif (!asn) return;\n");
2360 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2361 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2362 fprintf(f, "\tswitch(sym) {\n");
2364 for (i = 0; i < g->num_syms; i++) {
2365 struct symbol *s = g->symtab[i];
/* skip symbols with no registered structure */
2367 s->type != Nonterminal ||
2368 s->struct_name.txt == NULL)
2371 fprintf(f, "\tcase %d:\n", s->num);
2373 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2375 s->struct_name.txt);
2376 fprintf(f, "\t\tfree(asn);\n");
2378 fprintf(f, "\t\tfree_%.*s(asn);\n",
2380 s->struct_name.txt);
2381 fprintf(f, "\t\tbreak;\n");
2383 fprintf(f, "\t}\n}\n\n");
2386 ## The main routine.
2388 There are three key parts to the "main" function of parsergen: processing
2389 the arguments, loading the grammar file, and dealing with the grammar.
2391 ### Argument processing.
2393 Command line options allow the selection of analysis mode, name of output
2394 file, and whether or not a report should be generated. By default we create
2395 a report only if no code output was requested.
2397 The `parse_XX` function name uses the basename of the output file which
2398 should not have a suffix (`.c`). `.c` is added to the given name for the
2399 code, and `.h` is added for the header (if header text is specified in the
2406 static const struct option long_options[] = {
2407 { "LR0", 0, NULL, '0' },
2408 { "LR05", 0, NULL, '5' },
2409 { "SLR", 0, NULL, 'S' },
2410 { "LALR", 0, NULL, 'L' },
2411 { "LR1", 0, NULL, '1' },
2412 { "tag", 1, NULL, 't' },
2413 { "report", 0, NULL, 'R' },
2414 { "output", 1, NULL, 'o' },
2415 { NULL, 0, NULL, 0 }
2417 const char *options = "05SL1t:Ro:";
2419 ###### process arguments
2421 char *outfile = NULL;
2426 enum grammar_type type = LR05;
2427 while ((opt = getopt_long(argc, argv, options,
2428 long_options, NULL)) != -1) {
2443 outfile = optarg; break;
2445 tag = optarg; break;
2447 fprintf(stderr, "Usage: parsergen ...\n");
2452 infile = argv[optind++];
2454 fprintf(stderr, "No input file given\n");
2457 if (outfile && report == 1)
2460 if (name && strchr(name, '/'))
2461 name = strrchr(name, '/')+1;
2463 if (optind < argc) {
2464 fprintf(stderr, "Excess command line arguments\n");
2468 ### Loading the grammar file
2470 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2473 Once we have extracted the code (with `mdcode`) we expect to find three
2474 or four sections: header, code, grammar, reduce.
2475 Anything else that is not excluded by the `--tag` option is an error.
2477 "header", "code", and "reduce" are optional, though it is hard to build
2478 a working parser without the first two. "grammar" must be provided.
2482 #include <sys/mman.h>
/* Error callback handed to code_extract(): print the message. */
2487 static void pr_err(char *msg)
2490 fprintf(stderr, "%s\n", msg);
2494 struct section *table;
2498 fd = open(infile, O_RDONLY);
2500 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2501 infile, strerror(errno));
2504 len = lseek(fd, 0, 2);
2505 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2506 table = code_extract(file, file+len, pr_err);
2508 struct code_node *hdr = NULL;
2509 struct code_node *code = NULL;
2510 struct code_node *gram = NULL;
2511 struct code_node *pre_reduce = NULL;
2512 for (s = table; s; s = s->next) {
2513 struct text sec = s->section;
2514 if (tag && !strip_tag(&sec, tag))
2516 if (text_is(sec, "header"))
2518 else if (text_is(sec, "code"))
2520 else if (text_is(sec, "grammar"))
2522 else if (text_is(sec, "reduce"))
2523 pre_reduce = s->code;
2525 fprintf(stderr, "Unknown content section: %.*s\n",
2526 s->section.len, s->section.txt);
2531 ### Processing the input
2533 As we need to append an extension to a filename and then open it for
2534 writing, and we need to do this twice, it helps to have a separate function.
/* Open "base"+"ext" for writing; used for both the .c and .h output. */
2538 static FILE *open_ext(char *base, char *ext)
2540 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2542 strcat(strcpy(fn, base), ext);
2548 If we can read the grammar, then we analyse and optionally report on it. If
2549 the report finds conflicts we will exit with an error status.
2551 ###### process input
2552 struct grammar *g = NULL;
2554 fprintf(stderr, "No grammar section provided\n");
2557 g = grammar_read(gram);
2559 fprintf(stderr, "Failure to parse grammar\n");
2564 grammar_analyse(g, type);
2566 if (grammar_report(g, type))
2570 If a "headers" section is defined, we write it out and include a declaration
2571 for the `parse_XX` function so it can be used from separate code.
2573 if (rv == 0 && hdr && outfile) {
2574 FILE *f = open_ext(outfile, ".h");
2576 code_node_print(f, hdr, infile);
2577 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2581 fprintf(stderr, "Cannot create %s.h\n",
2587 And if all goes well and an output file was provided, we create the `.c`
2588 file with the code section (if any) and the parser tables and function.
2590 if (rv == 0 && outfile) {
2591 FILE *f = open_ext(outfile, ".c");
2594 code_node_print(f, code, infile);
2595 gen_parser(f, g, infile, name, pre_reduce);
2598 fprintf(stderr, "Cannot create %s.c\n",
2604 And that about wraps it up. We need to set the locale so that UTF-8 is
2605 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2607 ###### File: parsergen.mk
2608 parsergen : parsergen.o libscanner.o libmdcode.o
2609 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2616 int main(int argc, char *argv[])
2621 setlocale(LC_ALL,"");
2623 ## process arguments
2630 ## The SHIFT/REDUCE parser
2632 Having analysed the grammar and generated all the tables, we only need
2633 the shift/reduce engine to bring it all together.
2635 ### Goto table lookup
2637 The parser generator has nicely provided us with goto tables sorted by
2638 symbol number. We need a binary search function to find a symbol in the
2641 ###### parser functions
/* Look up `sym` in this state's goto table with a binary search
 * (the generator emits go_to sorted by symbol number).  Callers
 * treat a negative result as "no goto entry for this symbol".
 * NOTE(review): interior lines of this function are elided in this
 * extract; comments describe only the visible code.
 */
2643 static int search(const struct state *l, int sym)
2646 int hi = l->go_to_cnt;
2650 while (lo + 1 < hi) {
2651 int mid = (lo + hi) / 2;
2652 if (l->go_to[mid].sym <= sym)
2657 if (l->go_to[lo].sym == sym)
2658 return l->go_to[lo].state;
2663 ### Memory allocation
2665 The `scanner` returns tokens in a local variable - we want them in allocated
2666 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2667 a reduce is in a large buffer. Both of these require some allocation and
2668 copying, hence `memdup` and `tok_copy`.
2670 ###### parser includes
2673 ###### parser functions
/* Copy `len` bytes starting at `m` into newly allocated memory,
 * so scanner tokens and reduce results can live on the parser's
 * asn_stack (see the surrounding prose).
 */
2675 void *memdup(void *m, int len)
2681 memcpy(ret, m, len);
/* Duplicate a by-value token into malloc()ed storage so it can be
 * pushed onto the asn_stack and outlive the scanner's local copy.
 */
2685 static struct token *tok_copy(struct token tk)
2687 struct token *new = malloc(sizeof(*new));
2692 ### The state stack.
2694 The core data structure for the parser is the stack. This tracks all
2695 the symbols that have been recognised or partially recognised.
2697 The stack usually won't grow very large - maybe a few tens of entries.
2698 So we dynamically grow an array as required but never bother to
2699 shrink it down again.
2701 We keep the stack as two separate allocations. One, `asn_stack` stores
2702 the "abstract syntax nodes" which are created by each reduction. When
2703 we call `do_reduce` we need to pass an array of the `asn`s of the body
2704 of the production, and by keeping a separate `asn` stack, we can just
2705 pass a pointer into this stack.
2707 The other allocation stores all other stack fields of which there are
2708 several. The `state` is the most important one and guides the parsing
2709 process. The `sym` is nearly unnecessary as it is implicit in the
2710 state. However when we want to free entries from the `asn_stack`, it
2711 helps to know what type they are so we can call the right freeing
2712 function. The symbol leads us to the right free function through
2715 The `indents` count tracks the line indents within the symbol or
2716 immediately following it. These are used to allow indent information to
2717 guide parsing and error recovery.
2719 `since_newline` tracks how many stack frames since the last
2720 start-of-line (whether indented or not). So if `since_newline` is
2721 zero, then this symbol is at the start of a line. Similarly
2722 `since_indent` counts the number of states since an indent, it is zero
2723 precisely when `indents` is not zero.
2725 `newline_permitted` keeps track of whether newlines should be ignored
2728 The stack is most properly seen as alternating states and symbols -
2729 states, like the 'DOT' in items, are between symbols. Each frame in
2730 our stack holds a state and the symbol that was before it. The
2731 bottom of stack holds the start state but no symbol, as nothing came
2732 before the beginning. As we need to store some value, `TK_eof` is used
2733 to mark the beginning of the file as well as the end.
2735 ###### parser functions
2740 short newline_permitted;
2744 short since_newline;
2754 Two operations are needed on the stack - shift (which is like push) and pop.
2756 Shift applies not only to terminals but also to non-terminals. When we
2757 reduce a production we will pop off frames corresponding to the body
2758 symbols, then push on a frame for the head of the production. This last
2759 is exactly the same process as shifting in a terminal so we use the same
2760 function for both. In both cases we provide the symbol, the number of
2761 indents the symbol contains (which will be zero for a terminal symbol)
2762 and a flag indicating that the symbol was at (or was reduced from a
2763 symbol which was at) the start of a line. The state is deduced from the
2764 current top-of-stack state and the new symbol.
2766 To simplify other code we arrange for `shift` to fail if there is no `goto`
2767 state for the symbol. This is useful in basic parsing due to our design
2768 that we shift when we can, and reduce when we cannot. So the `shift`
2769 function reports if it could.
2771 `shift` is also used to push state zero onto the stack, so if the
2772 stack is empty, it always chooses zero as the next state.
2774 So `shift` finds the next state. If that succeeds it extends the
2775 allocations if needed and pushes all the information onto the stacks.
2777 Newlines are permitted after a `starts_line` state until an internal
2778 indent. If the new frame has neither a `starts_line` state nor an
2779 indent, newlines are permitted if the previous stack frame permitted
2782 ###### parser functions
/* Push a frame for `sym` (terminal or non-terminal) onto the stack.
 * The new state comes from the goto table of the current top state;
 * an empty stack yields state zero.  Reports whether the shift was
 * possible - callers reduce (or recover) when it was not.
 * `indents` is the number of indents the symbol contains (zero for
 * a terminal) and `start_of_line` flags a symbol at, or reduced
 * from one at, the start of a line; both feed the 2D bookkeeping.
 * NOTE(review): interior lines are elided in this extract.
 */
2784 static int shift(struct parser *p,
2785 short sym, short indents, short start_of_line,
2787 const struct state states[])
2789 // Push an entry onto the stack
2790 struct frame next = {0};
2791 int newstate = p->tos
2792 ? search(&states[p->stack[p->tos-1].state],
// Grow the two parallel stacks (frames and asns) in lock-step.
2797 if (p->tos >= p->stack_size) {
2798 p->stack_size += 10;
2799 p->stack = realloc(p->stack, p->stack_size
2800 * sizeof(p->stack[0]));
2801 p->asn_stack = realloc(p->asn_stack, p->stack_size
2802 * sizeof(p->asn_stack[0]));
2805 next.indents = indents;
2806 next.state = newstate;
// Newlines are permitted after a starts_line state until an
// internal indent; otherwise inherited from the previous frame
// (see the prose above).
2807 if (states[newstate].starts_line)
2808 next.newline_permitted = 1;
2810 next.newline_permitted = 0;
2812 next.newline_permitted =
2813 p->stack[p->tos-1].newline_permitted;
2815 next.newline_permitted = 0;
2817 if (!start_of_line) {
2819 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2821 next.since_newline = 1;
// since_indent is zero precisely when the frame carries indents.
2824 next.since_indent = 0;
2826 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2828 next.since_indent = 1;
2830 p->stack[p->tos] = next;
2831 p->asn_stack[p->tos] = asn;
2836 `pop` primarily moves the top of stack (`tos`) back down the required
2837 amount and frees any `asn` entries that need to be freed. It also
2838 collects a summary of the indents and line starts in the symbols that
2839 are being removed. It is called _after_ we reduce a production, just
2840 before we `shift` the nonterminal in.
2842 ###### parser functions
/* Remove `num` frames from the stack: free each asn via do_free,
 * sum the indents of the removed frames (the return value) and
 * record through *start_of_line whether any removed frame was at
 * the start of a line.  Called after a reduce, just before the
 * nonterminal is shifted in.
 */
2844 static int pop(struct parser *p, int num,
2845 short *start_of_line,
2846 void(*do_free)(short sym, void *asn))
2853 for (i = 0; i < num; i++) {
2854 sol |= !p->stack[p->tos+i].since_newline;
2855 indents += p->stack[p->tos+i].indents;
2856 do_free(p->stack[p->tos+i].sym,
2857 p->asn_stack[p->tos+i]);
2860 *start_of_line = sol;
2864 ### The heart of the parser.
2866 Now we have the parser. For each token we might shift it, trigger a
2867 reduction, or start error handling. 2D tokens (IN, OUT, EOL) also need
2870 We return whatever `asn` was returned by reducing production zero.
2872 If we can neither shift nor reduce we have an error to handle. We pop
2873 single entries off the stack until we can shift the `TK_error` symbol,
2874 then drop input tokens until we find one we can shift into the new error
2875 state. We need to ensure that something is dropped or shifted after an
2876 error, or we could get into an infinite loop, only shifting in
2877 `TK_error`, then reducing. So we track if there has been a shift since
2878 the last error, and if not the next error always discards one token.
2880 When we find `TK_in` and `TK_out` tokens which report indents we need
2881 to handle them directly as the grammar cannot express what we want to
2884 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2885 record how many indents there are following the previous token.
2887 `TK_out` tokens must be canceled against an indent count
2888 within the stack. If we can reduce some symbols that are all since
2889 the most recent indent, then we do that first. If the minimum prefix
2890 of the current state then extends back before the most recent indent,
2891 that indent can be cancelled. If the minimum prefix is shorter, then
2892 the indent had ended prematurely and we must start error handling, which
2893 is still a work-in-progress.
2895 `TK_newline` tokens are ignored unless the top stack frame records
2896 that they are permitted. In that case they will not be considered for
2897 shifting if it is possible to reduce some symbols that are all since
2898 the most recent start of line. This is how a newline forcibly
2899 terminates any line-like structure - we try to reduce down to at most
2900 one symbol for each line where newlines are allowed.
2901 A consequence of this is that a rule like
2903 ###### Example: newlines - broken
2907 IfStatement -> Newlines if ....
2909 cannot work, as the NEWLINE will never be shifted as the empty string
2910 will be reduced first. Optional sets of newlines need to be included
2911 in the thing that precedes:
2913 ###### Example: newlines - works
2917 IfStatement -> If ....
2919 Here the NEWLINE will be shifted because nothing can be reduced until
2922 When during error handling we discard tokens read in, we want to keep
2923 discarding until we see one that is recognised. If we had a full set
2924 of LR(1) grammar states, this would mean looking in the look-ahead set,
2925 but we don't keep a full look-ahead set. We only record the subset
2926 that leads to SHIFT. We can, however, deduce the look-ahead set by
2927 looking at the SHIFT subsets for all states that we can get to by
2928 reducing zero or more times. So we need a little function which
2929 checks if a given token is in any of these look-ahead sets.
2931 ###### parser includes
/* Deduce whether `tk` is in the look-ahead set of `state`: only the
 * SHIFT subset of each look-ahead set is recorded, so also check
 * every state reachable by zero or more default reductions.  Used
 * during error recovery to decide which input tokens to discard.
 */
2936 static int in_lookahead(struct token *tk, const struct state *states, int state)
2938 while (state >= 0) {
2939 if (search(&states[state], tk->num) >= 0)
2941 if (states[state].reduce_prod < 0)
2943 state = search(&states[state], states[state].reduce_sym);
/* The heart of the parser: the shift/reduce engine.  For each token
 * try to shift, otherwise reduce, otherwise enter error recovery
 * (pop frames until a state accepts TK_error, then discard input
 * until a token in the look-ahead appears).  TK_in, TK_out and
 * TK_newline receive special 2D handling before normal
 * shift/reduce.  Returns whatever asn reducing production zero
 * produced.
 * NOTE(review): many interior lines are elided in this extract;
 * comments describe only the visible code.
 */
2948 void *parser_run(struct token_state *tokens,
2949 const struct state states[],
2950 int (*do_reduce)(int, void**, struct token_config*, void*),
2951 void (*do_free)(short, void*),
2952 FILE *trace, const char *non_term[],
2953 struct token_config *config)
2955 struct parser p = { 0 };
2956 struct token *tk = NULL;
2958 int shift_since_err = 1;
2961 shift(&p, TK_eof, 0, 1, NULL, states);
2962 while (!accepted && p.tos > 0) {
2963 struct token *err_tk;
2964 struct frame *tos = &p.stack[p.tos-1];
2966 tk = tok_copy(token_next(tokens));
2967 parser_trace(trace, &p,
2968 tk, states, non_term, config->known_count);
// TK_in: record the indent in the top frame; it now follows an
// indent, so it is no longer at start of line.
2970 if (tk->num == TK_in) {
2972 tos->since_newline = 0;
2973 tos->since_indent = 0;
2974 if (!states[tos->state].starts_line)
2975 tos->newline_permitted = 0;
2978 parser_trace_action(trace, "Record");
// TK_out: cancel against an indent recorded in the stack, after
// first reducing anything wholly since the most recent indent.
2981 if (tk->num == TK_out) {
2982 if (states[tos->state].reduce_size >= 0 &&
2983 states[tos->state].reduce_size <= tos->since_indent)
2985 if (states[tos->state].min_prefix >= tos->since_indent) {
2987 struct frame *in = tos - tos->since_indent;
2989 if (in->indents == 0) {
2990 /* Reassess since_indent and newline_permitted */
2992 in->since_indent = in[-1].since_indent + 1;
2993 in->newline_permitted = in[-1].newline_permitted;
2995 in->since_indent = 0;
2996 in->newline_permitted = 0;
2998 if (states[in->state].starts_line)
2999 in->newline_permitted = 1;
3002 in->since_indent = in[-1].since_indent + 1;
3003 if (states[in->state].starts_line)
3004 in->newline_permitted = 1;
3006 in->newline_permitted = in[-1].newline_permitted;
3011 parser_trace_action(trace, "Cancel");
3014 // fall through to error handling as both SHIFT and REDUCE
// TK_newline: ignored unless permitted; even then reduce first if
// everything reducible is since the last start of line.
3017 if (tk->num == TK_newline) {
3018 if (!tos->newline_permitted) {
3021 parser_trace_action(trace, "Discard");
3024 if (tos->since_newline > 1 &&
3025 states[tos->state].reduce_size >= 0 &&
3026 states[tos->state].reduce_size <= tos->since_newline)
3029 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3030 shift_since_err = 1;
3032 parser_trace_action(trace, "Shift");
3036 if (states[tos->state].reduce_prod >= 0 &&
3037 states[tos->state].newline_only &&
3038 !(tk->num == TK_newline ||
3039 tk->num == TK_eof ||
3040 tk->num == TK_out ||
3041 (tos->indents == 0 && tos->since_newline == 0))) {
3042 /* Anything other than newline or out or eof
3043 * is an error unless we are already at start
3044 * of line, as this production must end at EOL.
3046 } else if (states[tos->state].reduce_prod >= 0) {
3049 const struct state *nextstate = &states[tos->state];
3050 int prod = nextstate->reduce_prod;
3051 int size = nextstate->reduce_size;
3053 static char buf[16*1024];
3054 short indents, start_of_line;
3056 body = p.asn_stack + (p.tos - size);
3058 bufsize = do_reduce(prod, body, config, buf);
3060 indents = pop(&p, size, &start_of_line,
3062 res = memdup(buf, bufsize);
3063 memset(buf, 0, bufsize);
3064 if (!shift(&p, nextstate->reduce_sym,
3065 indents, start_of_line,
3066 /* a failed shift of a reduce_sym can only be production 0 (accept) */
3067 if (prod != 0) abort();
3071 parser_trace_action(trace, "Reduce");
3074 /* Error. We walk up the stack until we
3075 * find a state which will accept TK_error.
3076 * We then shift in TK_error and see what state
3077 * that takes us to.
3078 * Then we discard input tokens until
3079 * we find one that is acceptable.
3081 parser_trace_action(trace, "ERROR");
3082 short indents = 0, start_of_line;
3084 err_tk = tok_copy(*tk);
3086 shift(&p, TK_error, 0, 0,
3087 err_tk, states) == 0)
3088 // discard this state
3089 indents += pop(&p, 1, &start_of_line, do_free);
3092 // no state accepted TK_error
// Ensure progress: if nothing was shifted since the last error,
// at least one token must be discarded to avoid an infinite loop.
3095 if (!shift_since_err) {
3096 /* must discard at least one token to avoid
3099 if (tk->num == TK_eof)
3102 tk = tok_copy(token_next(tokens));
3104 shift_since_err = 0;
3105 tos = &p.stack[p.tos-1];
3106 while (!in_lookahead(tk, states, tos->state) &&
3107 tk->num != TK_eof) {
3109 tk = tok_copy(token_next(tokens));
3110 shift_since_err = 1;
3111 if (tk->num == TK_in)
3113 if (tk->num == TK_out) {
3117 // FIXME update since_indent here
3120 tos->indents += indents;
3123 pop(&p, p.tos, NULL, do_free);
3129 ###### exported functions
3130 void *parser_run(struct token_state *tokens,
3131 const struct state states[],
3132 int (*do_reduce)(int, void**, struct token_config*, void*),
3133 void (*do_free)(short, void*),
3134 FILE *trace, const char *non_term[],
3135 struct token_config *config);
3139 Being able to visualize the parser in action can be invaluable when
3140 debugging the parser code, or trying to understand how the parse of a
3141 particular grammar progresses. The stack contains all the important
3142 state, so just printing out the stack every time around the parse loop
3143 can make it possible to see exactly what is happening.
3145 This doesn't explicitly show each SHIFT and REDUCE action. However they
3146 are easily deduced from the change between consecutive lines, and the
3147 details of each state can be found by cross referencing the states list
3148 in the stack with the "report" that parsergen can generate.
3150 For terminal symbols, we just dump the token. For non-terminals we
3151 print the name of the symbol. The look ahead token is reported at the
3152 end inside square brackets.
3154 ###### parser functions
/* Printable names for reserved token numbers, indexed by token
 * number; used by the trace code to label terminals on the stack.
 */
3156 static char *reserved_words[] = {
3157 [TK_error] = "ERROR",
3160 [TK_newline] = "NEWLINE",
/* Print one frame's state number in parentheses, with 's' marking
 * a starts_line state and "n<count>" (frames since start of line)
 * appended when newlines are currently permitted.
 */
3163 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3165 fprintf(trace, "(%d", f->state);
3166 if (states[f->state].starts_line)
3167 fprintf(trace, "s");
3168 if (f->newline_permitted)
3169 fprintf(trace, "n%d", f->since_newline);
3170 fprintf(trace, ") ");
/* Dump the whole stack, then the look-ahead token in square
 * brackets: reserved symbols by name, known terminals by their
 * token text (truncated to 20), non-terminals by name; each frame
 * also shows its indent count and state.
 * NOTE(review): interior lines are elided in this extract.
 */
3173 void parser_trace(FILE *trace, struct parser *p,
3174 struct token *tk, const struct state states[],
3175 const char *non_term[], int knowns)
3180 for (i = 0; i < p->tos; i++) {
3181 struct frame *f = &p->stack[i];
3184 if (sym < TK_reserved &&
3185 reserved_words[sym] != NULL)
3186 fputs(reserved_words[sym], trace);
3187 else if (sym < TK_reserved + knowns) {
// known terminal: the asn is the token itself
3188 struct token *t = p->asn_stack[i];
3189 text_dump(trace, t->txt, 20);
3191 fputs(non_term[sym - TK_reserved - knowns],
3194 fprintf(trace, ".%d", f->indents);
3195 if (f->since_newline == 0)
3199 parser_trace_state(trace, f, states);
3201 fprintf(trace, "[");
3202 if (tk->num < TK_reserved &&
3203 reserved_words[tk->num] != NULL)
3204 fputs(reserved_words[tk->num], trace);
3206 text_dump(trace, tk->txt, 20);
3207 fprintf(trace, ":%d:%d]", tk->line, tk->col);
/* Finish the current trace line with the name of the action just
 * taken (e.g. "Shift", "Reduce", "Discard" - see the call sites).
 */
3210 void parser_trace_action(FILE *trace, char *action)
3213 fprintf(trace, " - %s\n", action);
3218 The obvious example for a parser is a calculator.
3220 As `scanner` provides a number parsing function using `libgmp`, it is not much
3221 work to perform arbitrary rational number calculations.
3223 This calculator takes one expression, or an equality test, per line.
3224 The results are printed and if any equality test fails, the program
3225 exits with an error.
3227 ###### File: parsergen.mk
3228 calc.c calc.h : parsergen parsergen.mdc
3229 ./parsergen --tag calc -o calc parsergen.mdc
3230 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3231 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3233 ./calc parsergen.mdc
3238 #include "parse_number.h"
3239 // what do we use for a demo-grammar? A calculator of course.
3251 #include <sys/mman.h>
3257 #include "scanner.h"
3262 static void free_number(struct number *n)
/* Test whether text `t` exactly matches the NUL-terminated string
 * `s` (same length and same bytes). */
3268 static int text_is(struct text t, char *s)
3270 return (strlen(s) == t.len &&
3271 strncmp(s, t.txt, t.len) == 0);
/* Calculator demo driver: mmap the file named by argv[1], extract
 * its literate-code sections, and parse every "example: input"
 * section, tracing to stderr when a second argument is given.
 * NOTE(review): no argc check before using argv[1] and no error
 * checking on open/lseek/mmap - tolerable in a demo, flag for
 * production use.
 */
3274 int main(int argc, char *argv[])
3276 int fd = open(argv[1], O_RDONLY);
// whence value 2 is SEEK_END: lseek yields the file length
3277 int len = lseek(fd, 0, 2);
3278 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3279 struct section *table = code_extract(file, file+len, NULL);
3281 struct token_config config = {
3282 .ignored = (1 << TK_line_comment)
3285 .number_chars = ".,_+-",
3289 for (s = table; s; s = s->next)
3290 if (text_is(s->section, "example: input"))
3291 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3293 struct section *t = table->next;
3294 code_free(table->code);
3306 Session -> Session Line
3309 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3310 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3311 gmp_printf(" or as a decimal: %Fg\n", fl);
3315 | Expression = Expression NEWLINE ${
3316 if (mpq_equal($1.val, $3.val))
3317 gmp_printf("Both equal %Qd\n", $1.val);
3319 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3324 | NEWLINE ${ printf("Blank line\n"); }$
3325 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3328 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3329 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3330 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3331 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3332 | Expression // Expression ${ {
3335 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3336 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3337 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3338 mpz_tdiv_q(z0, z1, z2);
3339 mpq_set_z($0.val, z0);
3340 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3342 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3343 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3348 3.1415926535 - 355/113
3350 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3352 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1