1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
9 There are several distinct sections.
11 1. `grammar_read` will read a grammar from a literate-code file and
12 store the productions, symbols, and code fragments.
13 2. `grammar_analyse` will take that grammar and build LR parsing
14 states and look-ahead tables.
15 3. `grammar_report` will present the details of the analysis
16 in a readable format and will report any conflicts.
17 4. `parser_generate` will write out C code files with various
18 tables and with the code fragments arranged in useful places.
19 5. `parser_run` is a library function intended to be linked together
20 with the generated parser tables to complete the implementation of
22 6. Finally `calc` is a test grammar for a simple calculator. The
23 `parsergen` program built from the C code in this file can extract
24 that grammar directly from this file and process it.
26 ###### File: parsergen.c
31 ## forward declarations
42 ###### File: libparser.c
49 ###### File: parsergen.mk
52 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
55 ## Reading the grammar
57 The grammar must be stored in a markdown literate code file as parsed by
58 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
59 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
60 present, will be literally copied into the generated `.c` and `.h`
61 files. The third is required and contains the grammar which is
62 tokenised with "[scanner][]". `reduce` can provide variable
63 declarations and common initialization code for all the reduction code
64 fragments in the grammar.
66 If the `--tag` option is given, then any top level header that doesn't
67 start with the tag is ignored, and the tag is stripped from the rest. So
68 `--tag Foo` means that the four sections expected are `Foo: header`,
69 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
70 to extract the sample calculator from this file.
73 [scanner]: scanner.html
79 ###### parser includes
83 The grammar contains production sets, precedence/associativity
84 declarations, general terminal declarations, and data type declarations.
85 These are all parsed with _ad hoc_ parsing as we don't have a parser
88 The precedence and associativity can be set for each production, and
89 can be inherited from symbols. The data type (either structure or a
90 reference to a structure) is potentially defined for each non-terminal
91 and describes what C structure is used to store information about each
95 enum assoc {Left, Right, Non};
96 char *assoc_names[] = {"Left","Right","Non"};
99 struct text struct_name;
102 unsigned short precedence;
106 unsigned short precedence;
115 The strings reported by `mdcode` and `scanner` are `struct text` which
116 have length rather than being nul terminated. To help with printing and
117 comparing we define `text_is` and `prtxt`, which should possibly go in
118 `mdcode`. `scanner` does provide `text_dump` which is useful for
119 strings which might contain control characters.
121 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
122 because that is what we need to detect tags.
/* Compare a length-delimited `struct text` against a NUL-terminated C
 * string for exact equality (lengths must match, then bytes must match).
 * NOTE(review): this excerpt is missing interior lines (e.g. the opening
 * brace); code shown is preserved byte-for-byte. */
125 static int text_is(struct text t, char *s)
127 return (strlen(s) == t.len &&
128 strncmp(s, t.txt, t.len) == 0);
/* Print a `struct text` to stdout; `%.*s` bounds the output by t.len
 * because the text is not NUL-terminated.
 * NOTE(review): braces are missing from this excerpt. */
130 static void prtxt(struct text t)
132 printf("%.*s", t.len, t.txt);
/* Test whether *t begins with "tag:" and, if so, strip the tag, the
 * colon, and any following spaces from the front of *t (in place).
 * Like strncmp but with the extra colon check used to detect tags.
 * NOTE(review): the failure return and the tail of the function are
 * missing from this excerpt; visible code preserved byte-for-byte. */
135 static int strip_tag(struct text *t, char *tag)
137 int skip = strlen(tag) + 1;
138 if (skip >= t->len ||
139 strncmp(t->txt, tag, skip-1) != 0 ||
140 t->txt[skip-1] != ':')
142 while (skip < t->len && t->txt[skip] == ' ')
151 Productions are comprised primarily of symbols - terminal and
152 non-terminal. We do not make a syntactic distinction between the two,
153 though non-terminals must be identifiers. Non-terminal symbols are
154 those which appear in the head of a production, terminal symbols are
155 those which don't. There are also "virtual" symbols used for precedence
156 marking discussed later, and sometimes we won't know what type a symbol
159 To help with code safety it is possible to declare the terminal symbols.
160 If this is done, then any symbol used in a production that does not
161 appear in the head of any production and is not declared as a terminal
162 is treated as an error.
164 ###### forward declarations
165 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
166 char *symtypes = "UVTN";
169 ###### grammar fields
170 int terminals_declared;
172 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
173 table of known symbols and the resulting parser will report them as
174 `TK_reserved + N`. A small set of identifiers are reserved for the
175 different token types that `scanner` can report.
179 static struct { int num; char *name; } reserved_words[] = {
180 { TK_error, "ERROR" },
181 { TK_number, "NUMBER" },
182 { TK_ident, "IDENTIFIER" },
184 { TK_string, "STRING" },
185 { TK_multi_string, "MULTI_STRING" },
188 { TK_newline, "NEWLINE" },
194 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
195 recognised. The former is automatically expected at the end of the text
196 being parsed. The latter are always ignored by the parser.
198 All of these symbols are stored in a simple symbol table. We use the
199 `struct text` from `mdcode` to store the name and link them together
200 into a sorted list using an insertion sort.
202 We don't have separate `find` and `insert` functions as any symbol we
203 find needs to be remembered. We simply expect `find` to always return a
204 symbol, but its type might be `Unknown`.
213 ###### grammar fields
218 static struct symbol *sym_find(struct grammar *g, struct text s)
220 struct symbol **l = &g->syms;
225 (cmp = text_cmp((*l)->name, s)) < 0)
229 n = calloc(1, sizeof(*n));
238 static void symbols_init(struct grammar *g)
240 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
242 for (i = 0; i < entries; i++) {
245 t.txt = reserved_words[i].name;
246 t.len = strlen(t.txt);
249 s->num = reserved_words[i].num;
253 ### Data types and precedence.
255 Data type specification, precedence specification, and declaration of
256 terminals are all introduced by a dollar sign at the start of the line.
257 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
258 precedence, if it is `TERM` then the line declares terminals without
259 precedence, otherwise it specifies a data type.
261 The data type name is simply stored and applied to the head of all
262 subsequent productions. It must be the name of a structure optionally
263 preceded by an asterisk which means a reference or "pointer". So
264 `$expression` maps to `struct expression` and `$*statement` maps to
265 `struct statement *`.
267 Any productions given before the first data type declaration will have
268 no data type associated with them and can carry no information. In
269 order to allow other non-terminals to have no type, the data type
270 `$void` can be given. This does *not* mean that `struct void` will be
271 used, but rather that no type will be associated with subsequent
274 The precedence line must contain a list of symbols, either terminal
275 symbols or virtual symbols. It can only contain symbols that have not
276 been seen yet, so precedence declaration must precede the productions
279 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
280 carry precedence information but are not included in the grammar. A
281 production can declare that it inherits the precedence of a given
284 This use for `$$` precludes it from being used as a symbol in the
285 described language. Two other symbols: `${` and `}$` are also
288 Each new precedence line introduces a new precedence level and
289 declares how it associates. This level is stored in each symbol
290 listed and may be inherited by any production which uses the symbol. A
291 production inherits from the last symbol which has a precedence.
293 The symbols on the first precedence line have the lowest precedence.
294 Subsequent lines introduce symbols with higher precedence and so bind
297 Note that a declaration line (or "dollar line") can start with either of
298 two different marks: "$" or "$*". The `dollar_line()` function defined
299 here is told which was found in the `isref` argument.
301 ###### grammar fields
302 struct text current_type;
307 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
308 static const char *known[] = { "$$", "${", "}$" };
311 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
313 struct token t = token_next(ts);
319 if (t.num != TK_ident) {
320 err = "type or assoc expected after '$'";
323 if (text_is(t.txt, "LEFT"))
325 else if (text_is(t.txt, "RIGHT"))
327 else if (text_is(t.txt, "NON"))
329 else if (text_is(t.txt, "TERM")) {
331 g->terminals_declared = 1;
333 g->current_type = t.txt;
334 g->type_isref = isref;
335 if (text_is(t.txt, "void"))
336 g->current_type.txt = NULL;
338 if (t.num != TK_newline) {
339 err = "Extra tokens after type name";
346 err = "$* cannot be followed by a precedence";
350 // This is a precedence or TERM line, need some symbols.
354 while (t.num != TK_newline) {
355 enum symtype type = Terminal;
357 if (t.num == TK_virtual) {
360 if (t.num != TK_ident) {
361 err = "$$ must be followed by a word";
365 err = "Virtual symbols not permitted on $TERM line";
368 } else if (t.num != TK_ident &&
370 err = "Illegal token in precedence line";
373 s = sym_find(g, t.txt);
374 if (s->type != Unknown) {
375 err = "Symbols in precedence/TERM line must not already be known.";
380 s->precedence = g->prec_levels;
387 err = "No symbols given on precedence/TERM line";
391 while (t.num != TK_newline && t.num != TK_eof)
398 A production either starts with an identifier which is the head
399 non-terminal, or a vertical bar (`|`) in which case this production
400 uses the same head as the previous one. The identifier must be
401 followed by a `->` mark. All productions for a given non-terminal must
402 be grouped together with the `nonterminal ->` given only once.
404 After this a (possibly empty) sequence of identifiers and marks form
405 the body of the production. A virtual symbol may be given after the
406 body by preceding it with `$$`. If a virtual symbol is given, the
407 precedence of the production is that for the virtual symbol. If none
408 is given, the precedence is inherited from the last symbol in the
409 production which has a precedence specified.
411 After the optional precedence may come the `${` mark. This indicates
412 the start of a code fragment. If present, this must be on the same
413 line as the start of the production.
415 All of the text from the `${` through to the matching `}$` is
416 collected and forms the code-fragment for the production. It must all
417 be in one `code_node` of the literate code. The `}$` must be
418 at the end of a line.
420 Text in the code fragment will undergo substitutions where `$N` or
421 `$<N`, for some numeric `N` (or non-numeric indicator as described
422 later), will be replaced with a variable holding the parse information
423 for the particular symbol in the production. `$0` is the head of the
424 production, `$1` is the first symbol of the body, etc. The type of `$N`
425 for a terminal symbol is `struct token`. For a non-terminal, it is
426 whatever has been declared for that symbol. The `<` may be included and
427 means that the value (usually a reference) is being moved out, so it
428 will not automatically be freed. The effect of using '<' is that the
429 variable is cleared to all-zeros.
431 While building productions we will need to add to an array which needs to
435 static void array_add(void *varray, int *cnt, void *new)
437 void ***array = varray;
440 current = ((*cnt-1) | (step-1))+1;
441 if (*cnt == current) {
444 *array = realloc(*array, current * sizeof(void*));
446 (*array)[*cnt] = new;
450 Collecting the code fragment simply involves reading tokens until we
451 hit the end token `}$`, and noting the character position of the start and
455 static struct text collect_code(struct token_state *state,
460 code.txt = start.txt.txt + start.txt.len;
462 t = token_next(state);
463 while (t.node == start.node &&
464 t.num != TK_close && t.num != TK_error &&
466 if (t.num == TK_close && t.node == start.node)
467 code.len = t.txt.txt - code.txt;
473 Now we have all the bits we need to parse a full production.
478 ###### grammar fields
479 struct production **productions;
480 int production_count;
482 ###### production fields
484 struct symbol **body;
490 int first_production;
493 static char *parse_production(struct grammar *g,
495 struct token_state *state)
497 /* Head has already been parsed. */
500 struct production p, *pp;
502 memset(&p, 0, sizeof(p));
504 tk = token_next(state);
505 while (tk.num == TK_ident || tk.num == TK_mark) {
506 struct symbol *bs = sym_find(g, tk.txt);
507 if (bs->type == Unknown) {
508 if (!g->terminals_declared)
511 if (bs->type == Virtual) {
512 err = "Virtual symbol not permitted in production";
515 if (bs->precedence) {
516 p.precedence = bs->precedence;
519 array_add(&p.body, &p.body_size, bs);
520 tk = token_next(state);
522 if (tk.num == TK_virtual) {
524 tk = token_next(state);
525 if (tk.num != TK_ident) {
526 err = "word required after $$";
529 vs = sym_find(g, tk.txt);
530 if (vs->num == TK_newline)
532 else if (vs->num == TK_out)
534 else if (vs->precedence == 0) {
535 err = "symbol after $$ must have precedence";
538 p.precedence = vs->precedence;
541 tk = token_next(state);
543 if (tk.num == TK_open) {
544 p.code_line = tk.line;
545 p.code = collect_code(state, tk);
546 if (p.code.txt == NULL) {
547 err = "code fragment not closed properly";
550 tk = token_next(state);
553 if (tk.num != TK_newline && tk.num != TK_eof) {
554 err = "stray tokens at end of line";
557 pp = malloc(sizeof(*pp));
559 array_add(&g->productions, &g->production_count, pp);
562 while (tk.num != TK_newline && tk.num != TK_eof)
563 tk = token_next(state);
568 With the ability to parse productions and dollar-lines, we have nearly all
569 that we need to parse a grammar from a `code_node`.
571 The head of the first production will effectively be the `start` symbol
572 of the grammar. However it won't _actually_ be so. Processing the
573 grammar is greatly simplified if the real start symbol only has a single
574 production, and expects `$eof` as the final terminal. So when we find
575 the first explicit production we insert an extra implicit production as
576 production zero which looks like
578 ###### Example: production 0
581 where `START` is the first non-terminal given.
583 ###### create production zero
584 struct production *p = calloc(1,sizeof(*p));
585 struct text start = {"$start",6};
586 struct text eof = {"$eof",4};
587 struct text code = {"$0 = $<1;", 9};
588 p->head = sym_find(g, start);
589 p->head->type = Nonterminal;
590 p->head->struct_name = g->current_type;
591 p->head->isref = g->type_isref;
592 if (g->current_type.txt)
594 array_add(&p->body, &p->body_size, head);
595 array_add(&p->body, &p->body_size, sym_find(g, eof));
596 p->head->first_production = g->production_count;
597 array_add(&g->productions, &g->production_count, p);
599 Now we are ready to read in the grammar. We ignore comments
600 and strings so that the marks which introduce them can be
601 used as terminals in the grammar. We don't ignore numbers
602 even though we don't allow them as that causes the scanner
603 to produce errors that the parser is better positioned to handle.
604 We also ignore indentation, but we expect and use newlines.
606 To allow comments in the grammar, we explicitly check for "//" as the
607 first token on the line and those lines are skipped. "//" can still be
608 used as a terminal anywhere that a terminal is expected.
611 static struct grammar *grammar_read(struct code_node *code)
613 struct token_config conf = {
616 .words_marks = known,
617 .known_count = sizeof(known)/sizeof(known[0]),
619 .ignored = (1 << TK_line_comment)
620 | (1 << TK_block_comment)
623 | (1 << TK_multi_string)
628 struct token_state *state = token_open(code, &conf);
630 struct symbol *head = NULL;
634 g = calloc(1, sizeof(*g));
637 for (tk = token_next(state); tk.num != TK_eof;
638 tk = token_next(state)) {
639 if (tk.num == TK_newline)
641 if (tk.num == TK_ident) {
643 head = sym_find(g, tk.txt);
644 if (head->type == Nonterminal)
645 err = "This non-terminal has already be used.";
646 else if (head->type == Virtual)
647 err = "Virtual symbol not permitted in head of production";
649 head->type = Nonterminal;
650 head->struct_name = g->current_type;
651 head->isref = g->type_isref;
652 if (g->production_count == 0) {
653 ## create production zero
655 head->first_production = g->production_count;
656 tk = token_next(state);
657 if (tk.num == TK_mark &&
658 text_is(tk.txt, "->"))
659 err = parse_production(g, head, state);
661 err = "'->' missing in production";
663 } else if (tk.num == TK_mark
664 && text_is(tk.txt, "|")) {
665 // another production for same non-term
667 err = parse_production(g, head, state);
669 err = "First production must have a head";
670 } else if (tk.num == TK_mark
671 && text_is(tk.txt, "$")) {
672 err = dollar_line(state, g, 0);
673 } else if (tk.num == TK_mark
674 && text_is(tk.txt, "$*")) {
675 err = dollar_line(state, g, 1);
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "//")) {
678 while (tk.num != TK_newline &&
680 tk = token_next(state);
682 err = "Unrecognised token at start of line.";
688 if (g->terminals_declared) {
691 for (s = g->syms; s; s = s->next) {
692 if (s->type != Unknown)
695 fprintf(stderr, "Token %.*s not declared\n",
696 s->name.len, s->name.txt);
699 free(g); // FIXME free content
705 fprintf(stderr, "Error at line %d: %s\n",
708 free(g); // FIXME free content
712 ## Analysing the grammar
714 The central task in analysing the grammar is to determine a set of
715 states to drive the parsing process. This will require finding various
716 sets of symbols and of "items". Some of these sets will need to attach
717 information to each element in the set, so they are more like maps.
719 Each "item" may have a 'look-ahead' or `LA` set associated with it.
720 Many of these will be the same as each other. To avoid memory wastage,
721 and to simplify some comparisons of sets, these sets will be stored in a
722 list of unique sets, each assigned a number.
724 Once we have the data structures in hand to manage these sets and lists,
725 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
726 then create the item sets which define the various states.
730 Though we don't only store symbols in these sets, they are the main
731 things we store, so they are called symbol sets or "symsets".
733 A symset has a size, an array of shorts, and an optional array of data
734 values, which are also shorts. If the array of data is not present, we
735 store an impossible pointer, as `NULL` is used to indicate that no
736 memory has been allocated yet.
741 unsigned short *syms, *data;
743 #define NO_DATA ((unsigned short *)1)
744 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
745 const struct symset INIT_DATASET = { 0, NULL, NULL };
747 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
748 using an insertion sort. We don't explicitly record the amount of
749 allocated space as it can be derived directly from the current `cnt`
750 using `((cnt - 1) | 7) + 1`.
753 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
756 int current = ((s->cnt-1) | 7) + 1;
757 if (current == s->cnt) {
759 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
760 if (s->data != NO_DATA)
761 s->data = realloc(s->data, sizeof(*s->data) * current);
764 while (i > 0 && s->syms[i-1] > key) {
765 s->syms[i] = s->syms[i-1];
766 if (s->data != NO_DATA)
767 s->data[i] = s->data[i-1];
771 if (s->data != NO_DATA)
776 Finding a symbol (or item) in a `symset` uses a simple binary search.
777 We return the index where the value was found (so data can be accessed),
778 or `-1` to indicate failure.
780 static int symset_find(struct symset *ss, unsigned short key)
787 while (lo + 1 < hi) {
788 int mid = (lo + hi) / 2;
789 if (ss->syms[mid] <= key)
794 if (ss->syms[lo] == key)
799 We will often want to form the union of two symsets. When we do, we
800 will often want to know if anything changed (as that might mean there is
801 more work to do). So `symset_union` reports whether anything was added
802 to the first set. We use a slow quadratic approach as these sets are
803 rarely large. If profiling shows this to be a problem it can be
806 static int symset_union(struct symset *a, struct symset *b)
810 for (i = 0; i < b->cnt; i++)
811 if (symset_find(a, b->syms[i]) < 0) {
812 unsigned short data = 0;
813 if (b->data != NO_DATA)
815 symset_add(a, b->syms[i], data);
821 And of course we must be able to free a symset.
823 static void symset_free(struct symset ss)
826 if (ss.data != NO_DATA)
832 Some symsets are simply stored somewhere appropriate in the `grammar`
833 data structure, others needs a bit of indirection. This applies
834 particularly to "LA" sets. These will be paired with "items" in an
835 "itemset". As itemsets will be stored in a symset, the "LA" set needs
836 to be stored in the `data` field, so we need a mapping from a small
837 (short) number to an LA `symset`.
839 As mentioned earlier this involves creating a list of unique symsets.
841 For now, we just use a linear list sorted by insertion. A skiplist
842 would be more efficient and may be added later.
849 struct setlist *next;
852 ###### grammar fields
853 struct setlist *sets;
858 static int ss_cmp(struct symset a, struct symset b)
861 int diff = a.cnt - b.cnt;
864 for (i = 0; i < a.cnt; i++) {
865 diff = (int)a.syms[i] - (int)b.syms[i];
872 static int save_set(struct grammar *g, struct symset ss)
874 struct setlist **sl = &g->sets;
878 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
885 s = malloc(sizeof(*s));
894 Finding a set by number is currently performed by a simple linear
895 search. If this turns out to hurt performance, we can store the sets in
896 a dynamic array like the productions.
898 static struct symset set_find(struct grammar *g, int num)
900 struct setlist *sl = g->sets;
901 while (sl && sl->num != num)
906 ### Setting `nullable`
908 We set `nullable` on the head symbol for any production for which all
909 the body symbols (if any) are nullable. As this is a recursive
910 definition, any change in the `nullable` setting means that we need to
911 re-evaluate where it needs to be set.
913 We simply loop around performing the same calculations until no more
920 static void set_nullable(struct grammar *g)
923 while (check_again) {
926 for (p = 0; p < g->production_count; p++) {
927 struct production *pr = g->productions[p];
930 if (pr->head->nullable)
932 for (s = 0; s < pr->body_size; s++)
933 if (! pr->body[s]->nullable)
935 if (s == pr->body_size) {
936 pr->head->nullable = 1;
943 ### Setting `line_like`
945 In order to be able to ignore newline tokens when not relevant, but
946 still include them in the parse when needed, we will need to know
947 which states can start a "line-like" section of code. We ignore
948 newlines when there is an indent since the most recent start of a
951 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
952 If a symbol cannot derive a NEWLINE, then it is only part of a line -
953 so is word-like. If it can derive a NEWLINE, then we consider it to
956 Clearly the `TK_newline` token can derive a NEWLINE. Any symbol which
957 is the head of a production that contains a line_like symbol is also a
958 line-like symbol. We use a new field `line_like` to record this
959 attribute of symbols, and compute it in a repetitive manner similar to
966 static void set_line_like(struct grammar *g)
969 g->symtab[TK_newline]->line_like = 1;
970 while (check_again) {
973 for (p = 0; p < g->production_count; p++) {
974 struct production *pr = g->productions[p];
977 if (pr->head->line_like)
980 for (s = 0 ; s < pr->body_size; s++) {
981 if (pr->body[s]->line_like) {
982 pr->head->line_like = 1;
991 ### Building the `first` sets
993 When calculating what can follow a particular non-terminal, we will need
994 to know what the "first" terminal in any subsequent non-terminal might
995 be. So we calculate the `first` set for every non-terminal and store
996 them in an array. We don't bother recording the "first" set for
997 terminals as they are trivial.
999 As the "first" for one symbol might depend on the "first" of another, we
1000 repeat the calculations until no changes happen, just like with
1001 `set_nullable`. This makes use of the fact that `symset_union` reports
1002 if any change happens.
1004 The core of this, which finds the "first" of part of a production body,
1005 will be reused for computing the "follow" sets, so we split that out
1006 into a separate function, `add_first`, which also reports if it got all
1007 the way to the end of the production without finding a non-nullable
1010 ###### grammar fields
1011 struct symset *first;
1015 static int add_first(struct production *pr, int start,
1016 struct symset *target, struct grammar *g,
1021 for (s = start; s < pr->body_size; s++) {
1022 struct symbol *bs = pr->body[s];
1023 if (bs->type == Terminal) {
1024 if (symset_find(target, bs->num) < 0) {
1025 symset_add(target, bs->num, 0);
1029 } else if (symset_union(target, &g->first[bs->num]))
1035 *to_end = (s == pr->body_size);
1039 static void build_first(struct grammar *g)
1041 int check_again = 1;
1043 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1044 for (s = 0; s < g->num_syms; s++)
1045 g->first[s] = INIT_SYMSET;
1047 while (check_again) {
1050 for (p = 0; p < g->production_count; p++) {
1051 struct production *pr = g->productions[p];
1052 struct symset *head = &g->first[pr->head->num];
1054 if (add_first(pr, 0, head, g, NULL))
1060 ### Building the `follow` sets.
1062 There are two different situations when we will want to generate
1063 "follow" sets. If we are doing an SLR analysis, we want to generate a
1064 single "follow" set for each non-terminal in the grammar. That is what
1065 is happening here. If we are doing an LALR or LR analysis we will want
1066 to generate a separate "LA" set for each item. We do that later in
1069 There are two parts to generating a "follow" set. Firstly we look at
1070 every place that any non-terminal appears in the body of any production,
1071 and we find the set of possible "first" symbols after there. This is
1072 done using `add_first` above and only needs to be done once as the
1073 "first" sets are now stable and will not change.
1075 ###### grammar fields
1076 struct symset *follow;
1080 for (p = 0; p < g->production_count; p++) {
1081 struct production *pr = g->productions[p];
1084 for (b = 0; b < pr->body_size - 1; b++) {
1085 struct symbol *bs = pr->body[b];
1086 if (bs->type == Terminal)
1088 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1092 The second part is to add the "follow" set of the head of a production
1093 to the "follow" sets of the final symbol in the production, and any
1094 other symbol which is followed only by `nullable` symbols. As this
1095 depends on "follow" itself we need to repeatedly perform the process
1096 until no further changes happen.
1098 Rather than 2 nested loops, one that repeats the whole process until
1099 there is no change, and one that iterates over the set of productions, we
1100 combine these two functions into a single loop.
1104 for (check_again = 0, p = 0;
1105 p < g->production_count;
1106 p < g->production_count-1
1107 ? p++ : check_again ? (check_again = 0, p = 0)
1109 struct production *pr = g->productions[p];
1112 for (b = pr->body_size - 1; b >= 0; b--) {
1113 struct symbol *bs = pr->body[b];
1114 if (bs->type == Terminal)
1116 if (symset_union(&g->follow[bs->num],
1117 &g->follow[pr->head->num]))
1124 We now just need to create and initialise the `follow` list to get a
1128 static void build_follow(struct grammar *g)
1130 int s, check_again, p;
1131 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1132 for (s = 0; s < g->num_syms; s++)
1133 g->follow[s] = INIT_SYMSET;
1137 ### Building itemsets and states
1139 There are three different levels of detail that can be chosen for
1140 building the itemsets and states for the LR grammar. They are:
1142 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1143 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1145 2. LALR(1) where we build look-ahead sets with each item and merge
1146 the LA sets when we find two paths to the same "kernel" set of items.
1147 3. LR(1) where different look-ahead for any item in the set means
1148 a different item set must be created.
1150 ###### forward declarations
1151 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1153 We need to be able to look through existing states to see if a newly
1154 generated state already exists. For now we use a simple sorted linked
1157 An item is a pair of numbers: the production number and the position of
1158 "DOT", which is an index into the body of the production. As the
1159 numbers are not enormous we can combine them into a single "short" and
1160 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1161 production number (so 2000 productions with maximum of 31 symbols in the
1164 Comparing the itemsets will be a little different to comparing symsets
1165 as we want to do the lookup after generating the "kernel" of an
1166 itemset, so we need to ignore the offset=zero items which are added during
1169 To facilitate this, we modify the "DOT" number so that "0" sorts to
1170 the end of the list in the symset, and then only compare items before
/* Pack an item into one short: low 11 bits are the production number,
 * high 5 bits hold DOT stored as (31-dot) so that dot==0 items sort to
 * the end of a symset (see the discussion above about kernel compares). */
1174 static inline unsigned short item_num(int production, int dot)
1176 return production | ((31-dot) << 11);
/* Extract the production number (the low 11 bits) from a packed item. */
1178 static inline int item_prod(unsigned short item)
1180 return item & 0x7ff;
/* Extract DOT from a packed item, undoing the (31-dot) encoding used by
 * item_num; the & 0x1f keeps the result in the 5-bit DOT range. */
1182 static inline int item_dot(unsigned short item)
1184 return (31-(item >> 11)) & 0x1f;
1187 For LR(1) analysis we need to compare not just the itemset in a state
1188 but also the LA sets. As we assign each unique LA set a number, we
1189 can just compare the symset and the data values together.
1192 static int itemset_cmp(struct symset a, struct symset b,
1193 enum grammar_type type)
1199 i < a.cnt && i < b.cnt &&
1200 item_dot(a.syms[i]) > 0 &&
1201 item_dot(b.syms[i]) > 0;
1203 int diff = a.syms[i] - b.syms[i];
1207 diff = a.data[i] - b.data[i];
1212 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1216 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1222 if (type < LR1 || av == -1)
1225 a.data[i] - b.data[i];
1228 It will be helpful to know if an itemset has been "completed" or not,
1229 particularly for LALR where itemsets get merged, at which point they
1230 need to be considered for completion again. So a `completed` flag is
1233 For correct handling of `TK_newline` when parsing, we will need to
1234 know which states (itemsets) can occur at the start of a line, so we
1235 will record a `starts_line` flag too whenever DOT is at the start of a
1238 Finally, for handling `TK_out` we need to know whether productions in the
1239 current state started *before* the most recent indent. A state
1240 doesn't usually keep details of individual productions, so we need to
1241 add one extra detail. `min_prefix` is the smallest non-zero number of
1242 symbols *before* DOT in any production in an itemset. This will allow
1243 us to determine if the most recent indent is sufficiently recent
1244 to cancel it against a `TK_out`. If it was seen longer ago than the
1245 `min_prefix`, and if the current state cannot be reduced, then the
1246 indented section must have ended in the middle of a syntactic unit, so
1247 an error must be signaled.
1249 And now we can build the list of itemsets. The lookup routine returns
1250 both a success flag and a pointer to where in the list an insert should
1251 happen, so we don't need to search a second time.
1255 struct itemset *next;
1257 struct symset items;
1258 struct symset go_to;
1260 unsigned short precedence;
1266 ###### grammar fields
1267 struct itemset *items;
1271 static int itemset_find(struct grammar *g, struct itemset ***where,
1272 struct symset kernel, enum grammar_type type)
1274 struct itemset **ip;
1276 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1277 struct itemset *i = *ip;
1279 diff = itemset_cmp(i->items, kernel, type);
1292 Adding an itemset may require merging the LA sets if LALR analysis is
1293 happening. If any new LA set adds any symbols that weren't in the old
1294 LA set, we clear the `completed` flag so that the dependants of this
1295 itemset will be recalculated and their LA sets updated.
1297 `add_itemset` must consume the symsets it is passed, either by adding
1298 them to a data structure, or freeing them.
1300 static int add_itemset(struct grammar *g, struct symset ss,
1301 enum assoc assoc, unsigned short precedence,
1302 enum grammar_type type)
1304 struct itemset **where, *is;
1306 int found = itemset_find(g, &where, ss, type);
1308 is = calloc(1, sizeof(*is));
1309 is->state = g->states;
1313 is->precedence = precedence;
1315 is->go_to = INIT_DATASET;
1324 for (i = 0; i < ss.cnt; i++) {
1325 struct symset temp = INIT_SYMSET, s;
1326 if (ss.data[i] == is->items.data[i])
1328 s = set_find(g, is->items.data[i]);
1329 symset_union(&temp, &s);
1330 s = set_find(g, ss.data[i]);
1331 if (symset_union(&temp, &s)) {
1332 is->items.data[i] = save_set(g, temp);
1343 To build all the itemsets, we first insert the initial itemset made from
1344 production zero, complete each itemset, and then generate new itemsets
1345 from old until no new ones can be made.
1347 Completing an itemset means finding all the items where "DOT" is
1348 followed by a nonterminal and adding "DOT=0" items for every production
1349 from that non-terminal - providing each item hasn't already been added.
1350 When we add such an item it might get inserted before the current
1351 one, so to make sure we process it we reset the loop counter to the
1354 If LA sets are needed, the LA set for each new item is found using
1355 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1356 may be supplemented by the LA set for the item which produced the new
1359 We also collect a set of all symbols which follow "DOT" (in `done`) as
1360 this is used in the next stage. If any of these symbols are flagged as
1361 `line_like`, then this state must be a `starts_line` state so now is a
1362 good time to record that.
1364 When itemsets are created we assign a precedence to the itemset from the
1365 complete item, if there is one. We ignore the possibility of there
1366 being two and don't (currently) handle precedence in such grammars.
1367 When completing a grammar we ignore any item where DOT is followed by a
1368 terminal with a precedence lower than that for the itemset. Unless the
1369 terminal has right associativity, we also ignore items where the
1370 terminal has the same precedence. The result is that unwanted items are
1371 still in the itemset, but the terminal doesn't get into the go to set,
1372 so the item is ineffective.
1374 ###### complete itemset
1375 for (i = 0; i < is->items.cnt; i++) {
1376 int p = item_prod(is->items.syms[i]);
1377 int bs = item_dot(is->items.syms[i]);
1378 struct production *pr = g->productions[p];
1381 struct symset LA = INIT_SYMSET;
1382 unsigned short sn = 0;
1383 struct symset LAnl = INIT_SYMSET;
1384 unsigned short snnl = 0;
1386 if (is->min_prefix == 0 ||
1387 (bs > 0 && bs < is->min_prefix))
1388 is->min_prefix = bs;
1389 if (bs == pr->body_size)
1392 if (s->precedence && is->precedence &&
1393 is->precedence > s->precedence)
1394 /* This terminal has a low precedence and
1395 * shouldn't be shifted
1398 if (s->precedence && is->precedence &&
1399 is->precedence == s->precedence && s->assoc != Right)
1400 /* This terminal has a matching precedence and is
1401 * not Right-associative, so we mustn't shift it.
1404 if (symset_find(&done, s->num) < 0) {
1405 symset_add(&done, s->num, 0);
1407 if (s->type != Nonterminal)
1410 is->starts_line = 1;
1415 add_first(pr, bs+1, &LA, g, &to_end);
1417 struct symset ss = set_find(g, is->items.data[i]);
1418 symset_union(&LA, &ss);
1420 sn = save_set(g, LA);
1421 LA = set_find(g, sn);
1422 if (symset_find(&LA, TK_newline))
1423 symset_add(&LAnl, TK_newline, 0);
1424 snnl = save_set(g, LAnl);
1425 LAnl = set_find(g, snnl);
1428 /* Add productions for this symbol */
1429 for (p2 = s->first_production;
1430 p2 < g->production_count &&
1431 g->productions[p2]->head == s;
1433 int itm = item_num(p2, 0);
1434 int pos = symset_find(&is->items, itm);
1436 if (g->productions[p2]->line_like)
1437 symset_add(&is->items, itm, snnl);
1439 symset_add(&is->items, itm, sn);
1440 /* Will have re-ordered, so start
1441 * from beginning again */
1443 } else if (type >= LALR) {
1444 struct symset ss = set_find(g, is->items.data[pos]);
1445 struct symset tmp = INIT_SYMSET;
1446 struct symset *la = &LA;
1448 if (g->productions[p2]->line_like)
1450 symset_union(&tmp, &ss);
1451 if (symset_union(&tmp, la)) {
1452 is->items.data[pos] = save_set(g, tmp);
1460 For each symbol we found (and placed in `done`) we collect all the items
1461 for which this symbol is next, and create a new itemset with "DOT"
1462 advanced over the symbol. This is then added to the collection of
1463 itemsets (or merged with a pre-existing itemset).
1465 ###### derive itemsets
1466 // Now we have a completed itemset, so we need to
1467 // compute all the 'next' itemsets and create them
1468 // if they don't exist.
1469 for (i = 0; i < done.cnt; i++) {
1471 unsigned short state;
1472 struct symbol *sym = g->symtab[done.syms[i]];
1473 enum assoc assoc = Non;
1474 unsigned short precedence = 0;
1475 struct symset newitemset = INIT_SYMSET;
1477 newitemset = INIT_DATASET;
1479 for (j = 0; j < is->items.cnt; j++) {
1480 int itm = is->items.syms[j];
1481 int p = item_prod(itm);
1482 int bp = item_dot(itm);
1483 struct production *pr = g->productions[p];
1484 unsigned short la = 0;
1487 if (bp == pr->body_size)
1489 if (pr->body[bp] != sym)
1494 la = is->items.data[j];
1495 if (bp == pr->body_size &&
1496 pr->precedence > 0 &&
1497 pr->precedence > precedence) {
1498 // new itemset is reducible and has a precedence.
1499 precedence = pr->precedence;
1502 pos = symset_find(&newitemset, item_num(p, bp));
1505 symset_add(&newitemset, item_num(p, bp), la);
1506 else if (type >= LALR) {
1507 // Need to merge la set.
1508 int la2 = newitemset.data[pos];
1510 struct symset ss = set_find(g, la2);
1511 struct symset LA = INIT_SYMSET;
1512 symset_union(&LA, &ss);
1513 ss = set_find(g, la);
1514 if (symset_union(&LA, &ss))
1515 newitemset.data[pos] = save_set(g, LA);
1521 state = add_itemset(g, newitemset, assoc, precedence, type);
1522 if (symset_find(&is->go_to, done.syms[i]) < 0)
1523 symset_add(&is->go_to, done.syms[i], state);
1526 All that is left is to create the initial itemset from production zero, and
1527 with `TK_eof` as the LA set.
1530 static void build_itemsets(struct grammar *g, enum grammar_type type)
1532 struct symset first = INIT_SYMSET;
1535 unsigned short la = 0;
1537 // LA set just has eof
1538 struct symset eof = INIT_SYMSET;
1539 symset_add(&eof, TK_eof, 0);
1540 la = save_set(g, eof);
1541 first = INIT_DATASET;
1543 // production 0, offset 0 (with no data)
1544 symset_add(&first, item_num(0, 0), la);
1545 add_itemset(g, first, Non, 0, type);
1546 for (check_again = 0, is = g->items;
1548 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1550 struct symset done = INIT_SYMSET;
1561 ### Completing the analysis.
1563 The exact process of analysis depends on which level was requested. For
1564 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1565 `LR1` we need first, but not follow. For `SLR` we need both.
1567 We don't build the "action" tables that you might expect as the parser
1568 doesn't use them. They will be calculated to some extent if needed for
1571 Once we have built everything we allocate arrays for the two lists:
1572 symbols and itemsets. This allows more efficient access during
1573 reporting. The symbols are grouped as terminals and then non-terminals,
1574 and we record the changeover point in `first_nonterm`.
1576 ###### grammar fields
1577 struct symbol **symtab;
1578 struct itemset **statetab;
1581 ###### grammar_analyse
1583 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1587 int snum = TK_reserved;
1588 for (s = g->syms; s; s = s->next)
1589 if (s->num < 0 && s->type == Terminal) {
1593 g->first_nonterm = snum;
1594 for (s = g->syms; s; s = s->next)
1595 if (s->num < 0 && s->type != Virtual) {
1599 for (s = g->syms; s; s = s->next)
1605 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1606 for (s = g->syms; s; s = s->next)
1607 g->symtab[s->num] = s;
1617 build_itemsets(g, type);
1619 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1620 for (is = g->items; is ; is = is->next)
1621 g->statetab[is->state] = is;
1624 ## Reporting on the Grammar
1626 The purpose of the report is to give the grammar developer insight into
1627 how the grammar parser will work. It is basically a structured dump of
1628 all the tables that have been generated, plus a description of any conflicts.
1630 ###### grammar_report
1631 static int grammar_report(struct grammar *g, enum grammar_type type)
1637 return report_conflicts(g, type);
1640 Firstly we have the complete list of symbols, together with the
1641 "FIRST" set if that was generated. We add a mark to each symbol to
1642 show if it is considered to be "line-like" (`<`), or if it is nullable (`.`).
1646 static void report_symbols(struct grammar *g)
1650 printf("SYMBOLS + FIRST:\n");
1652 printf("SYMBOLS:\n");
1654 for (n = 0; n < g->num_syms; n++) {
1655 struct symbol *s = g->symtab[n];
1659 printf(" %c%c%3d%c: ",
1660 s->nullable ? '.':' ',
1661 s->line_like ? '<':' ',
1662 s->num, symtypes[s->type]);
1665 printf(" (%d%s)", s->precedence,
1666 assoc_names[s->assoc]);
1668 if (g->first && s->type == Nonterminal) {
1671 for (j = 0; j < g->first[n].cnt; j++) {
1674 prtxt(g->symtab[g->first[n].syms[j]]->name);
1681 Then we have the follow sets if they were computed.
1683 static void report_follow(struct grammar *g)
1686 printf("FOLLOW:\n");
1687 for (n = 0; n < g->num_syms; n++)
1688 if (g->follow[n].cnt) {
1692 prtxt(g->symtab[n]->name);
1693 for (j = 0; j < g->follow[n].cnt; j++) {
1696 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1702 And finally the item sets. These include the GO TO tables and, for
1703 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1704 it up a bit. First the items, with production number and associativity.
1706 static void report_item(struct grammar *g, int itm)
1708 int p = item_prod(itm);
1709 int dot = item_dot(itm);
1710 struct production *pr = g->productions[p];
1714 prtxt(pr->head->name);
1716 for (i = 0; i < pr->body_size; i++) {
1717 printf(" %s", (dot == i ? ". ": ""));
1718 prtxt(pr->body[i]->name);
1720 if (dot == pr->body_size)
1723 if (pr->precedence && dot == pr->body_size)
1724 printf(" (%d%s)", pr->precedence,
1725 assoc_names[pr->assoc]);
1726 if (dot < pr->body_size &&
1727 pr->body[dot]->precedence) {
1728 struct symbol *s = pr->body[dot];
1729 printf(" [%d%s]", s->precedence,
1730 assoc_names[s->assoc]);
1732 if (pr->line_like == 1)
1733 printf(" $$NEWLINE");
1734 else if (pr->line_like)
1739 The LA sets which are (possibly) reported with each item:
1741 static void report_la(struct grammar *g, int lanum)
1743 struct symset la = set_find(g, lanum);
1747 printf(" LOOK AHEAD(%d)", lanum);
1748 for (i = 0; i < la.cnt; i++) {
1751 prtxt(g->symtab[la.syms[i]]->name);
1756 Then the go to sets:
1758 static void report_goto(struct grammar *g, struct symset gt)
1763 for (i = 0; i < gt.cnt; i++) {
1765 prtxt(g->symtab[gt.syms[i]]->name);
1766 printf(" -> %d\n", gt.data[i]);
1770 Now we can report all the item sets complete with items, LA sets, and GO TO.
1772 static void report_itemsets(struct grammar *g)
1775 printf("ITEM SETS(%d)\n", g->states);
1776 for (s = 0; s < g->states; s++) {
1778 struct itemset *is = g->statetab[s];
1779 printf(" Itemset %d:%s min prefix=%d",
1780 s, is->starts_line?" (startsline)":"", is->min_prefix);
1782 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1784 for (j = 0; j < is->items.cnt; j++) {
1785 report_item(g, is->items.syms[j]);
1786 if (is->items.data != NO_DATA)
1787 report_la(g, is->items.data[j]);
1789 report_goto(g, is->go_to);
1793 ### Reporting conflicts
1795 Conflict detection varies a lot among different analysis levels. However
1796 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1797 LR1 are also similar as they have FOLLOW or LA sets.
1801 ## conflict functions
1803 static int report_conflicts(struct grammar *g, enum grammar_type type)
1806 printf("Conflicts:\n");
1808 cnt = conflicts_lr0(g, type);
1810 cnt = conflicts_slr(g, type);
1812 printf(" - no conflicts\n");
1816 LR0 conflicts are any state which has both a reducible item and
1817 a shiftable item, or two reducible items.
1819 LR05 conflicts only occur if two possible reductions exist,
1820 as shifts always over-ride reductions.
1822 ###### conflict functions
1823 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1827 for (i = 0; i < g->states; i++) {
1828 struct itemset *is = g->statetab[i];
1829 int last_reduce = -1;
1830 int prev_reduce = -1;
1831 int last_shift = -1;
1835 for (j = 0; j < is->items.cnt; j++) {
1836 int itm = is->items.syms[j];
1837 int p = item_prod(itm);
1838 int bp = item_dot(itm);
1839 struct production *pr = g->productions[p];
1841 if (bp == pr->body_size) {
1842 prev_reduce = last_reduce;
1846 if (pr->body[bp]->type == Terminal)
1849 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1850 printf(" State %d has both SHIFT and REDUCE:\n", i);
1851 report_item(g, is->items.syms[last_shift]);
1852 report_item(g, is->items.syms[last_reduce]);
1855 if (prev_reduce >= 0) {
1856 printf(" State %d has 2 (or more) reducible items\n", i);
1857 report_item(g, is->items.syms[prev_reduce]);
1858 report_item(g, is->items.syms[last_reduce]);
1865 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1866 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1867 in the source of the look ahead set.
1869 We build two datasets to reflect the "action" table: one which maps
1870 terminals to items where that terminal could be shifted and another
1871 which maps terminals to items that could be reduced when the terminal
1872 is in look-ahead. We report when we get conflicts between the two.
1874 As a special case, if we find a SHIFT/REDUCE conflict on the NEWLINE
1875 terminal, we ignore it. NEWLINE is handled specially with its own
1876 rules for when to shift and when to reduce. Conflicts are expected,
1877 but handled internally.
1879 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1884 for (i = 0; i < g->states; i++) {
1885 struct itemset *is = g->statetab[i];
1886 struct symset shifts = INIT_DATASET;
1887 struct symset reduce = INIT_DATASET;
1891 /* First collect the shifts */
1892 for (j = 0; j < is->items.cnt; j++) {
1893 unsigned short itm = is->items.syms[j];
1894 int p = item_prod(itm);
1895 int bp = item_dot(itm);
1896 struct production *pr = g->productions[p];
1899 if (bp >= pr->body_size ||
1900 pr->body[bp]->type != Terminal)
1905 if (s->precedence && is->precedence)
1906 /* Precedence resolves this, so no conflict */
1909 if (symset_find(&shifts, s->num) < 0)
1910 symset_add(&shifts, s->num, itm);
1912 /* Now look for reductions and conflicts */
1913 for (j = 0; j < is->items.cnt; j++) {
1914 unsigned short itm = is->items.syms[j];
1915 int p = item_prod(itm);
1916 int bp = item_dot(itm);
1917 struct production *pr = g->productions[p];
1919 if (bp < pr->body_size)
1924 la = g->follow[pr->head->num];
1926 la = set_find(g, is->items.data[j]);
1928 for (k = 0; k < la.cnt; k++) {
1929 int pos = symset_find(&shifts, la.syms[k]);
1930 if (pos >= 0 && la.syms[k] != TK_newline) {
1931 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1933 prtxt(g->symtab[la.syms[k]]->name);
1935 report_item(g, shifts.data[pos]);
1936 report_item(g, itm);
1938 pos = symset_find(&reduce, la.syms[k]);
1940 symset_add(&reduce, la.syms[k], itm);
1943 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1944 prtxt(g->symtab[la.syms[k]]->name);
1946 report_item(g, itm);
1947 report_item(g, reduce.data[pos]);
1951 symset_free(shifts);
1952 symset_free(reduce);
1958 ## Generating the parser
1960 The exported part of the parser is the `parse_XX` function, where the name
1961 `XX` is based on the name of the parser files.
1963 This takes a `code_node`, a partially initialized `token_config`, and an
1964 optional `FILE` to send tracing to. The `token_config` gets the list of
1965 known words added and then is used with the `code_node` to initialize the
1968 `parse_XX` then calls the library function `parser_run` to actually complete
1969 the parse. This needs the `states` table and functions to call the various
1970 pieces of code provided in the grammar file, so they are generated first.
1972 ###### parser_generate
1974 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1975 struct code_node *pre_reduce)
1981 gen_reduce(f, g, file, pre_reduce);
1984 fprintf(f, "#line 0 \"gen_parser\"\n");
1985 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1988 fprintf(f, "\tstruct token_state *tokens;\n");
1989 fprintf(f, "\tconfig->words_marks = known;\n");
1990 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1991 fprintf(f, "\ttokens = token_open(code, config);\n");
1992 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1993 fprintf(f, "\ttoken_close(tokens);\n");
1994 fprintf(f, "\treturn rv;\n");
1995 fprintf(f, "}\n\n");
1998 ### Known words table
2000 The known words table is simply an array of terminal symbols.
2001 The table of nonterminals used for tracing is a similar array.
2005 static void gen_known(FILE *f, struct grammar *g)
2008 fprintf(f, "#line 0 \"gen_known\"\n");
2009 fprintf(f, "static const char *known[] = {\n");
2010 for (i = TK_reserved;
2011 i < g->num_syms && g->symtab[i]->type == Terminal;
2013 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2014 g->symtab[i]->name.txt);
2015 fprintf(f, "};\n\n");
2018 static void gen_non_term(FILE *f, struct grammar *g)
2021 fprintf(f, "#line 0 \"gen_non_term\"\n");
2022 fprintf(f, "static const char *non_term[] = {\n");
2023 for (i = TK_reserved;
2026 if (g->symtab[i]->type == Nonterminal)
2027 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2028 g->symtab[i]->name.txt);
2029 fprintf(f, "};\n\n");
2032 ### States and the goto tables.
2034 For each state we record the goto table and details of the reducible
2035 production if there is one.
2036 Some of the details of the reducible production are stored in the
2037 `do_reduce` function to come later. Here we store the production
2038 number, the body size (useful for stack management), and the resulting
2039 symbol (useful for knowing how to free data later).
2041 The go to table is stored in a simple array of `sym` and corresponding
2044 ###### exported types
2052 const struct lookup * go_to;
2063 static void gen_goto(FILE *f, struct grammar *g)
2066 fprintf(f, "#line 0 \"gen_goto\"\n");
2067 for (i = 0; i < g->states; i++) {
2069 fprintf(f, "static const struct lookup goto_%d[] = {\n",
2071 struct symset gt = g->statetab[i]->go_to;
2072 for (j = 0; j < gt.cnt; j++)
2073 fprintf(f, "\t{ %d, %d },\n",
2074 gt.syms[j], gt.data[j]);
2079 static void gen_states(FILE *f, struct grammar *g)
2082 fprintf(f, "#line 0 \"gen_states\"\n");
2083 fprintf(f, "static const struct state states[] = {\n");
2084 for (i = 0; i < g->states; i++) {
2085 struct itemset *is = g->statetab[i];
2086 int j, prod = -1, prod_len;
2088 for (j = 0; j < is->items.cnt; j++) {
2089 int itm = is->items.syms[j];
2090 int p = item_prod(itm);
2091 int bp = item_dot(itm);
2092 struct production *pr = g->productions[p];
2094 if (bp < pr->body_size)
2096 /* This is what we reduce - choose longest */
2097 if (prod < 0 || prod_len < pr->body_size) {
2099 prod_len = pr->body_size;
2104 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2105 i, is->go_to.cnt, i, prod,
2106 g->productions[prod]->body_size,
2107 g->productions[prod]->head->num,
2109 g->productions[prod]->line_like,
2112 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2113 i, is->go_to.cnt, i,
2114 is->starts_line, is->min_prefix);
2116 fprintf(f, "};\n\n");
2119 ### The `do_reduce` function and the code
2121 When the parser engine decides to reduce a production, it calls
2122 `do_reduce` which runs the code that was included with the production,
2125 This code needs to be able to store data somewhere. Rather than
2126 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2127 buffer and have `do_reduce` return the size to be saved.
2129 In order for the code to access "global" context, we pass in the
2130 "config" pointer that was passed to the parser function. If the `struct
2131 token_config` is embedded in some larger structure, the reducing code
2132 can access the larger structure using pointer manipulation. Performing
2133 that pointer manipulation, and any other common code or variables for
2134 all reduce actions, can be provided in code node called "reduce" which
2135 is passed around in `pre_reduce` which you might have already noticed.
2137 The code fragments require translation when written out. Any `$N` needs
2138 to be converted to a reference either to that buffer (if `$0`) or to the
2139 structure returned by a previous reduction. These pointers need to be
2140 cast to the appropriate type for each access. All this is handled in
2143 `gen_code` also allows symbol references to contain a '`<`' as in
2144 '`$<2`'. This is particularly useful for references (or pointers), but
2145 can be used with structures too. The `<` implies that the value is
2146 being moved out, so the object will not be automatically freed. It is
2147 equivalent to assigning `NULL` to the pointer or filling a structure
2150 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2151 and possibly a number. A number by itself (other than zero) selects a
2152 symbol from the body of the production. A sequence of letters selects
2153 the shortest symbol in the body which contains those letters in the
2154 given order. If a number follows the letters, then a later occurrence
2155 of that symbol is chosen. So "`$AB2`" will refer to the structure
2156 attached to the second occurrence of the shortest symbol which contains
2157 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2158 the number given is too large, then the symbol reference is not
2159 transformed, and will cause an error when the code is compiled.
2163 static int textchr(struct text t, char c, int s)
2166 for (i = s; i < t.len; i++)
2172 static int subseq_match(char *seq, int slen, struct text name)
2176 st = textchr(name, *seq, st);
2186 static int choose_sym(char **namep, int len, struct production *p)
2188 char *name = *namep;
2197 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2203 while (len > 0 && (c >= '0' && c <= '9')) {
2206 n = n * 10 + (c - '0');
2210 if (name == *namep || n > p->body_size)
2216 for (i = 0; i < p->body_size; i++) {
2217 if (!subseq_match(nam, namlen, p->body[i]->name))
2219 if (slen == 0 || p->body[i]->name.len < slen) {
2221 slen = p->body[i]->name.len;
2223 if (s >= 0 && p->body[i] != p->body[s] &&
2224 p->body[i]->name.len == p->body[s]->name.len)
2225 /* not unique, so s cannot be used */
2232 for (i = 0; i < p->body_size; i++)
2233 if (p->body[i] == p->body[s]) {
2238 if (n > 0 || i == p->body_size)
2244 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2247 char *used = calloc(1, p->body_size);
2250 fprintf(f, "\t\t\t");
2251 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2265 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2274 fprintf(f, "(*(struct %.*s*%s)ret)",
2275 p->head->struct_name.len,
2276 p->head->struct_name.txt,
2277 p->head->isref ? "*":"");
2278 else if (p->body[n-1]->type == Terminal)
2279 fprintf(f, "(*(struct token *)body[%d])",
2281 else if (p->body[n-1]->struct_name.txt == NULL)
2282 fprintf(f, "$%d", n);
2284 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2285 p->body[n-1]->struct_name.len,
2286 p->body[n-1]->struct_name.txt,
2287 p->body[n-1]->isref ? "*":"", n-1);
2293 for (i = 0; i < p->body_size; i++) {
2294 if (p->body[i]->struct_name.txt &&
2296 // assume this has been copied out
2297 if (p->body[i]->isref)
2298 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2300 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2308 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2309 struct code_node *pre_reduce)
2312 fprintf(f, "#line 1 \"gen_reduce\"\n");
2313 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2315 fprintf(f, "\tint ret_size = 0;\n");
2317 code_node_print(f, pre_reduce, file);
2319 fprintf(f, "#line 4 \"gen_reduce\"\n");
2320 fprintf(f, "\tswitch(prod) {\n");
2321 for (i = 0; i < g->production_count; i++) {
2322 struct production *p = g->productions[i];
2323 fprintf(f, "\tcase %d:\n", i);
2326 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2330 if (p->head->struct_name.txt)
2331 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2332 p->head->struct_name.len,
2333 p->head->struct_name.txt,
2334 p->head->isref ? "*":"");
2336 fprintf(f, "\t\tbreak;\n");
2338 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2343 As each non-terminal can potentially cause a different type of data
2344 structure to be allocated and filled in, we need to be able to free it when
2347 It is particularly important to have fine control over freeing during error
2348 recovery where individual stack frames might need to be freed.
2350 For this, the grammar author is required to define a `free_XX` function for
2351 each structure that is used by a non-terminal. `do_free` will call whichever
2352 is appropriate given a symbol number, and will call `free` (as is
2353 appropriate for tokens) on any terminal symbol.
2357 static void gen_free(FILE *f, struct grammar *g)
2361 fprintf(f, "#line 0 \"gen_free\"\n");
2362 fprintf(f, "static void do_free(short sym, void *asn)\n");
2364 fprintf(f, "\tif (!asn) return;\n");
2365 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2366 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2367 fprintf(f, "\tswitch(sym) {\n");
2369 for (i = 0; i < g->num_syms; i++) {
2370 struct symbol *s = g->symtab[i];
2372 s->type != Nonterminal ||
2373 s->struct_name.txt == NULL)
2376 fprintf(f, "\tcase %d:\n", s->num);
2378 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2380 s->struct_name.txt);
2381 fprintf(f, "\t\tfree(asn);\n");
2383 fprintf(f, "\t\tfree_%.*s(asn);\n",
2385 s->struct_name.txt);
2386 fprintf(f, "\t\tbreak;\n");
2388 fprintf(f, "\t}\n}\n\n");
2391 ## The main routine.
2393 There are three key parts to the "main" function of parsergen: processing
2394 the arguments, loading the grammar file, and dealing with the grammar.
2396 ### Argument processing.
2398 Command line options allow the selection of analysis mode, name of output
2399 file, and whether or not a report should be generated. By default we create
2400 a report only if no code output was requested.
2402 The `parse_XX` function name uses the basename of the output file which
2403 should not have a suffix (`.c`). `.c` is added to the given name for the
2404 code, and `.h` is added for the header (if header text is specified in the
2411 static const struct option long_options[] = {
2412 { "LR0", 0, NULL, '0' },
2413 { "LR05", 0, NULL, '5' },
2414 { "SLR", 0, NULL, 'S' },
2415 { "LALR", 0, NULL, 'L' },
2416 { "LR1", 0, NULL, '1' },
2417 { "tag", 1, NULL, 't' },
2418 { "report", 0, NULL, 'R' },
2419 { "output", 1, NULL, 'o' },
2420 { NULL, 0, NULL, 0 }
2422 const char *options = "05SL1t:Ro:";
2424 ###### process arguments
2426 char *outfile = NULL;
2431 enum grammar_type type = LR05;
2432 while ((opt = getopt_long(argc, argv, options,
2433 long_options, NULL)) != -1) {
2448 outfile = optarg; break;
2450 tag = optarg; break;
2452 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2457 infile = argv[optind++];
2459 fprintf(stderr, "No input file given\n");
2462 if (outfile && report == 1)
2465 if (name && strchr(name, '/'))
2466 name = strrchr(name, '/')+1;
2468 if (optind < argc) {
2469 fprintf(stderr, "Excess command line arguments\n");
2473 ### Loading the grammar file
2475 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2478 Once we have extracted the code (with `mdcode`) we expect to find three
2479 or four sections: header, code, grammar, reduce.
2480 Anything else that is not excluded by the `--tag` option is an error.
2482 "header", "code", and "reduce" are optional, though it is hard to build
2483 a working parser without the first two. "grammar" must be provided.
2487 #include <sys/mman.h>
2492 static void pr_err(char *msg)
2495 fprintf(stderr, "%s\n", msg);
2499 struct section *table;
2503 fd = open(infile, O_RDONLY);
2505 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2506 infile, strerror(errno));
2509 len = lseek(fd, 0, 2);
2510 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2511 table = code_extract(file, file+len, pr_err);
2513 struct code_node *hdr = NULL;
2514 struct code_node *code = NULL;
2515 struct code_node *gram = NULL;
2516 struct code_node *pre_reduce = NULL;
2517 for (s = table; s; s = s->next) {
2518 struct text sec = s->section;
2519 if (tag && !strip_tag(&sec, tag))
2521 if (text_is(sec, "header"))
2523 else if (text_is(sec, "code"))
2525 else if (text_is(sec, "grammar"))
2527 else if (text_is(sec, "reduce"))
2528 pre_reduce = s->code;
2530 fprintf(stderr, "Unknown content section: %.*s\n",
2531 s->section.len, s->section.txt);
2536 ### Processing the input
2538 As we need to append an extension to a filename and then open it for
2539 writing, and we need to do this twice, it helps to have a separate function.
2543 static FILE *open_ext(char *base, char *ext)
2545 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2547 strcat(strcpy(fn, base), ext);
2553 If we can read the grammar, then we analyse and optionally report on it. If
2554 the report finds conflicts we will exit with an error status.
2556 ###### process input
2557 struct grammar *g = NULL;
2559 fprintf(stderr, "No grammar section provided\n");
2562 g = grammar_read(gram);
2564 fprintf(stderr, "Failure to parse grammar\n");
2569 grammar_analyse(g, type);
2571 if (grammar_report(g, type))
2575 If a "headers" section is defined, we write it out and include a declaration
2576 for the `parse_XX` function so it can be used from separate code.
2578 if (rv == 0 && hdr && outfile) {
2579 FILE *f = open_ext(outfile, ".h");
2581 code_node_print(f, hdr, infile);
2582 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2586 fprintf(stderr, "Cannot create %s.h\n",
2592 And if all goes well and an output file was provided, we create the `.c`
2593 file with the code section (if any) and the parser tables and function.
/* Write the .c file: any "code" section verbatim, then the generated
 * parser tables and parse function. */
2595 if (rv == 0 && outfile) {
2596 FILE *f = open_ext(outfile, ".c");
2599 code_node_print(f, code, infile);
2600 gen_parser(f, g, infile, name, pre_reduce);
2603 fprintf(stderr, "Cannot create %s.c\n",
2609 And that about wraps it up. We need to set the locale so that UTF-8 is
2610 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2612 ###### File: parsergen.mk
2613 parsergen : parsergen.o libscanner.o libmdcode.o
2614 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
/* main(): set the locale so UTF-8 is recognised (libmdcode needs
 * libicuuc), then delegate to the literate "process arguments" chunk. */
2621 int main(int argc, char *argv[])
2626 setlocale(LC_ALL,"");
2628 ## process arguments
2635 ## The SHIFT/REDUCE parser
2637 Having analysed the grammar and generated all the tables, we only need
2638 the shift/reduce engine to bring it all together.
2640 ### Goto table lookup
2642 The parser generator has nicely provided us with goto tables sorted by
2643 symbol number. We need a binary search function to find a symbol in the
2646 ###### parser functions
/* search(): binary-search the state's goto table (sorted by symbol
 * number) for `sym`, returning the target state on a hit.  Excerpt:
 * the `lo` initialisation, the else branch and the miss return are
 * outside this view. */
2648 static int search(const struct state *l, int sym)
2651 int hi = l->go_to_cnt;
/* half-open search: narrow [lo, hi) until only go_to[lo] can match */
2655 while (lo + 1 < hi) {
2656 int mid = (lo + hi) / 2;
2657 if (l->go_to[mid].sym <= sym)
2662 if (l->go_to[lo].sym == sym)
2663 return l->go_to[lo].state;
2668 ### Memory allocation
2670 The `scanner` returns tokens in a local variable - we want them in allocated
2671 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2672 a reduce is in a large buffer. Both of these require some allocation and
2673 copying, hence `memdup` and `tok_copy`.
2675 ###### parser includes
2678 ###### parser functions
/* memdup(): copy `len` bytes of `m` into freshly allocated memory, so
 * scanner tokens and reduce results can live on the asn_stack.
 * tok_copy(): heap-copy a by-value token for the same reason.
 * (Excerpt: allocation checks/returns are outside this view.) */
2680 void *memdup(void *m, int len)
2686 memcpy(ret, m, len);
2690 static struct token *tok_copy(struct token tk)
2692 struct token *new = malloc(sizeof(*new));
2697 ### The state stack.
2699 The core data structure for the parser is the stack. This tracks all
2700 the symbols that have been recognised or partially recognised.
2702 The stack usually won't grow very large - maybe a few tens of entries.
2703 So we dynamically grow an array as required but never bother to
2704 shrink it down again.
2706 We keep the stack as two separate allocations. One, `asn_stack` stores
2707 the "abstract syntax nodes" which are created by each reduction. When
2708 we call `do_reduce` we need to pass an array of the `asn`s of the body
2709 of the production, and by keeping a separate `asn` stack, we can just
2710 pass a pointer into this stack.
2712 The other allocation stores all other stack fields of which there are
2713 several. The `state` is the most important one and guides the parsing
2714 process. The `sym` is nearly unnecessary as it is implicit in the
2715 state. However when we want to free entries from the `asn_stack`, it
2716 helps to know what type they are so we can call the right freeing
2717 function. The symbol leads us to the right free function through
2720 The `indents` count tracks the line indents within the symbol or
2721 immediately following it. These are used to allow indent information to
2722 guide parsing and error recovery.
2724 `since_newline` tracks how many stack frames since the last
2725 start-of-line (whether indented or not). So if `since_newline` is
2726 zero, then this symbol is at the start of a line. Similarly
2727 `since_indent` counts the number of states since an indent, it is zero
2728 precisely when `indents` is not zero.
2730 `newline_permitted` keeps track of whether newlines should be ignored
2733 The stack is most properly seen as alternating states and symbols -
2734 states, like the 'DOT' in items, are between symbols. Each frame in
2735 our stack holds a state and the symbol that was before it. The
2736 bottom of stack holds the start state but no symbol, as nothing came
2737 before the beginning. As we need to store some value, `TK_eof` is used
2738 to mark the beginning of the file as well as the end.
2740 ###### parser functions
/* Fields of the per-frame stack entry (excerpt; siblings not shown):
 * newline_permitted - whether a NEWLINE token may be shifted here;
 * since_newline - frames since the last start-of-line (0 == at SOL). */
2745 short newline_permitted;
2749 short since_newline;
2759 Two operations are needed on the stack - shift (which is like push) and pop.
2761 Shift applies not only to terminals but also to non-terminals. When we
2762 reduce a production we will pop off frames corresponding to the body
2763 symbols, then push on a frame for the head of the production. This last
2764 is exactly the same process as shifting in a terminal so we use the same
2765 function for both. In both cases we provide the symbol, the number of
2766 indents the symbol contains (which will be zero for a terminal symbol)
2767 and a flag indicating that the symbol was at (or was reduced from a
2768 symbol which was at) the start of a line. The state is deduced from the
2769 current top-of-stack state and the new symbol.
2771 To simplify other code we arrange for `shift` to fail if there is no `goto`
2772 state for the symbol. This is useful in basic parsing due to our design
2773 that we shift when we can, and reduce when we cannot. So the `shift`
2774 function reports if it could.
2776 `shift` is also used to push state zero onto the stack, so if the
2777 stack is empty, it always chooses zero as the next state.
2779 So `shift` finds the next state. If that succeeds it extends the
2780 allocations if needed and pushes all the information onto the stacks.
2782 Newlines are permitted after a `starts_line` state until an internal
2783 indent. If the new frame has neither a `starts_line` state nor an
2784 indent, newlines are permitted if the previous stack frame permitted
2787 ###### parser functions
/* shift(): push `sym` (terminal or reduced non-terminal) with its
 * `asn` onto the stacks, deducing the new state from the goto table
 * of the current top state.  Fails (per the surrounding text) when no
 * goto exists.  Excerpt: several lines, including the failure return
 * and some else branches, are outside this view. */
2789 static int shift(struct parser *p,
2790 short sym, short indents, short start_of_line,
2792 const struct state states[])
2794 // Push an entry onto the stack
2795 struct frame next = {0};
/* Empty stack means we are pushing the start state: state zero. */
2796 int newstate = p->tos
2797 ? search(&states[p->stack[p->tos-1].state],
/* Grow both parallel stacks together so indices stay in sync. */
2802 if (p->tos >= p->stack_size) {
2803 p->stack_size += 10;
2804 p->stack = realloc(p->stack, p->stack_size
2805 * sizeof(p->stack[0]));
2806 p->asn_stack = realloc(p->asn_stack, p->stack_size
2807 * sizeof(p->asn_stack[0]));
2810 next.indents = indents;
2811 next.state = newstate;
/* Newlines permitted after a starts_line state until an internal
 * indent; otherwise inherited from the previous frame (see prose). */
2812 if (states[newstate].starts_line)
2813 next.newline_permitted = 1;
2815 next.newline_permitted = 0;
2817 next.newline_permitted =
2818 p->stack[p->tos-1].newline_permitted;
2820 next.newline_permitted = 0;
2822 if (!start_of_line) {
2824 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2826 next.since_newline = 1;
/* since_indent is zero precisely when this frame carries indents. */
2829 next.since_indent = 0;
2831 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2833 next.since_indent = 1;
2835 p->stack[p->tos] = next;
2836 p->asn_stack[p->tos] = asn;
2841 `pop` primarily moves the top of stack (`tos`) back down the required
2842 amount and frees any `asn` entries that need to be freed. It also
2843 collects a summary of the indents and line starts in the symbols that
2844 are being removed. It is called _after_ we reduce a production, just
2845 before we `shift` the nonterminal in.
2847 ###### parser functions
/* pop(): drop `num` frames after a reduction, freeing each asn via
 * do_free() and summarising the removed frames' indents and
 * start-of-line status for the subsequent shift of the head symbol.
 * (Excerpt: tos adjustment and the return are outside this view.) */
2849 static int pop(struct parser *p, int num,
2850 short *start_of_line,
2851 void(*do_free)(short sym, void *asn))
2858 for (i = 0; i < num; i++) {
/* since_newline == 0 marks a frame at start-of-line */
2859 sol |= !p->stack[p->tos+i].since_newline;
2860 indents += p->stack[p->tos+i].indents;
2861 do_free(p->stack[p->tos+i].sym,
2862 p->asn_stack[p->tos+i]);
2865 *start_of_line = sol;
2869 ### The heart of the parser.
2871 Now we have the parser. For each token we might shift it, trigger a
2872 reduction, or start error handling. 2D tokens (IN, OUT, EOL) also need
2875 We return whatever `asn` was returned by reducing production zero.
2877 If we can neither shift nor reduce we have an error to handle. We pop
2878 single entries off the stack until we can shift the `TK_error` symbol,
2879 then drop input tokens until we find one we can shift into the new error
2880 state. We need to ensure that something is dropped or shifted after an
2881 error, or we could get into an infinite loop, only shifting in
2882 `TK_error`, then reducing. So we track if there has been a shift since
2883 the last error, and if not the next error always discards one token.
2885 When we find `TK_in` and `TK_out` tokens which report indents we need
2886 to handle them directly as the grammar cannot express what we want to
2889 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2890 record how many indents there are following the previous token.
2892 `TK_out` tokens must be canceled against an indent count
2893 within the stack. If we can reduce some symbols that are all since
2894 the most recent indent, then we do that first. If the minimum prefix
2895 of the current state then extends back before the most recent indent,
2896 that indent can be cancelled. If the minimum prefix is shorter, then
2897 the indent had ended prematurely and we must start error handling, which
2898 is still a work-in-progress.
2900 `TK_newline` tokens are ignored unless the top stack frame records
2901 that they are permitted. In that case they will not be considered for
2902 shifting if it is possible to reduce some symbols that are all since
2903 the most recent start of line. This is how a newline forcibly
2904 terminates any line-like structure - we try to reduce down to at most
2905 one symbol for each line where newlines are allowed.
2906 A consequence of this is that a rule like
2908 ###### Example: newlines - broken
2912 IfStatement -> Newlines if ....
2914 cannot work, as the NEWLINE will never be shifted as the empty string
2915 will be reduced first. Optional sets of newlines need to be included
2916 in the thing that precedes them:
2918 ###### Example: newlines - works
2922 IfStatement -> If ....
2924 Here the NEWLINE will be shifted because nothing can be reduced until
2927 When during error handling we discard tokens read in, we want to keep
2928 discarding until we see one that is recognised. If we had a full set
2929 of LR(1) grammar states, this would mean looking in the look-ahead set,
2930 but we don't keep a full look-ahead set. We only record the subset
2931 that leads to SHIFT. We can, however, deduce the look-ahead set by
2932 looking at the SHIFT subsets for all states that we can get to by
2933 reducing zero or more times. So we need a little function which
2934 checks if a given token is in any of these look-ahead sets.
2936 ###### parser includes
/* in_lookahead(): deduce whether `tk` is acceptable by checking the
 * SHIFT sets of every state reachable by zero or more reductions
 * (the tables store only the SHIFT subset of the look-ahead set).
 * (Excerpt: the success/failure returns are outside this view.) */
2941 static int in_lookahead(struct token *tk, const struct state *states, int state)
2943 while (state >= 0) {
2944 if (search(&states[state], tk->num) >= 0)
2946 if (states[state].reduce_prod < 0)
/* follow the goto for the symbol this state reduces to */
2948 state = search(&states[state], states[state].reduce_sym);
/* parser_run(): the shift/reduce engine.  Loops over input tokens,
 * handling the 2D tokens (IN/OUT/NEWLINE) specially, otherwise
 * shifting when possible, reducing when not, and falling back to
 * TK_error recovery.  Returns the asn produced by reducing production
 * zero.  NOTE(review): this is a non-contiguous excerpt; many lines
 * (else branches, returns, brace closures) are outside this view. */
2953 void *parser_run(struct token_state *tokens,
2954 const struct state states[],
2955 int (*do_reduce)(int, void**, struct token_config*, void*),
2956 void (*do_free)(short, void*),
2957 FILE *trace, const char *non_term[],
2958 struct token_config *config)
2960 struct parser p = { 0 };
2961 struct token *tk = NULL;
/* tracks whether anything was shifted since the last error, so
 * recovery always discards at least one token when it wasn't */
2963 int shift_since_err = 1;
/* TK_eof marks the beginning of file as well as the end (see prose) */
2966 shift(&p, TK_eof, 0, 1, NULL, states);
2967 while (!accepted && p.tos > 0) {
2968 struct token *err_tk;
2969 struct frame *tos = &p.stack[p.tos-1];
2971 tk = tok_copy(token_next(tokens));
2972 parser_trace(trace, &p,
2973 tk, states, non_term, config->known_count);
/* TK_in: record the indent in the top frame; it starts a line. */
2975 if (tk->num == TK_in) {
2977 tos->since_newline = 0;
2978 tos->since_indent = 0;
2979 if (!states[tos->state].starts_line)
2980 tos->newline_permitted = 0;
2983 parser_trace_action(trace, "Record");
/* TK_out: try to reduce within the indent first, then cancel the
 * indent against the frame that recorded it. */
2986 if (tk->num == TK_out) {
2987 if (states[tos->state].reduce_size >= 0 &&
2988 states[tos->state].reduce_size <= tos->since_indent)
2990 if (states[tos->state].min_prefix >= tos->since_indent) {
2992 struct frame *in = tos - tos->since_indent;
2994 if (in->indents == 0) {
2995 /* Reassess since_indent and newline_permitted */
2997 in->since_indent = in[-1].since_indent + 1;
2998 in->newline_permitted = in[-1].newline_permitted;
3000 in->since_indent = 0;
3001 in->newline_permitted = 0;
3003 if (states[in->state].starts_line)
3004 in->newline_permitted = 1;
3007 in->since_indent = in[-1].since_indent + 1;
3008 if (states[in->state].starts_line)
3009 in->newline_permitted = 1;
3011 in->newline_permitted = in[-1].newline_permitted;
3016 parser_trace_action(trace, "Cancel");
3019 // fall through to error handling as both SHIFT and REDUCE
/* TK_newline: ignored unless permitted; reduce first if all the
 * symbols since the start of line can be reduced. */
3022 if (tk->num == TK_newline) {
3023 if (!tos->newline_permitted) {
3026 parser_trace_action(trace, "Discard");
3029 if (tos->since_newline > 1 &&
3030 states[tos->state].reduce_size >= 0 &&
3031 states[tos->state].reduce_size <= tos->since_newline)
/* Plain shift: we shift when we can, reduce when we cannot. */
3034 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3035 shift_since_err = 1;
3037 parser_trace_action(trace, "Shift");
3041 if (states[tos->state].reduce_prod >= 0 &&
3042 states[tos->state].newline_only &&
3043 !(tk->num == TK_newline ||
3044 tk->num == TK_eof ||
3045 tk->num == TK_out ||
3046 (tos->indents == 0 && tos->since_newline == 0))) {
3047 /* Anything other than newline or out or eof
3048 * is an error unless we are already at start
3049 * of line, as this production must end at EOL.
3051 } else if (states[tos->state].reduce_prod >= 0) {
/* Reduce: run the production's code fragment into buf, pop the
 * body frames, then shift the head non-terminal. */
3054 const struct state *nextstate = &states[tos->state];
3055 int prod = nextstate->reduce_prod;
3056 int size = nextstate->reduce_size;
3058 static char buf[16*1024];
3059 short indents, start_of_line;
3061 body = p.asn_stack + (p.tos - size);
3063 bufsize = do_reduce(prod, body, config, buf);
3065 indents = pop(&p, size, &start_of_line,
3067 res = memdup(buf, bufsize);
3068 memset(buf, 0, bufsize);
3069 if (!shift(&p, nextstate->reduce_sym,
3070 indents, start_of_line,
/* only production zero (accept) may legitimately fail to shift */
3072 if (prod != 0) abort();
3076 parser_trace_action(trace, "Reduce");
3079 /* Error. We walk up the stack until we
3080 * find a state which will accept TK_error.
3081 * We then shift in TK_error and see what state
3082 * that takes us to.
3083 * Then we discard input tokens until
3084 * we find one that is acceptable.
3086 parser_trace_action(trace, "ERROR");
3087 short indents = 0, start_of_line;
3089 err_tk = tok_copy(*tk);
3091 shift(&p, TK_error, 0, 0,
3092 err_tk, states) == 0)
3093 // discard this state
3094 indents += pop(&p, 1, &start_of_line, do_free);
3097 // no state accepted TK_error
3100 if (!shift_since_err) {
3101 /* must discard at least one token to avoid
3104 if (tk->num == TK_eof)
3107 tk = tok_copy(token_next(tokens));
3109 shift_since_err = 0;
3110 tos = &p.stack[p.tos-1];
/* keep discarding until a token is in some look-ahead set */
3111 while (!in_lookahead(tk, states, tos->state) &&
3112 tk->num != TK_eof) {
3114 tk = tok_copy(token_next(tokens));
3115 shift_since_err = 1;
3116 if (tk->num == TK_in)
3118 if (tk->num == TK_out) {
3122 // FIXME update since_indent here
3125 tos->indents += indents;
/* final cleanup: free everything left on the stack */
3128 pop(&p, p.tos, NULL, do_free);
3134 ###### exported functions
/* Public prototype for the shift/reduce engine, exported so generated
 * parsers can link against libparser.  Must match the definition in
 * the "parser functions" section. */
3135 void *parser_run(struct token_state *tokens,
3136 const struct state states[],
3137 int (*do_reduce)(int, void**, struct token_config*, void*),
3138 void (*do_free)(short, void*),
3139 FILE *trace, const char *non_term[],
3140 struct token_config *config);
3144 Being able to visualize the parser in action can be invaluable when
3145 debugging the parser code, or trying to understand how the parse of a
3146 particular grammar progresses. The stack contains all the important
3147 state, so just printing out the stack every time around the parse loop
3148 can make it possible to see exactly what is happening.
3150 This doesn't explicitly show each SHIFT and REDUCE action. However they
3151 are easily deduced from the change between consecutive lines, and the
3152 details of each state can be found by cross referencing the states list
3153 in the stack with the "report" that parsergen can generate.
3155 For terminal symbols, we just dump the token. For non-terminals we
3156 print the name of the symbol. The look ahead token is reported at the
3157 end inside square brackets.
3159 ###### parser functions
/* Printable names for reserved token numbers, used by the tracing
 * code below.  (Excerpt: other entries and the closing brace are not
 * shown.) */
3161 static char *reserved_words[] = {
3162 [TK_error] = "ERROR",
3165 [TK_newline] = "NEWLINE",
/* parser_trace_state(): print one frame's state as "(Ns nK) " where
 * the optional 's' flags a starts_line state and 'nK' shows
 * since_newline when newlines are permitted. */
3168 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3170 fprintf(trace, "(%d", f->state);
3171 if (states[f->state].starts_line)
3172 fprintf(trace, "s");
3173 if (f->newline_permitted)
3174 fprintf(trace, "n%d", f->since_newline);
3175 fprintf(trace, ") ");
/* parser_trace(): dump the whole stack once per parse-loop iteration.
 * Terminals print as their token text (or reserved-word name),
 * non-terminals by name; the look-ahead token is shown at the end in
 * square brackets with its line:col.  (Non-contiguous excerpt.) */
3178 void parser_trace(FILE *trace, struct parser *p,
3179 struct token *tk, const struct state states[],
3180 const char *non_term[], int knowns)
3185 for (i = 0; i < p->tos; i++) {
3186 struct frame *f = &p->stack[i];
/* reserved tokens first, then known terminals, then non-terminals */
3189 if (sym < TK_reserved &&
3190 reserved_words[sym] != NULL)
3191 fputs(reserved_words[sym], trace);
3192 else if (sym < TK_reserved + knowns) {
3193 struct token *t = p->asn_stack[i];
3194 text_dump(trace, t->txt, 20);
3196 fputs(non_term[sym - TK_reserved - knowns],
3199 fprintf(trace, ".%d", f->indents);
3200 if (f->since_newline == 0)
3204 parser_trace_state(trace, f, states);
3206 fprintf(trace, "[");
3207 if (tk->num < TK_reserved &&
3208 reserved_words[tk->num] != NULL)
3209 fputs(reserved_words[tk->num], trace);
3211 text_dump(trace, tk->txt, 20);
3212 fprintf(trace, ":%d:%d]", tk->line, tk->col);
/* parser_trace_action(): append the action taken (Shift/Reduce/...)
 * to the trace line started by parser_trace(). */
3215 void parser_trace_action(FILE *trace, char *action)
3218 fprintf(trace, " - %s\n", action);
3223 The obvious example for a parser is a calculator.
3225 As `scanner` provides a number parsing function using `libgmp` it is not much
3226 work to perform arbitrary rational number calculations.
3228 This calculator takes one expression, or an equality test, per line.
3229 The results are printed and if any equality test fails, the program
3230 exits with an error.
3232 ###### File: parsergen.mk
3233 calc.c calc.h : parsergen parsergen.mdc
3234 ./parsergen --tag calc -o calc parsergen.mdc
3235 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3236 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3238 ./calc parsergen.mdc
3243 #include "parse_number.h"
3244 // what do we use for a demo-grammar? A calculator of course.
3256 #include <sys/mman.h>
3262 #include "scanner.h"
/* free_number(): release a parsed number asn (body not in this
 * excerpt).  text_is(): compare a length-counted `struct text` with a
 * NUL-terminated C string for exact equality. */
3267 static void free_number(struct number *n)
3273 static int text_is(struct text t, char *s)
3275 return (strlen(s) == t.len &&
3276 strncmp(s, t.txt, t.len) == 0);
/* calc main(): mmap the mdc file named by argv[1], extract its
 * sections, and run parse_calc() on each "example: input" section,
 * tracing to stderr when a second argument is given.
 * (Non-contiguous excerpt; error checks/frees partly not shown.) */
3279 int main(int argc, char *argv[])
3281 int fd = open(argv[1], O_RDONLY);
/* lseek(fd, 0, 2): 2 is SEEK_END, giving the file length for mmap */
3282 int len = lseek(fd, 0, 2);
3283 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3284 struct section *table = code_extract(file, file+len, NULL);
3286 struct token_config config = {
3287 .ignored = (1 << TK_line_comment)
3290 .number_chars = ".,_+-",
3294 for (s = table; s; s = s->next)
3295 if (text_is(s->section, "example: input"))
3296 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3298 struct section *t = table->next;
3299 code_free(table->code);
3311 Session -> Session Line
3314 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3315 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3316 gmp_printf(" or as a decimal: %Fg\n", fl);
3320 | Expression = Expression NEWLINE ${
3321 if (mpq_equal($1.val, $3.val))
3322 gmp_printf("Both equal %Qd\n", $1.val);
3324 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3329 | NEWLINE ${ printf("Blank line\n"); }$
3330 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3333 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3334 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3335 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3336 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3337 | Expression // Expression ${ {
3340 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3341 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3342 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3343 mpz_tdiv_q(z0, z1, z2);
3344 mpq_set_z($0.val, z0);
3345 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3347 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3348 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3353 3.1415926535 - 355/113
3355 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3357 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1