1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h` files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
66 and `Foo: grammar`. The tag `calc` is used to extract the same calculator
70 [scanner]: scanner.html
76 ###### parser includes
80 The grammar contains production sets, precedence/associativity
81 declarations, and data type declarations. These are all parsed with
82 _ad hoc_ parsing as we don't have a parser generator yet.
84 The precedence and associativity can be set for each production, but
85 can be inherited from symbols. The data type (either structure or a
86 reference to a structure) is potentially defined for each non-terminal
87 and describes what C structure is used to store information about each
91 enum assoc {Left, Right, Non};
92 char *assoc_names[] = {"Left","Right","Non"};
95 struct text struct_name;
98 unsigned short precedence;
102 unsigned short precedence;
110 The strings reported by `mdcode` and `scanner` are `struct text` which have
111 length rather than being null terminated. To help with printing and
112 comparing we define `text_is` and `prtxt`, which should possibly go in
113 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
114 which might contain control characters.
116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
117 because that is what we need to detect tags.
120 static int text_is(struct text t, char *s)
122 return (strlen(s) == t.len &&
123 strncmp(s, t.txt, t.len) == 0);
125 static void prtxt(struct text t)
127 printf("%.*s", t.len, t.txt);
130 static int strip_tag(struct text *t, char *tag)
132 int skip = strlen(tag) + 1;
133 if (skip >= t->len ||
134 strncmp(t->txt, tag, skip-1) != 0 ||
135 t->txt[skip-1] != ':')
137 while (skip < t->len && t->txt[skip] == ' ')
146 Productions are comprised primarily of symbols - terminal and
147 non-terminal. We do not make a syntactic distinction between the two,
148 though non-terminals must be identifiers. Non-terminal symbols are
149 those which appear in the head of a production, terminal symbols are
150 those which don't. There are also "virtual" symbols used for precedence
151 marking discussed later, and sometimes we won't know what type a symbol
154 ###### forward declarations
155 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
156 char *symtypes = "UVTN";
160 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
161 table of known symbols and the resulting parser will report them as
162 `TK_reserved + N`. A small set of identifiers are reserved for the
163 different token types that `scanner` can report.
166 static char *reserved_words[] = {
167 [TK_error] = "ERROR",
168 [TK_number] = "NUMBER",
169 [TK_ident] = "IDENTIFIER",
171 [TK_string] = "STRING",
172 [TK_multi_string] = "MULTI_STRING",
175 [TK_newline] = "NEWLINE",
181 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
182 recognised. The former is automatically expected at the end of the text
183 being parsed. The latter are always ignored by the parser.
185 All of these symbols are stored in a simple symbol table. We use the
186 `struct text` from `mdcode` to store the name and link them together
187 into a sorted list using an insertion sort.
189 We don't have separate `find` and `insert` functions as any symbol we
190 find needs to be remembered. We simply expect `find` to always return a
191 symbol, but its type might be `Unknown`.
200 ###### grammar fields
205 static struct symbol *sym_find(struct grammar *g, struct text s)
207 struct symbol **l = &g->syms;
212 (cmp = text_cmp((*l)->name, s)) < 0)
216 n = calloc(1, sizeof(*n));
225 static void symbols_init(struct grammar *g)
227 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
229 for (i = 0; i < entries; i++) {
232 t.txt = reserved_words[i];
235 t.len = strlen(t.txt);
242 ### Data types and precedence.
244 Data type specification and precedence specification are both
245 introduced by a dollar sign at the start of the line. If the next
246 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
247 precedence, otherwise it specifies a data type.
249 The data type name is simply stored and applied to the head of all
250 subsequent productions. It must be the name of a structure optionally
251 preceded by an asterisk which means a reference or "pointer". So
252 `$expression` maps to `struct expression` and `$*statement` maps to
253 `struct statement *`.
255 Any productions given before the first data type declaration will have
256 no data type associated with them and can carry no information. In
257 order to allow other non-terminals to have no type, the data type
258 `$void` can be given. This does *not* mean that `struct void` will be
259 used, but rather that no type will be associated with future
262 The precedence line must contain a list of symbols - typically
263 terminal symbols, but not necessarily. It can only contain symbols
264 that have not been seen yet, so precedence declaration must precede
265 the productions that they affect.
267 A precedence line may also contain a "virtual" symbol which is an
268 identifier preceded by `$$`. Virtual terminals carry precedence
269 information but are not included in the grammar. A production can
270 declare that it inherits the precedence of a given virtual symbol.
272 This use for `$$` precludes it from being used as a symbol in the
273 described language. Two other symbols: `${` and `}$` are also
276 Each new precedence line introduces a new precedence level and
277 declares how it associates. This level is stored in each symbol
278 listed and may be inherited by any production which uses the symbol. A
279 production inherits from the last symbol which has a precedence.
281 ###### grammar fields
282 struct text current_type;
287 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
288 static const char *known[] = { "$$", "${", "}$" };
291 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
293 struct token t = token_next(ts);
298 if (t.num != TK_ident) {
299 err = "type or assoc expected after '$'";
302 if (text_is(t.txt, "LEFT"))
304 else if (text_is(t.txt, "RIGHT"))
306 else if (text_is(t.txt, "NON"))
309 g->current_type = t.txt;
310 g->type_isref = isref;
311 if (text_is(t.txt, "void"))
312 g->current_type.txt = NULL;
314 if (t.num != TK_newline) {
315 err = "Extra tokens after type name";
322 err = "$* cannot be followed by a precedence";
326 // This is a precedence line, need some symbols.
330 while (t.num != TK_newline) {
331 enum symtype type = Terminal;
333 if (t.num == TK_virtual) {
336 if (t.num != TK_ident) {
337 err = "$$ must be followed by a word";
340 } else if (t.num != TK_ident &&
342 err = "Illegal token in precedence line";
345 s = sym_find(g, t.txt);
346 if (s->type != Unknown) {
347 err = "Symbols in precedence line must not already be known.";
351 s->precedence = g->prec_levels;
356 err = "No symbols given on precedence line";
360 while (t.num != TK_newline && t.num != TK_eof)
367 A production either starts with an identifier which is the head
368 non-terminal, or a vertical bar (`|`) in which case this production
369 uses the same head as the previous one. The identifier must be
370 followed by a `->` mark. All productions for a given non-terminal must
371 be grouped together with the `nonterminal ->` given only once.
373 After this a (possibly empty) sequence of identifiers and marks form
374 the body of the production. A virtual symbol may be given after the
375 body by preceding it with `$$`. If a virtual symbol is given, the
376 precedence of the production is that for the virtual symbol. If none
377 is given, the precedence is inherited from the last symbol in the
378 production which has a precedence specified.
380 After the optional precedence may come the `${` mark. This indicates
381 the start of a code fragment. If present, this must be on the same
382 line as the start of the production.
384 All of the text from the `${` through to the matching `}$` is
385 collected and forms the code-fragment for the production. It must all
386 be in one `code_node` of the literate code. The `}$` must be
387 at the end of a line.
389 Text in the code fragment will undergo substitutions where `$N` or
390 `$<N`, for some numeric `N`, will be replaced with a variable holding
391 the parse information for the particular symbol in the production.
392 `$0` is the head of the production, `$1` is the first symbol of the
393 body, etc. The type of `$N` for a terminal symbol is `struct token`.
394 For a non-terminal, it is whatever has been declared for that symbol.
395 The `<` may be included for symbols declared as storing a reference
396 (not a structure) and means that the reference is being moved out, so
397 it will not automatically be freed.
399 While building productions we will need to add to an array which needs to
403 static void array_add(void *varray, int *cnt, void *new)
405 void ***array = varray;
408 current = ((*cnt-1) | (step-1))+1;
409 if (*cnt == current) {
412 *array = realloc(*array, current * sizeof(void*));
414 (*array)[*cnt] = new;
418 Collecting the code fragment simply involves reading tokens until we
419 hit the end token `}$`, and noting the character position of the start and
423 static struct text collect_code(struct token_state *state,
428 code.txt = start.txt.txt + start.txt.len;
430 t = token_next(state);
431 while (t.node == start.node &&
432 t.num != TK_close && t.num != TK_error &&
434 if (t.num == TK_close && t.node == start.node)
435 code.len = t.txt.txt - code.txt;
441 Now we have all the bits we need to parse a full production.
446 ###### grammar fields
447 struct production **productions;
448 int production_count;
450 ###### production fields
452 struct symbol **body;
457 int first_production;
460 static char *parse_production(struct grammar *g,
462 struct token_state *state)
464 /* Head has already been parsed. */
467 struct production p, *pp;
469 memset(&p, 0, sizeof(p));
471 tk = token_next(state);
472 while (tk.num == TK_ident || tk.num == TK_mark) {
473 struct symbol *bs = sym_find(g, tk.txt);
474 if (bs->type == Unknown)
476 if (bs->type == Virtual) {
477 err = "Virtual symbol not permitted in production";
480 if (bs->precedence) {
481 p.precedence = bs->precedence;
484 array_add(&p.body, &p.body_size, bs);
485 tk = token_next(state);
487 if (tk.num == TK_virtual) {
489 tk = token_next(state);
490 if (tk.num != TK_ident) {
491 err = "word required after $$";
494 vs = sym_find(g, tk.txt);
495 if (vs->type != Virtual) {
496 err = "symbol after $$ must be virtual";
499 p.precedence = vs->precedence;
501 tk = token_next(state);
503 if (tk.num == TK_open) {
504 p.code = collect_code(state, tk);
505 if (p.code.txt == NULL) {
506 err = "code fragment not closed properly";
509 tk = token_next(state);
511 if (tk.num != TK_newline && tk.num != TK_eof) {
512 err = "stray tokens at end of line";
515 pp = malloc(sizeof(*pp));
517 array_add(&g->productions, &g->production_count, pp);
520 while (tk.num != TK_newline && tk.num != TK_eof)
521 tk = token_next(state);
525 With the ability to parse production and dollar-lines, we have nearly all
526 that we need to parse a grammar from a `code_node`.
528 The head of the first production will effectively be the `start` symbol of
529 the grammar. However it won't _actually_ be so. Processing the grammar is
530 greatly simplified if the real start symbol only has a single production,
531 and expects `$eof` as the final terminal. So when we find the first
532 explicit production we insert an extra production as production zero which
535 ###### Example: production 0
538 where `START` is the first non-terminal given.
540 ###### create production zero
541 struct production *p = calloc(1,sizeof(*p));
542 struct text start = {"$start",6};
543 struct text eof = {"$eof",4};
544 struct text code = {"$0 = $<1;", 9};
545 p->head = sym_find(g, start);
546 p->head->type = Nonterminal;
547 p->head->struct_name = g->current_type;
548 p->head->isref = g->type_isref;
549 if (g->current_type.txt)
551 array_add(&p->body, &p->body_size, head);
552 array_add(&p->body, &p->body_size, sym_find(g, eof));
553 p->head->first_production = g->production_count;
554 array_add(&g->productions, &g->production_count, p);
556 Now we are ready to read in the grammar. We ignore comments
557 and strings so that the marks which introduce them can be
558 used as terminals in the grammar. We don't ignore numbers
559 even though we don't allow them as that causes the scanner
560 to produce errors that the parser is better positioned to handle.
563 static struct grammar *grammar_read(struct code_node *code)
565 struct token_config conf = {
568 .words_marks = known,
569 .known_count = sizeof(known)/sizeof(known[0]),
571 .ignored = (1 << TK_line_comment)
572 | (1 << TK_block_comment)
575 | (1 << TK_multi_string)
580 struct token_state *state = token_open(code, &conf);
582 struct symbol *head = NULL;
586 g = calloc(1, sizeof(*g));
589 for (tk = token_next(state); tk.num != TK_eof;
590 tk = token_next(state)) {
591 if (tk.num == TK_newline)
593 if (tk.num == TK_ident) {
595 head = sym_find(g, tk.txt);
596 if (head->type == Nonterminal)
597 err = "This non-terminal has already be used.";
598 else if (head->type == Virtual)
599 err = "Virtual symbol not permitted in head of production";
601 head->type = Nonterminal;
602 head->struct_name = g->current_type;
603 head->isref = g->type_isref;
604 if (g->production_count == 0) {
605 ## create production zero
607 head->first_production = g->production_count;
608 tk = token_next(state);
609 if (tk.num == TK_mark &&
610 text_is(tk.txt, "->"))
611 err = parse_production(g, head, state);
613 err = "'->' missing in production";
615 } else if (tk.num == TK_mark
616 && text_is(tk.txt, "|")) {
617 // another production for same non-term
619 err = parse_production(g, head, state);
621 err = "First production must have a head";
622 } else if (tk.num == TK_mark
623 && text_is(tk.txt, "$")) {
624 err = dollar_line(state, g, 0);
625 } else if (tk.num == TK_mark
626 && text_is(tk.txt, "$*")) {
627 err = dollar_line(state, g, 1);
629 err = "Unrecognised token at start of line.";
637 fprintf(stderr, "Error at line %d: %s\n",
644 ## Analysing the grammar
646 The central task in analysing the grammar is to determine a set of
647 states to drive the parsing process. This will require finding
648 various sets of symbols and of "items". Some of these sets will need
649 to attach information to each element in the set, so they are more
652 Each "item" may have a 'look-ahead' or `LA` set associated with
653 it. Many of these will be the same as each other. To avoid memory
654 wastage, and to simplify some comparisons of sets, these sets will be
655 stored in a list of unique sets, each assigned a number.
657 Once we have the data structures in hand to manage these sets and
658 lists, we can start setting the 'nullable' flag, build the 'FIRST'
659 sets, and then create the item sets which define the various states.
663 Though we don't only store symbols in these sets, they are the main
664 things we store, so they are called symbol sets or "symsets".
666 A symset has a size, an array of shorts, and an optional array of data
667 values, which are also shorts. If the array of data is not present,
668 we store an impossible pointer, as `NULL` is used to indicate that no
669 memory has been allocated yet.
674 unsigned short *syms, *data;
676 #define NO_DATA ((unsigned short *)1)
677 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
678 const struct symset INIT_DATASET = { 0, NULL, NULL };
680 The arrays of shorts are allocated in blocks of 8 and are kept sorted
681 by using an insertion sort. We don't explicitly record the amount of
682 allocated space as it can be derived directly from the current `cnt` using
683 `((cnt - 1) | 7) + 1`.
686 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
689 int current = ((s->cnt-1) | 7) + 1;
690 if (current == s->cnt) {
692 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
693 if (s->data != NO_DATA)
694 s->data = realloc(s->data, sizeof(*s->data) * current);
697 while (i > 0 && s->syms[i-1] > key) {
698 s->syms[i] = s->syms[i-1];
699 if (s->data != NO_DATA)
700 s->data[i] = s->data[i-1];
704 if (s->data != NO_DATA)
709 Finding a symbol (or item) in a `symset` uses a simple binary search.
710 We return the index where the value was found (so data can be accessed),
711 or `-1` to indicate failure.
713 static int symset_find(struct symset *ss, unsigned short key)
720 while (lo + 1 < hi) {
721 int mid = (lo + hi) / 2;
722 if (ss->syms[mid] <= key)
727 if (ss->syms[lo] == key)
732 We will often want to form the union of two symsets. When we do, we
733 will often want to know if anything changed (as that might mean there
734 is more work to do). So `symset_union` reports whether anything was
735 added to the first set. We use a slow quadratic approach as these
736 sets don't really get very big. If profiling shows this to be a problem it
737 can be optimised later.
739 static int symset_union(struct symset *a, struct symset *b)
743 for (i = 0; i < b->cnt; i++)
744 if (symset_find(a, b->syms[i]) < 0) {
745 unsigned short data = 0;
746 if (b->data != NO_DATA)
748 symset_add(a, b->syms[i], data);
754 And of course we must be able to free a symset.
756 static void symset_free(struct symset ss)
759 if (ss.data != NO_DATA)
765 Some symsets are simply stored somewhere appropriate in the `grammar`
766 data structure, others needs a bit of indirection. This applies
767 particularly to "LA" sets. These will be paired with "items" in an
768 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
769 stored in the `data` field, so we need a mapping from a small (short)
770 number to an LA `symset`.
772 As mentioned earlier this involves creating a list of unique symsets.
774 For now, we just use a linear list sorted by insertion. A skiplist
775 would be more efficient and may be added later.
782 struct setlist *next;
785 ###### grammar fields
786 struct setlist *sets;
791 static int ss_cmp(struct symset a, struct symset b)
794 int diff = a.cnt - b.cnt;
797 for (i = 0; i < a.cnt; i++) {
798 diff = (int)a.syms[i] - (int)b.syms[i];
805 static int save_set(struct grammar *g, struct symset ss)
807 struct setlist **sl = &g->sets;
811 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
818 s = malloc(sizeof(*s));
827 Finding a set by number is currently performed by a simple linear search.
828 If this turns out to hurt performance, we can store the sets in a dynamic
829 array like the productions.
831 static struct symset set_find(struct grammar *g, int num)
833 struct setlist *sl = g->sets;
834 while (sl && sl->num != num)
840 ### Setting `nullable`
842 We set `nullable` on the head symbol for any production for which all
843 the body symbols (if any) are nullable. As this is a recursive
844 definition, any change in the `nullable` setting means that we need to
845 re-evaluate where it needs to be set.
847 We simply loop around performing the same calculations until no more
854 static void set_nullable(struct grammar *g)
857 while (check_again) {
860 for (p = 0; p < g->production_count; p++) {
861 struct production *pr = g->productions[p];
864 if (pr->head->nullable)
866 for (s = 0; s < pr->body_size; s++)
867 if (! pr->body[s]->nullable)
869 if (s == pr->body_size) {
870 pr->head->nullable = 1;
877 ### Setting `can_eol` and `line_like`
879 In order to be able to ignore newline tokens when not relevant, but
880 still include them in the parse when needed, we will need to know
881 which states can start a "line-like" section of code. We ignore
882 newlines when there is an indent since the most recent start of a
885 To know which symbols are line-like, we first need to know which
886 symbols start with a NEWLINE token. Any symbol which is followed by a
887 NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
888 Certainly when trying to parse one of these we must take note of NEWLINEs.
890 Clearly the `TK_newline` token can start with a NEWLINE. Any symbol
891 which is the head of a production that contains a starts-with-NEWLINE
892 symbol preceded only by nullable symbols is also a
893 starts-with-NEWLINE symbol. We use a new field `can_eol` to record
894 this attribute of symbols, and compute it in a repetitive manner
895 similar to `set_nullable`.
897 Once we have that, we can determine which symbols are `line_like` by
898 seeing which are followed by a `can_eol` symbol in any production.
905 static void set_can_eol(struct grammar *g)
908 g->symtab[TK_newline]->can_eol = 1;
909 while (check_again) {
912 for (p = 0; p < g->production_count; p++) {
913 struct production *pr = g->productions[p];
916 if (pr->head->can_eol)
919 for (s = 0 ; s < pr->body_size; s++) {
920 if (pr->body[s]->can_eol) {
921 pr->head->can_eol = 1;
925 if (!pr->body[s]->nullable)
932 static void set_line_like(struct grammar *g)
935 for (p = 0; p < g->production_count; p++) {
936 struct production *pr = g->productions[p];
939 for (s = 1; s < pr->body_size; s++)
940 if (pr->body[s]->can_eol)
941 pr->body[s-1]->line_like = 1;
945 ### Building the `first` sets
947 When calculating what can follow a particular non-terminal, we will need to
948 know what the "first" terminal in any subsequent non-terminal might be. So
949 we calculate the `first` set for every non-terminal and store them in an
950 array. We don't bother recording the "first" set for terminals as they are
953 As the "first" for one symbol might depend on the "first" of another,
954 we repeat the calculations until no changes happen, just like with
955 `set_nullable`. This makes use of the fact that `symset_union`
956 reports if any change happens.
958 The core of this, which finds the "first" of part of a production body,
959 will be reused for computing the "follow" sets, so we split it out
960 into a separate function.
962 ###### grammar fields
963 struct symset *first;
967 static int add_first(struct production *pr, int start,
968 struct symset *target, struct grammar *g,
973 for (s = start; s < pr->body_size; s++) {
974 struct symbol *bs = pr->body[s];
975 if (bs->type == Terminal) {
976 if (symset_find(target, bs->num) < 0) {
977 symset_add(target, bs->num, 0);
981 } else if (symset_union(target, &g->first[bs->num]))
987 *to_end = (s == pr->body_size);
991 static void build_first(struct grammar *g)
995 g->first = calloc(g->num_syms, sizeof(g->first[0]));
996 for (s = 0; s < g->num_syms; s++)
997 g->first[s] = INIT_SYMSET;
999 while (check_again) {
1002 for (p = 0; p < g->production_count; p++) {
1003 struct production *pr = g->productions[p];
1004 struct symset *head = &g->first[pr->head->num];
1006 if (add_first(pr, 0, head, g, NULL))
1012 ### Building the `follow` sets.
1014 There are two different situations when we will want to generate "follow"
1015 sets. If we are doing an SLR analysis, we want to generate a single
1016 "follow" set for each non-terminal in the grammar. That is what is
1017 happening here. If we are doing an LALR or LR analysis we will want
1018 to generate a separate "LA" set for each item. We do that later
1019 in state generation.
1021 There are two parts to generating a "follow" set. Firstly we look at
1022 every place that any non-terminal appears in the body of any
1023 production, and we find the set of possible "first" symbols after
1024 there. This is done using `add_first` above and only needs to be done
1025 once as the "first" sets are now stable and will not change.
1029 for (p = 0; p < g->production_count; p++) {
1030 struct production *pr = g->productions[p];
1033 for (b = 0; b < pr->body_size - 1; b++) {
1034 struct symbol *bs = pr->body[b];
1035 if (bs->type == Terminal)
1037 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1041 The second part is to add the "follow" set of the head of a production
1042 to the "follow" sets of the final symbol in the production, and any
1043 other symbol which is followed only by `nullable` symbols. As this
1044 depends on "follow" itself we need to repeatedly perform the process
1045 until no further changes happen.
1049 for (again = 0, p = 0;
1050 p < g->production_count;
1051 p < g->production_count-1
1052 ? p++ : again ? (again = 0, p = 0)
1054 struct production *pr = g->productions[p];
1057 for (b = pr->body_size - 1; b >= 0; b--) {
1058 struct symbol *bs = pr->body[b];
1059 if (bs->type == Terminal)
1061 if (symset_union(&g->follow[bs->num],
1062 &g->follow[pr->head->num]))
1069 We now just need to create and initialise the `follow` list to get a
1072 ###### grammar fields
1073 struct symset *follow;
1076 static void build_follow(struct grammar *g)
1079 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1080 for (s = 0; s < g->num_syms; s++)
1081 g->follow[s] = INIT_SYMSET;
1085 ### Building itemsets and states
1087 There are three different levels of detail that can be chosen for
1088 building the itemsets and states for the LR grammar. They are:
1090 1. LR(0) or SLR(1), where no look-ahead is considered.
1091 2. LALR(1) where we build look-ahead sets with each item and merge
1092 the LA sets when we find two paths to the same "kernel" set of items.
1093 3. LR(1) where different look-ahead for any item in the set means
1094 a different state must be created.
1096 ###### forward declarations
1097 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1099 We need to be able to look through existing states to see if a newly
1100 generated state already exists. For now we use a simple sorted linked
1103 An item is a pair of numbers: the production number and the position of
1104 "DOT", which is an index into the body of the production.
1105 As the numbers are not enormous we can combine them into a single "short"
1106 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1107 production number (so 4000 productions with maximum of 15 symbols in the
1110 Comparing the itemsets will be a little different to comparing symsets
1111 as we want to do the lookup after generating the "kernel" of an
1112 itemset, so we need to ignore the offset=zero items which are added during
1115 To facilitate this, we modify the "DOT" number so that "0" sorts to
1116 the end of the list in the symset, and then only compare items before
1120 static inline unsigned short item_num(int production, int index)
1122 return production | ((31-index) << 11);
1124 static inline int item_prod(unsigned short item)
1126 return item & 0x7ff;
1128 static inline int item_index(unsigned short item)
1130 return (31-(item >> 11)) & 0x1f;
1133 For LR(1) analysis we need to compare not just the itemset in a state
1134 but also the LA sets. As we assign each unique LA set a number, we
1135 can just compare the symset and the data values together.
1138 static int itemset_cmp(struct symset a, struct symset b,
1139 enum grammar_type type)
1145 i < a.cnt && i < b.cnt &&
1146 item_index(a.syms[i]) > 0 &&
1147 item_index(b.syms[i]) > 0;
1149 int diff = a.syms[i] - b.syms[i];
1153 diff = a.data[i] - b.data[i];
1158 if (i == a.cnt || item_index(a.syms[i]) == 0)
1162 if (i == b.cnt || item_index(b.syms[i]) == 0)
1168 if (type < LR1 || av == -1)
1171 a.data[i] - b.data[i];
1174 And now we can build the list of itemsets. The lookup routine returns
1175 both a success flag and a pointer to where in the list an insert
1176 should happen, so we don't need to search a second time.
1178 FIXME: document min_prefix
1182 struct itemset *next;
1184 struct symset items;
1185 struct symset go_to;
1191 ###### grammar fields
1192 struct itemset *items;
1196 static int itemset_find(struct grammar *g, struct itemset ***where,
1197 struct symset kernel, enum grammar_type type)
1199 struct itemset **ip;
1201 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1202 struct itemset *i = *ip;
1204 diff = itemset_cmp(i->items, kernel, type);
1217 Adding an itemset may require merging the LA sets if LALR analysis is
1218 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1219 clear the `completed` flag so that the dependants of this itemset will be
1220 recalculated and their LA sets updated.
1222 `add_itemset` must consume the symsets it is passed, either by adding
1223 them to a data structure, or freeing them.
1225 static int add_itemset(struct grammar *g, struct symset ss,
1226 enum grammar_type type)
1228 struct itemset **where, *is;
1230 int found = itemset_find(g, &where, ss, type);
1232 is = calloc(1, sizeof(*is));
1233 is->state = g->states;
1237 is->go_to = INIT_DATASET;
1246 for (i = 0; i < ss.cnt; i++) {
1247 struct symset temp = INIT_SYMSET, s;
1248 if (ss.data[i] == is->items.data[i])
1250 s = set_find(g, is->items.data[i]);
1251 symset_union(&temp, &s);
1252 s = set_find(g, ss.data[i]);
1253 if (symset_union(&temp, &s)) {
1254 is->items.data[i] = save_set(g, temp);
1265 To build all the itemsets, we first insert the initial itemset made
1266 from production zero, complete each itemset, and then generate new
1267 itemsets from old until no new ones can be made.
1269 Completing an itemset means finding all the items where "DOT" is followed by
1270 a nonterminal and adding "DOT=0" items for every production from that
1271 non-terminal - providing each item hasn't already been added.
1273 If LA sets are needed, the LA set for each new item is found using
1274 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1275 be supplemented by the LA set for the item which produces the new item. This may
1277 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1278 is used in the next stage.
1279 If any of these symbols are flagged as starting a line, then this
1280 state must be a `starts_line` state so now is a good time to record that.
1282 NOTE: precedence handling should happen here - I haven't written this yet
1285 ###### complete itemset
1286 for (i = 0; i < is->items.cnt; i++) {
1287 int p = item_prod(is->items.syms[i]);
1288 int bs = item_index(is->items.syms[i]);
1289 struct production *pr = g->productions[p];
1292 struct symset LA = INIT_SYMSET;
1293 unsigned short sn = 0;
1295 if (is->min_prefix == 0 ||
1296 (bs > 0 && bs < is->min_prefix))
1297 is->min_prefix = bs;
1298 if (bs == pr->body_size)
1301 if (symset_find(&done, s->num) < 0) {
1302 symset_add(&done, s->num, 0);
1304 is->starts_line = 1;
1306 if (s->type != Nonterminal)
1312 add_first(pr, bs+1, &LA, g, &to_end);
1314 struct symset ss = set_find(g, is->items.data[i]);
1315 symset_union(&LA, &ss);
1317 sn = save_set(g, LA);
1318 LA = set_find(g, sn);
1321 /* Add productions for this symbol */
1322 for (p2 = s->first_production;
1323 p2 < g->production_count &&
1324 g->productions[p2]->head == s;
1326 int itm = item_num(p2, 0);
1327 int pos = symset_find(&is->items, itm);
1329 symset_add(&is->items, itm, sn);
1330 /* Will have re-ordered, so start
1331 * from beginning again */
1333 } else if (type >= LALR) {
1334 struct symset ss = set_find(g, is->items.data[pos]);
1335 struct symset tmp = INIT_SYMSET;
1337 symset_union(&tmp, &ss);
1338 if (symset_union(&tmp, &LA)) {
1339 is->items.data[pos] = save_set(g, tmp);
1347 For each symbol we found (and placed in `done`) we collect all the items for
1348 which this symbol is next, and create a new itemset with "DOT" advanced over
1349 the symbol. This is then added to the collection of itemsets (or merged
1350 with a pre-existing itemset).
1352 ###### derive itemsets
1353 // Now we have a completed itemset, so we need to
1354 // compute all the 'next' itemsets and create them
1355 // if they don't exist.
1356 for (i = 0; i < done.cnt; i++) {
1358 unsigned short state;
1359 struct symbol *sym = g->symtab[done.syms[i]];
1360 struct symset newitemset = INIT_SYMSET;
1362 newitemset = INIT_DATASET;
1364 for (j = 0; j < is->items.cnt; j++) {
1365 int itm = is->items.syms[j];
1366 int p = item_prod(itm);
1367 int bp = item_index(itm);
1368 struct production *pr = g->productions[p];
1369 unsigned short la = 0;
1372 if (bp == pr->body_size)
1374 if (pr->body[bp] != sym)
1377 la = is->items.data[j];
1378 pos = symset_find(&newitemset, pr->head->num);
1380 symset_add(&newitemset, item_num(p, bp+1), la);
1381 else if (type >= LALR) {
1382 // Need to merge la set.
1383 int la2 = newitemset.data[pos];
1385 struct symset ss = set_find(g, la2);
1386 struct symset LA = INIT_SYMSET;
1387 symset_union(&LA, &ss);
1388 ss = set_find(g, la);
1389 if (symset_union(&LA, &ss))
1390 newitemset.data[pos] = save_set(g, LA);
1396 state = add_itemset(g, newitemset, type);
1397 if (symset_find(&is->go_to, done.syms[i]) < 0)
1398 symset_add(&is->go_to, done.syms[i], state);
1401 All that is left is to create the initial itemset from production zero, and
1402 with `TK_eof` as the LA set.
1405 static void build_itemsets(struct grammar *g, enum grammar_type type)
1407 struct symset first = INIT_SYMSET;
1410 unsigned short la = 0;
1412 // LA set just has eof
1413 struct symset eof = INIT_SYMSET;
1414 symset_add(&eof, TK_eof, 0);
1415 la = save_set(g, eof);
1416 first = INIT_DATASET;
1418 // production 0, offset 0 (with no data)
1419 symset_add(&first, item_num(0, 0), la);
1420 add_itemset(g, first, type);
1421 for (again = 0, is = g->items;
1423 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1425 struct symset done = INIT_SYMSET;
1436 ### Completing the analysis.
1438 The exact process of analysis depends on which level was requested. For
1439 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1440 `LR1` we need first, but not follow. For `SLR` we need both.
1442 We don't build the "action" tables that you might expect as the parser
1443 doesn't use them. They will be calculated to some extent if needed for
1446 Once we have built everything we allocate arrays for the two lists:
1447 symbols and itemsets. This allows more efficient access during reporting.
1448 The symbols are grouped as terminals and non-terminals and we record the
1449 changeover point in `first_nonterm`.
1451 ###### grammar fields
1452 struct symbol **symtab;
1453 struct itemset **statetab;
1456 ###### grammar_analyse
1458 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1462 int snum = TK_reserved;
1463 for (s = g->syms; s; s = s->next)
1464 if (s->num < 0 && s->type == Terminal) {
1468 g->first_nonterm = snum;
1469 for (s = g->syms; s; s = s->next)
1475 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1476 for (s = g->syms; s; s = s->next)
1477 g->symtab[s->num] = s;
1488 build_itemsets(g, type);
1490 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1491 for (is = g->items; is ; is = is->next)
1492 g->statetab[is->state] = is;
1495 ## Reporting on the Grammar
1497 The purpose of the report is to give the grammar developer insight into
1498 how the grammar parser will work. It is basically a structured dump of
1499 all the tables that have been generated, plus a description of any conflicts.
1501 ###### grammar_report
1502 static int grammar_report(struct grammar *g, enum grammar_type type)
1508 return report_conflicts(g, type);
1511 Firstly we have the complete list of symbols, together with the
1512 "FIRST" set if that was generated. We add a mark to each symbol to
1513 show if it can end in a newline (`>`), if it is considered to be
1514 "line-like" (`<`), or if it is nullable (`.`).
1518 static void report_symbols(struct grammar *g)
1522 printf("SYMBOLS + FIRST:\n");
1524 printf("SYMBOLS:\n");
1526 for (n = 0; n < g->num_syms; n++) {
1527 struct symbol *s = g->symtab[n];
1531 printf(" %c%c%c%3d%c: ",
1532 s->nullable ? '.':' ',
1533 s->can_eol ? '>':' ',
1534 s->line_like ? '<':' ',
1535 s->num, symtypes[s->type]);
1538 printf(" (%d%s)", s->precedence,
1539 assoc_names[s->assoc]);
1541 if (g->first && s->type == Nonterminal) {
1544 for (j = 0; j < g->first[n].cnt; j++) {
1547 prtxt(g->symtab[g->first[n].syms[j]]->name);
1554 Then we have the follow sets if they were computed.
1556 static void report_follow(struct grammar *g)
1559 printf("FOLLOW:\n");
1560 for (n = 0; n < g->num_syms; n++)
1561 if (g->follow[n].cnt) {
1565 prtxt(g->symtab[n]->name);
1566 for (j = 0; j < g->follow[n].cnt; j++) {
1569 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1575 And finally the item sets. These include the GO TO tables and, for
1576 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1577 it up a bit. First the items, with production number and associativity.
1579 static void report_item(struct grammar *g, int itm)
1581 int p = item_prod(itm);
1582 int dot = item_index(itm);
1583 struct production *pr = g->productions[p];
1587 prtxt(pr->head->name);
1589 for (i = 0; i < pr->body_size; i++) {
1590 printf(" %s", (dot == i ? ". ": ""));
1591 prtxt(pr->body[i]->name);
1593 if (dot == pr->body_size)
1597 printf(" (%d%s)", pr->precedence,
1598 assoc_names[pr->assoc]);
1602 The LA sets which are (possibly) reported with each item:
1604 static void report_la(struct grammar *g, int lanum)
1606 struct symset la = set_find(g, lanum);
1610 printf(" LOOK AHEAD(%d)", lanum);
1611 for (i = 0; i < la.cnt; i++) {
1614 prtxt(g->symtab[la.syms[i]]->name);
1619 Then the go to sets:
1622 static void report_goto(struct grammar *g, struct symset gt)
1627 for (i = 0; i < gt.cnt; i++) {
1629 prtxt(g->symtab[gt.syms[i]]->name);
1630 printf(" -> %d\n", gt.data[i]);
1634 Now we can report all the item sets complete with items, LA sets, and GO TO.
1636 static void report_itemsets(struct grammar *g)
1639 printf("ITEM SETS(%d)\n", g->states);
1640 for (s = 0; s < g->states; s++) {
1642 struct itemset *is = g->statetab[s];
1643 printf(" Itemset %d:%s min prefix=%d\n",
1644 s, is->starts_line?" (startsline)":"", is->min_prefix);
1645 for (j = 0; j < is->items.cnt; j++) {
1646 report_item(g, is->items.syms[j]);
1647 if (is->items.data != NO_DATA)
1648 report_la(g, is->items.data[j]);
1650 report_goto(g, is->go_to);
1654 ### Reporting conflicts
1656 Conflict detection varies a lot among different analysis levels. However
1657 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1658 LR1 are also similar as they have FOLLOW or LA sets.
1662 ## conflict functions
1664 static int report_conflicts(struct grammar *g, enum grammar_type type)
1667 printf("Conflicts:\n");
1669 cnt = conflicts_lr0(g, type);
1671 cnt = conflicts_slr(g, type);
1673 printf(" - no conflicts\n");
1677 LR0 conflicts are any state which have both a reducible item and
1678 a shiftable item, or two reducible items.
1680 LR05 conflicts only occur if two possible reductions exist,
1681 as shifts always over-ride reductions.
1683 ###### conflict functions
1684 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1688 for (i = 0; i < g->states; i++) {
1689 struct itemset *is = g->statetab[i];
1690 int last_reduce = -1;
1691 int prev_reduce = -1;
1692 int last_shift = -1;
1696 for (j = 0; j < is->items.cnt; j++) {
1697 int itm = is->items.syms[j];
1698 int p = item_prod(itm);
1699 int bp = item_index(itm);
1700 struct production *pr = g->productions[p];
1702 if (bp == pr->body_size) {
1703 prev_reduce = last_reduce;
1707 if (pr->body[bp]->type == Terminal)
1710 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1711 printf(" State %d has both SHIFT and REDUCE:\n", i);
1712 report_item(g, is->items.syms[last_shift]);
1713 report_item(g, is->items.syms[last_reduce]);
1716 if (prev_reduce >= 0) {
1717 printf(" State %d has 2 (or more) reducible items\n", i);
1718 report_item(g, is->items.syms[prev_reduce]);
1719 report_item(g, is->items.syms[last_reduce]);
1726 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1727 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1728 in the source of the look ahead set.
1730 We build two datasets to reflect the "action" table: one which maps
1731 terminals to items where that terminal could be shifted and another
1732 which maps terminals to items that could be reduced when the terminal
1733 is in look-ahead. We report when we get conflicts between the two.
1735 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1740 for (i = 0; i < g->states; i++) {
1741 struct itemset *is = g->statetab[i];
1742 struct symset shifts = INIT_DATASET;
1743 struct symset reduce = INIT_DATASET;
1747 /* First collect the shifts */
1748 for (j = 0; j < is->items.cnt; j++) {
1749 unsigned short itm = is->items.syms[j];
1750 int p = item_prod(itm);
1751 int bp = item_index(itm);
1752 struct production *pr = g->productions[p];
1754 if (bp < pr->body_size &&
1755 pr->body[bp]->type == Terminal) {
1757 int sym = pr->body[bp]->num;
1758 if (symset_find(&shifts, sym) < 0)
1759 symset_add(&shifts, sym, itm);
1762 /* Now look for reduction and conflicts */
1763 for (j = 0; j < is->items.cnt; j++) {
1764 unsigned short itm = is->items.syms[j];
1765 int p = item_prod(itm);
1766 int bp = item_index(itm);
1767 struct production *pr = g->productions[p];
1769 if (bp < pr->body_size)
1774 la = g->follow[pr->head->num];
1776 la = set_find(g, is->items.data[j]);
1778 for (k = 0; k < la.cnt; k++) {
1779 int pos = symset_find(&shifts, la.syms[k]);
1781 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1782 prtxt(g->symtab[la.syms[k]]->name);
1784 report_item(g, shifts.data[pos]);
1785 report_item(g, itm);
1788 pos = symset_find(&reduce, la.syms[k]);
1790 symset_add(&reduce, la.syms[k], itm);
1793 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1794 prtxt(g->symtab[la.syms[k]]->name);
1796 report_item(g, itm);
1797 report_item(g, reduce.data[pos]);
1801 symset_free(shifts);
1802 symset_free(reduce);
1808 ## Generating the parser
1810 The exported part of the parser is the `parse_XX` function, where the name
1811 `XX` is based on the name of the parser files.
1813 This takes a `code_node`, a partially initialized `token_config`, and an
1814 optional `FILE` to send tracing to. The `token_config` gets the list of
1815 known words added and then is used with the `code_node` to initialize the
1818 `parse_XX` then calls the library function `parser_run` to actually complete
1819 the parse. This needs the `states` table and function to call the various
1820 pieces of code provided in the grammar file, so they are generated first.
1822 ###### parser_generate
1824 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1830 gen_reduce(f, g, file);
1833 fprintf(f, "#line 0 \"gen_parser\"\n");
1834 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1837 fprintf(f, "\tstruct token_state *tokens;\n");
1838 fprintf(f, "\tconfig->words_marks = known;\n");
1839 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1840 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1841 fprintf(f, "\ttokens = token_open(code, config);\n");
1842 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1843 fprintf(f, "\ttoken_close(tokens);\n");
1844 fprintf(f, "\treturn rv;\n");
1845 fprintf(f, "}\n\n");
1848 ### Known words table
1850 The known words table is simply an array of terminal symbols.
1851 The table of nonterminals used for tracing is a similar array.
1855 static void gen_known(FILE *f, struct grammar *g)
1858 fprintf(f, "#line 0 \"gen_known\"\n");
1859 fprintf(f, "static const char *known[] = {\n");
1860 for (i = TK_reserved;
1861 i < g->num_syms && g->symtab[i]->type == Terminal;
1863 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1864 g->symtab[i]->name.txt);
1865 fprintf(f, "};\n\n");
1868 static void gen_non_term(FILE *f, struct grammar *g)
1871 fprintf(f, "#line 0 \"gen_non_term\"\n");
1872 fprintf(f, "static const char *non_term[] = {\n");
1873 for (i = TK_reserved;
1876 if (g->symtab[i]->type == Nonterminal)
1877 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1878 g->symtab[i]->name.txt);
1879 fprintf(f, "};\n\n");
1882 ### States and the goto tables.
1884 For each state we record the goto table, the reducible production if
1885 there is one, or a symbol to shift for error recovery.
1886 Some of the details of the reducible production are stored in the
1887 `do_reduce` function to come later. Here we store the production number,
1888 the body size (useful for stack management) and the resulting symbol (useful
1889 for knowing how to free data later).
1891 The go to table is stored in a simple array of `sym` and corresponding
1894 ###### exported types
1902 const struct lookup * go_to;
1914 static void gen_goto(FILE *f, struct grammar *g)
1917 fprintf(f, "#line 0 \"gen_goto\"\n");
1918 for (i = 0; i < g->states; i++) {
1920 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1922 struct symset gt = g->statetab[i]->go_to;
1923 for (j = 0; j < gt.cnt; j++)
1924 fprintf(f, "\t{ %d, %d },\n",
1925 gt.syms[j], gt.data[j]);
1932 static void gen_states(FILE *f, struct grammar *g)
1935 fprintf(f, "#line 0 \"gen_states\"\n");
1936 fprintf(f, "static const struct state states[] = {\n");
1937 for (i = 0; i < g->states; i++) {
1938 struct itemset *is = g->statetab[i];
1939 int j, prod = -1, prod_len;
1941 int shift_len = 0, shift_remain = 0;
1942 for (j = 0; j < is->items.cnt; j++) {
1943 int itm = is->items.syms[j];
1944 int p = item_prod(itm);
1945 int bp = item_index(itm);
1946 struct production *pr = g->productions[p];
1948 if (bp < pr->body_size) {
1949 if (shift_sym < 0 ||
1950 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1951 shift_sym = pr->body[bp]->num;
1953 shift_remain = pr->body_size - bp;
1957 /* This is what we reduce */
1958 if (prod < 0 || prod_len < pr->body_size) {
1960 prod_len = pr->body_size;
1965 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d, %d },\n",
1966 i, is->go_to.cnt, i, prod,
1967 g->productions[prod]->body_size,
1968 g->productions[prod]->head->num,
1969 is->starts_line, is->min_prefix);
1971 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d, %d },\n",
1972 i, is->go_to.cnt, i, shift_sym,
1973 is->starts_line, is->min_prefix);
1975 fprintf(f, "};\n\n");
1978 ### The `do_reduce` function and the code
1980 When the parser engine decides to reduce a production, it calls `do_reduce`.
1981 This has two functions.
1983 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1984 production being reduced. Secondly it runs the code that was included with
1985 the production if any.
1987 This code needs to be able to store data somewhere. Rather than requiring
1988 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1989 `do_reduce` return the size to be saved.
1991 In order for the code to access "global" context, we pass in the
1992 "config" pointer that was passed to parser function. If the `struct
1993 token_config` is embedded in some larger structure, the reducing code
1994 can access the larger structure using pointer manipulation.
1996 The code fragment requires translation when written out. Any `$N` needs to
1997 be converted to a reference either to that buffer (if `$0`) or to the
1998 structure returned by a previous reduction. These pointers need to be cast
1999 to the appropriate type for each access. All this is handled in
2002 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
2003 This applies only to symbols with references (or pointers), not those with structures.
2004 The `<` implies that the reference is being moved out, so the object will not be
2005 automatically freed. This is equivalent to assigning `NULL` to the pointer.
2009 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2012 char *used = calloc(1, p->body_size);
2015 fprintf(f, "\t\t\t");
2016 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2030 if (*c < '0' || *c > '9') {
2037 while (c[1] >= '0' && c[1] <= '9') {
2039 n = n * 10 + *c - '0';
2042 fprintf(f, "(*(struct %.*s*%s)ret)",
2043 p->head->struct_name.len,
2044 p->head->struct_name.txt,
2045 p->head->isref ? "*":"");
2046 else if (n > p->body_size)
2047 fprintf(f, "$%d", n);
2048 else if (p->body[n-1]->type == Terminal)
2049 fprintf(f, "(*(struct token *)body[%d])",
2051 else if (p->body[n-1]->struct_name.txt == NULL)
2052 fprintf(f, "$%d", n);
2054 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2055 p->body[n-1]->struct_name.len,
2056 p->body[n-1]->struct_name.txt,
2057 p->body[n-1]->isref ? "*":"", n-1);
2062 for (i = 0; i < p->body_size; i++) {
2063 if (p->body[i]->struct_name.txt &&
2064 p->body[i]->isref &&
2066 // assume this has been copied out
2067 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2074 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2077 fprintf(f, "#line 0 \"gen_reduce\"\n");
2078 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2080 fprintf(f, "\tint ret_size = 0;\n");
2082 fprintf(f, "\tswitch(prod) {\n");
2083 for (i = 0; i < g->production_count; i++) {
2084 struct production *p = g->productions[i];
2085 fprintf(f, "\tcase %d:\n", i);
2090 if (p->head->struct_name.txt)
2091 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2092 p->head->struct_name.len,
2093 p->head->struct_name.txt,
2094 p->head->isref ? "*":"");
2096 fprintf(f, "\t\tbreak;\n");
2098 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2103 As each non-terminal can potentially cause a different type of data
2104 structure to be allocated and filled in, we need to be able to free it when
2107 It is particularly important to have fine control over freeing during error
2108 recovery where individual stack frames might need to be freed.
2110 For this, the grammar author is required to define a `free_XX` function for
2111 each structure that is used by a non-terminal. `do_free` will call whichever
2112 is appropriate given a symbol number, and will call `free` (as is
2113 appropriate for tokens) on any terminal symbol.
2117 static void gen_free(FILE *f, struct grammar *g)
2121 fprintf(f, "#line 0 \"gen_free\"\n");
2122 fprintf(f, "static void do_free(short sym, void *asn)\n");
2124 fprintf(f, "\tif (!asn) return;\n");
2125 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2126 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2127 fprintf(f, "\tswitch(sym) {\n");
2129 for (i = 0; i < g->num_syms; i++) {
2130 struct symbol *s = g->symtab[i];
2132 s->type != Nonterminal ||
2133 s->struct_name.txt == NULL)
2136 fprintf(f, "\tcase %d:\n", s->num);
2138 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2140 s->struct_name.txt);
2141 fprintf(f, "\t\tfree(asn);\n");
2143 fprintf(f, "\t\tfree_%.*s(asn);\n",
2145 s->struct_name.txt);
2146 fprintf(f, "\t\tbreak;\n");
2148 fprintf(f, "\t}\n}\n\n");
2151 ## The main routine.
2153 There are three key parts to the "main" function of parsergen: processing
2154 the arguments, loading the grammar file, and dealing with the grammar.
2156 ### Argument processing.
2158 Command line options allow the selection of analysis mode, name of output
2159 file, and whether or not a report should be generated. By default we create
2160 a report only if no code output was requested.
2162 The `parse_XX` function name uses the basename of the output file which
2163 should not have a suffix (`.c`). `.c` is added to the given name for the
2164 code, and `.h` is added for the header (if header text is specified in the
2171 static const struct option long_options[] = {
2172 { "LR0", 0, NULL, '0' },
2173 { "LR05", 0, NULL, '5' },
2174 { "SLR", 0, NULL, 'S' },
2175 { "LALR", 0, NULL, 'L' },
2176 { "LR1", 0, NULL, '1' },
2177 { "tag", 1, NULL, 't' },
2178 { "report", 0, NULL, 'R' },
2179 { "output", 1, NULL, 'o' },
2180 { NULL, 0, NULL, 0 }
2182 const char *options = "05SL1t:Ro:";
2184 ###### process arguments
2186 char *outfile = NULL;
2191 enum grammar_type type = LR05;
2192 while ((opt = getopt_long(argc, argv, options,
2193 long_options, NULL)) != -1) {
2208 outfile = optarg; break;
2210 tag = optarg; break;
2212 fprintf(stderr, "Usage: parsergen ...\n");
2217 infile = argv[optind++];
2219 fprintf(stderr, "No input file given\n");
2222 if (outfile && report == 1)
2225 if (name && strchr(name, '/'))
2226 name = strrchr(name, '/')+1;
2228 if (optind < argc) {
2229 fprintf(stderr, "Excess command line arguments\n");
2233 ### Loading the grammar file
2235 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2238 Once we have extracted the code (with `mdcode`) we expect to find three
2239 sections: header, code, and grammar. Anything else that is not
2240 excluded by the `--tag` option is an error.
2242 "header" and "code" are optional, though it is hard to build a working
2243 parser with neither. "grammar" must be provided.
2247 #include <sys/mman.h>
2252 static void pr_err(char *msg)
2255 fprintf(stderr, "%s\n", msg);
2259 struct section *table;
2263 fd = open(infile, O_RDONLY);
2265 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2266 infile, strerror(errno));
2269 len = lseek(fd, 0, 2);
2270 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2271 table = code_extract(file, file+len, pr_err);
2273 struct code_node *hdr = NULL;
2274 struct code_node *code = NULL;
2275 struct code_node *gram = NULL;
2276 for (s = table; s; s = s->next) {
2277 struct text sec = s->section;
2278 if (tag && !strip_tag(&sec, tag))
2280 if (text_is(sec, "header"))
2282 else if (text_is(sec, "code"))
2284 else if (text_is(sec, "grammar"))
2287 fprintf(stderr, "Unknown content section: %.*s\n",
2288 s->section.len, s->section.txt);
2293 ### Processing the input
2295 As we need to append an extension to a filename and then open it for
2296 writing, and we need to do this twice, it helps to have a separate function.
2300 static FILE *open_ext(char *base, char *ext)
2302 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2304 strcat(strcpy(fn, base), ext);
2310 If we can read the grammar, then we analyse and optionally report on it. If
2311 the report finds conflicts we will exit with an error status.
2313 ###### process input
2314 struct grammar *g = NULL;
2316 fprintf(stderr, "No grammar section provided\n");
2319 g = grammar_read(gram);
2321 fprintf(stderr, "Failure to parse grammar\n");
2326 grammar_analyse(g, type);
2328 if (grammar_report(g, type))
2332 If a "headers" section is defined, we write it out and include a declaration
2333 for the `parse_XX` function so it can be used from separate code.
2335 if (rv == 0 && hdr && outfile) {
2336 FILE *f = open_ext(outfile, ".h");
2338 code_node_print(f, hdr, infile);
2339 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2343 fprintf(stderr, "Cannot create %s.h\n",
2349 And if all goes well and an output file was provided, we create the `.c`
2350 file with the code section (if any) and the parser tables and function.
2352 if (rv == 0 && outfile) {
2353 FILE *f = open_ext(outfile, ".c");
2356 code_node_print(f, code, infile);
2357 gen_parser(f, g, infile, name);
2360 fprintf(stderr, "Cannot create %s.c\n",
2366 And that about wraps it up. We need to set the locale so that UTF-8 is
2367 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2369 ###### File: parsergen.mk
2370 parsergen : parsergen.o libscanner.o libmdcode.o
2371 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2378 int main(int argc, char *argv[])
2383 setlocale(LC_ALL,"");
2385 ## process arguments
2392 ## The SHIFT/REDUCE parser
2394 Having analysed the grammar and generated all the tables, we only need the
2395 shift/reduce engine to bring it all together.
2397 ### Goto table lookup
2399 The parser generator has nicely provided us with goto tables sorted by
2400 symbol number. We need a binary search function to find a symbol in the
2403 ###### parser functions
2405 static int search(const struct state *l, int sym)
2408 int hi = l->go_to_cnt;
2412 while (lo + 1 < hi) {
2413 int mid = (lo + hi) / 2;
2414 if (l->go_to[mid].sym <= sym)
2419 if (l->go_to[lo].sym == sym)
2420 return l->go_to[lo].state;
2425 ### The state stack.
2427 The core data structure for the parser is the stack. This tracks all the
2428 symbols that have been recognised or partially recognised.
2430 The stack usually won't grow very large - maybe a few tens of entries. So
2431 we dynamically resize an array as required but never bother to shrink it
2434 We keep the stack as two separate allocations. One, `asn_stack` stores the
2435 "abstract syntax nodes" which are created by each reduction. When we call
2436 `do_reduce` we need to pass an array of the `asn`s of the body of the
2437 production, and by keeping a separate `asn` stack, we can just pass a
2438 pointer into this stack.
2440 The other allocation stores all other stack fields of which there are six.
2441 The `state` is the most important one and guides the parsing process. The
2442 `sym` is nearly unnecessary. However when we want to free entries from the
2443 `asn_stack`, it helps to know what type they are so we can call the right
2444 freeing function. The symbol leads us to the right free function through
2447 The `indents` count tracks the line indents within the symbol or
2448 immediately following it. These are used to allow indent information to
2449 guide parsing and error recovery.
2451 `since_newline` tracks how many stack frames since the last
2452 start-of-line (whether indented or not). So if `since_newline` is
2453 zero, then this symbol is at the start of a line. Similarly
2454 `since_indent` counts the number of states since an indent, it is zero
2455 precisely when `indents` is not zero.
2457 `newline_permitted` keeps track of whether newlines should be ignored
2460 The stack is most properly seen as alternating states and symbols -
2461 states, like the 'DOT' in items, are between symbols. Each frame in
2462 our stack holds a state and the symbol that was before it. The
2463 bottom of stack holds the start state but no symbol, as nothing came
2464 before the beginning.
2466 ###### parser functions
2471 short newline_permitted;
2475 short since_newline;
2485 Two operations are needed on the stack - shift (which is like push) and pop.
2487 Shift applies not only to terminals but also to non-terminals. When
2488 we reduce a production we will pop off entries corresponding to the
2489 body symbols, then push on an item for the head of the production.
2490 This last is exactly the same process as shifting in a terminal so we
2491 use the same function for both. In both cases we provide the symbol,
2492 the number of indents the symbol contains (which will be zero for a
2493 terminal symbol) and a flag indicating that the symbol was at (or was
2494 reduced from a symbol which was at) the start of a line. The state is
2495 deduced from the current top-of-stack state and the new symbol.
2497 To simplify other code we arrange for `shift` to fail if there is no `goto`
2498 state for the symbol. This is useful in basic parsing due to our design
2499 that we shift when we can, and reduce when we cannot. So the `shift`
2500 function reports if it could.
2502 `shift` is also used to push state zero onto the stack, so if the
2503 stack is empty, it always chooses zero as the next state.
2505 So `shift` finds the next state. If that succeeds it extends the
2506 allocations if needed and pushes all the information onto the stacks.
2508 Newlines are permitted after a `starts_line` state until an internal
2509 indent. If the new frame has neither a `starts_line` state nor an
2510 indent, newlines are permitted if the previous stack frame permitted
2513 ###### parser functions
2515 static int shift(struct parser *p,
2516 short sym, short indents, short start_of_line,
2518 const struct state states[])
2520 // Push an entry onto the stack
2521 struct frame next = {0};
2522 int newstate = p->tos
2523 ? search(&states[p->stack[p->tos-1].state],
2528 if (p->tos >= p->stack_size) {
2529 p->stack_size += 10;
2530 p->stack = realloc(p->stack, p->stack_size
2531 * sizeof(p->stack[0]));
2532 p->asn_stack = realloc(p->asn_stack, p->stack_size
2533 * sizeof(p->asn_stack[0]));
2536 next.indents = indents;
2537 next.state = newstate;
2538 if (states[newstate].starts_line)
2539 next.newline_permitted = 1;
2541 next.newline_permitted = 0;
2543 next.newline_permitted =
2544 p->stack[p->tos-1].newline_permitted;
2546 next.newline_permitted = 0;
2548 if (!start_of_line) {
2550 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2552 next.since_newline = 1;
2555 next.since_indent = 0;
2557 next.since_indent = p->stack[p->tos-1].since_indent + 1;
2559 next.since_indent = 1;
2561 p->stack[p->tos] = next;
2562 p->asn_stack[p->tos] = asn;
2567 `pop` primarily moves the top of stack (`tos`) back down the required
2568 amount and frees any `asn` entries that need to be freed. It also
2569 collects a summary of the indents and line starts in the symbols that
2570 are being removed. It is called _after_ we reduce a production, just
2571 before we `shift` the nonterminal in.
2573 ###### parser functions
2575 static int pop(struct parser *p, int num,
2576 short *start_of_line,
2577 void(*do_free)(short sym, void *asn))
2584 for (i = 0; i < num; i++) {
2585 sol |= !p->stack[p->tos+i].since_newline;
2586 indents += p->stack[p->tos+i].indents;
2587 do_free(p->stack[p->tos+i].sym,
2588 p->asn_stack[p->tos+i]);
2591 *start_of_line = sol;
2595 ### Memory allocation
2597 The `scanner` returns tokens in a local variable - we want them in allocated
2598 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2599 a reduce is in a large buffer. Both of these require some allocation and
2600 copying, hence `memdup` and `tokcopy`.
2602 ###### parser includes
2605 ###### parser functions
2607 void *memdup(void *m, int len)
2613 memcpy(ret, m, len);
2617 static struct token *tok_copy(struct token tk)
2619 struct token *new = malloc(sizeof(*new));
2624 ### The heart of the parser.
2626 Now we have the parser. If we can shift we do, though newlines and
2627 reducing indenting may block that. If not and we can reduce we do
2628 that. If the production we reduced was production zero, then we have
2629 accepted the input and can finish.
2631 We return whatever `asn` was returned by reducing production zero.
2633 If we can neither shift nor reduce we have an error to handle. We pop
2634 single entries off the stack until we can shift the `TK_error` symbol, then
2635 drop input tokens until we find one we can shift into the new error state.
2637 When we find `TK_in` and `TK_out` tokens which report indents we need
2638 to handle them directly as the grammar cannot express what we want to
2641 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2642 record how many indents there are following the previous token.
2644 `TK_out` tokens must be cancelled against an indent count
2645 within the stack. If we can reduce some symbols that are all since
2646 the most recent indent, then we do that first. If the minimum prefix
2647 of the current state then extends back before the most recent indent,
2648 that indent can be cancelled. If the minimum prefix is shorter, then
2649 the indent is premature and we must start error handling, which
2650 currently doesn't work at all.
2652 `TK_newline` tokens are ignored unless the top stack frame records
2653 that they are permitted. In that case they will not be considered for
2654 shifting if it is possible to reduce some symbols that are all since
2655 the most recent start of line. This is how a newline forcibly
2656 terminates any line-like structure - we try to reduce down to at most
2657 one symbol for each line where newlines are allowed.
2659 ###### parser includes
/* parser_run() - the heart of the parser: the main LR parse loop.
 * Repeatedly: give TK_in/TK_out/TK_newline their special handling,
 * try to SHIFT the look-ahead token, otherwise REDUCE by the state's
 * forced production, otherwise enter error recovery (pop until
 * TK_error can shift, then discard input).  Returns the asn produced
 * by reducing production zero (accept).
 * NOTE(review): many lines are elided in this extract (gaps in the
 * embedded numbering) — the loop structure, several else-branches,
 * `continue`/`break` statements and some declarations are not
 * visible here; the inline comments below only describe what the
 * visible lines establish. */
2662 void *parser_run(struct token_state *tokens,
2663 const struct state states[],
2664 int (*do_reduce)(int, void**, struct token_config*, void*),
2665 void (*do_free)(short, void*),
2666 FILE *trace, const char *non_term[],
2667 struct token_config *config)
2669 struct parser p = { 0 };
2670 struct token *tk = NULL;
/* prime the stack with an initial frame */
2674 shift(&p, TK_eof, 0, 1, NULL, states);
2676 struct token *err_tk;
2677 struct frame *tos = &p.stack[p.tos-1];
2679 tk = tok_copy(token_next(tokens));
2680 parser_trace(trace, &p,
2681 tk, states, non_term, config->known_count);
/* TK_in: record the indent on the top frame, reset tracking */
2683 if (tk->num == TK_in) {
2685 tos->since_newline = 0;
2686 tos->since_indent = 0;
2687 if (!states[tos->state].starts_line)
2688 tos->newline_permitted = 0;
2691 parser_trace_action(trace, "Record");
/* TK_out: try to cancel against an indent recorded in the stack */
2694 if (tk->num == TK_out) {
2695 if (states[tos->state].reduce_size >= 0 &&
2696 states[tos->state].reduce_size <= tos->since_indent)
2698 if (states[tos->state].min_prefix >= tos->since_indent) {
/* the frame that recorded the indent being cancelled */
2700 struct frame *in = tos - tos->since_indent;
2702 if (in->indents == 0) {
2703 /* Reassess since_indent and newline_permitted */
2705 in->since_indent = in[-1].since_indent + 1;
2706 in->newline_permitted = in[-1].newline_permitted;
2708 in->since_indent = 0;
2709 in->newline_permitted = 0;
2711 if (states[in->state].starts_line)
2712 in->newline_permitted = 1;
2715 in->since_indent = in[-1].since_indent + 1;
2716 if (states[in->state].starts_line)
2717 in->newline_permitted = 1;
2719 in->newline_permitted = in[-1].newline_permitted;
2724 parser_trace_action(trace, "Cancel");
2727 // fall through and force a REDUCE (as 'shift'
/* TK_newline: discard unless permitted; defer while reducible */
2730 if (tk->num == TK_newline) {
2731 if (!tos->newline_permitted) {
2734 parser_trace_action(trace, "Discard");
2737 if (tos->since_newline > 1 &&
2738 states[tos->state].reduce_size >= 0 &&
2739 states[tos->state].reduce_size <= tos->since_newline)
/* normal SHIFT of the look-ahead token */
2742 if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2744 parser_trace_action(trace, "Shift");
/* cannot shift: REDUCE by the state's forced production, if any */
2748 if (states[tos->state].reduce_prod >= 0) {
2751 const struct state *nextstate = &states[tos->state];
2752 int prod = nextstate->reduce_prod;
2753 int size = nextstate->reduce_size;
2755 static char buf[16*1024];
2756 short indents, start_of_line;
/* the `size` asn pointers being reduced */
2758 body = p.asn_stack + (p.tos - size);
2760 bufsize = do_reduce(prod, body, config, buf);
2762 indents = pop(&p, size, &start_of_line,
2764 res = memdup(buf, bufsize);
2765 memset(buf, 0, bufsize);
2766 if (!shift(&p, nextstate->reduce_sym,
2767 indents, start_of_line,
/* only production zero (accept) may legitimately fail to shift */
2769 if (prod != 0) abort();
2773 parser_trace_action(trace, "Reduce");
2776 if (tk->num == TK_out) {
2777 // Indent problem - synthesise tokens to get us
2779 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[tos->state].shift_sym);
2780 shift(&p, states[tos->state].shift_sym,
2781 0, 1, tok_copy(*tk), states);
2782 // FIXME need to report this error somehow
2783 parser_trace_action(trace, "Synthesize");
2786 /* Error. We walk up the stack until we
2787 * find a state which will accept TK_error.
2788 * We then shift in TK_error and see what state
2789 * that takes us to.
2790 * Then we discard input tokens until
2791 * we find one that is acceptable.
2793 parser_trace_action(trace, "ERROR");
2794 short indents = 0, start_of_line;
2796 err_tk = tok_copy(*tk);
2797 while (shift(&p, TK_error, 0, 0,
2798 err_tk, states) == 0
2800 // discard this state
2801 indents += pop(&p, 1, &start_of_line, do_free);
2804 // no state accepted TK_error
/* discard input until a token acceptable in the error state */
2807 tos = &p.stack[p.tos-1];
2808 while (search(&states[tos->state], tk->num) < 0 &&
2809 tk->num != TK_eof) {
2811 tk = tok_copy(token_next(tokens));
2812 if (tk->num == TK_in)
2814 if (tk->num == TK_out) {
2818 // FIXME update since_indent here
2821 if (p.tos == 0 && tk->num == TK_eof)
2823 tos = &p.stack[p.tos-1];
2824 tos->indents += indents;
/* final cleanup: free anything still on the stack */
2828 pop(&p, p.tos, NULL, do_free);
2834 ###### exported functions
2835 void *parser_run(struct token_state *tokens,
2836 const struct state states[],
2837 int (*do_reduce)(int, void**, struct token_config*, void*),
2838 void (*do_free)(short, void*),
2839 FILE *trace, const char *non_term[],
2840 struct token_config *config);
2844 Being able to visualize the parser in action can be invaluable when
2845 debugging the parser code, or trying to understand how the parse of a
2846 particular grammar progresses. The stack contains all the important
2847 state, so just printing out the stack every time around the parse loop
2848 can make it possible to see exactly what is happening.
2850 This doesn't explicitly show each SHIFT and REDUCE action. However they
2851 are easily deduced from the change between consecutive lines, and the
2852 details of each state can be found by cross referencing the states list
2853 in the stack with the "report" that parsergen can generate.
2855 For terminal symbols, we just dump the token. For non-terminals we
2856 print the name of the symbol. The look ahead token is reported at the
2857 end inside square brackets.
2859 ###### parser functions
/* Printable names for reserved token numbers, used by the tracer.
 * NOTE(review): the entries between TK_error and TK_newline are
 * elided from this extract — presumably TK_in/TK_out; confirm in the
 * full source. */
2861 static char *reserved_words[] = {
2862 [TK_error] = "ERROR",
2865 [TK_newline] = "NEWLINE",
/* Print one stack frame's state compactly: "(<state>[s][n<k>]) ",
 * where "s" marks a state that starts a line and "n<k>" shows the
 * symbols-since-newline count when newlines are permitted.
 * (Opening/closing braces are elided in this extract.) */
2868 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2870 fprintf(trace, "(%d", f->state);
2871 if (states[f->state].starts_line)
2872 fprintf(trace, "s");
2873 if (f->newline_permitted)
2874 fprintf(trace, "n%d", f->since_newline);
2875 fprintf(trace, ") ");
/* Dump the whole parse stack, one symbol per frame, followed by the
 * look-ahead token in square brackets.  Reserved tokens print their
 * fixed names, known terminals dump their text (up to 20 chars), and
 * non-terminals print their symbol name from `non_term[]`.
 * NOTE(review): several lines are elided here (the NULL-trace guard,
 * the `sym` assignment, separators and closing braces). */
2878 void parser_trace(FILE *trace, struct parser *p,
2879 struct token *tk, const struct state states[],
2880 const char *non_term[], int knowns)
2885 for (i = 0; i < p->tos; i++) {
2886 struct frame *f = &p->stack[i];
2889 if (sym < TK_reserved &&
2890 reserved_words[sym] != NULL)
2891 fputs(reserved_words[sym], trace);
2892 else if (sym < TK_reserved + knowns) {
/* known terminal: the asn on the stack is the token itself */
2893 struct token *t = p->asn_stack[i];
2894 text_dump(trace, t->txt, 20);
2896 fputs(non_term[sym - TK_reserved - knowns],
/* ".<n>" shows the indent count carried by this frame */
2899 fprintf(trace, ".%d", f->indents);
2900 if (f->since_newline == 0)
2904 parser_trace_state(trace, f, states);
/* look-ahead token, reported inside square brackets */
2906 fprintf(trace, "[");
2907 if (tk->num < TK_reserved &&
2908 reserved_words[tk->num] != NULL)
2909 fputs(reserved_words[tk->num], trace);
2911 text_dump(trace, tk->txt, 20);
/* Append the action name ("Shift", "Reduce", ...) that ends the
 * current trace line.  NOTE(review): the elided lines presumably
 * guard against a NULL `trace` — confirm in the full source. */
2915 void parser_trace_action(FILE *trace, char *action)
2918 fprintf(trace, " - %s\n", action);
2923 The obvious example for a parser is a calculator.
2925 As `scanner` provides a number parsing function using `libgmp`, it is not much
2926 work to perform arbitrary rational number calculations.
2928 This calculator takes one expression, or an equality test, per line. The
2929 results are printed and if any equality test fails, the program exits with
2932 ###### File: parsergen.mk
2933 calc.c calc.h : parsergen parsergen.mdc
2934 ./parsergen --tag calc -o calc parsergen.mdc
2935 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2936 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2941 // what do we use for a demo-grammar? A calculator of course.
2953 #include <sys/mman.h>
2958 #include "scanner.h"
/* NOTE(review): body elided in this extract; presumably releases the
 * gmp value held in `n` — confirm in the full source. */
2964 static void free_number(struct number *n)
/* Demo driver: map the file named in argv[1], extract its literate
 * code sections with mdcode, and run the calculator parser over the
 * "calc" grammar's code section.
 * NOTE(review): open/lseek/mmap results are unchecked and argv[1] is
 * used without an argc check — tolerable for a demo, but a missing
 * argument will crash.  Several lines are elided in this extract. */
2970 int main(int argc, char *argv[])
2972 int fd = open(argv[1], O_RDONLY);
/* lseek(fd, 0, 2): whence 2 is SEEK_END, i.e. the file length */
2973 int len = lseek(fd, 0, 2);
2974 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2975 struct section *s = code_extract(file, file+len, NULL);
2976 struct token_config config = {
2977 .ignored = (1 << TK_line_comment)
2978 | (1 << TK_block_comment)
2981 .number_chars = ".,_+-",
/* any extra command-line argument turns on tracing to stderr */
2985 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2987 struct section *t = s->next;
2997 Session -> Session Line
3000 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3001 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3002 gmp_printf(" or as a decimal: %Fg\n", fl);
3006 | Expression = Expression NEWLINE ${
3007 if (mpq_equal($1.val, $3.val))
3008 gmp_printf("Both equal %Qd\n", $1.val);
3010 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3015 | NEWLINE ${ printf("Blank line\n"); }$
3016 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3019 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3020 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3021 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
3023 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3024 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3025 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
3027 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3028 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$