ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 There are several distinct sections.
   8
   9 1. `grammar_read` will read a grammar from a literate-code file and
  10    store the productions, symbols, and code fragments.
  11 2. `grammar_analyse` will take that grammar and build LR parsing
  12    states and look-ahead tables.
  13 3. `grammar_report` will present the details of the analysis
  14    in a readable format and will report any conflicts.
  15 4. `parser_generate` will write out C code files with various
  16    tables and with the code fragments arranged in useful places.
  17 5. `parser_run` is a library function intended to be linked together
  18    with the generated parser tables to complete the implementation of
  19    a parser.
  20 6. Finally `calc` is a test grammar for a simple calculator.  The
  21    `parsergen` program built from the C code in this file can extract
  22    that grammar directly from this file and process it.
  23
  24
  25 ###### File: parsergen.c
  26         #include <unistd.h>
  27         #include <stdlib.h>
  28         #include <stdio.h>
  29         ## includes
  30         ## forward declarations
  31         ## declarations
  32         ## functions
  33         ## grammar_read
  34         ## grammar_analyse
  35         ## grammar_report
  36         ## parser_generate
  37         ## main
  38 ###### File: parser.h
  39         ## exported types
  40         ## exported functions
  41 ###### File: libparser.c
  42         #include <unistd.h>
  43         #include <stdlib.h>
  44         #include <stdio.h>
  45         ## parser includes
  46         ## parser functions
  47         ## parser_run
  48 ###### File: parsergen.mk
  49         CFLAGS += -Wall -g
  50         all :: parsergen calc
  51         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  52                 ./md2c parsergen.mdc
  53
  54 ## Reading the grammar
  55
  56 The grammar must be stored in a markdown literate code file as parsed
  57 by "[mdcode][]".  It must have three top level (i.e. unreferenced)
  58 sections: `header`, `code`, and `grammar`.  The first two will be
  59 literally copied into the generated `.c` and `.h`. files.  The last
  60 contains the grammar.  This is tokenised with "[scanner][]".
  61
  62 If the `--tag` option is given, then any top level header that doesn't
  63 start with the tag is ignored, and the tag is striped from the rest.  So
  64 `--tag Foo`
  65 means that the three needed sections must be `Foo: header`, `Foo: code`,
  66 and `Foo: grammar`.
  67
  68 [mdcode]: mdcode.html
  69 [scanner]: scanner.html
  70
  71 ###### includes
  72         #include "mdcode.h"
  73         #include "scanner.h"
  74
  75 ###### parser includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 The grammar contains production sets, precedence/associativity
  80 declarations, and data type declarations.  These are all parsed with
  81 _ad hoc_ parsing as we don't have a parser generator yet.
  82
  83 The precedence and associativity can be set for each production, but
  84 can be inherited from symbols.  The data type (either structure or a
  85 reference to a structure) is potentially defined for each non-terminal
  86 and describes what C structure is used to store information about each
  87 symbol.
  88
  89 ###### declarations
  90         enum assoc {Left, Right, Non};
  91         char *assoc_names[] = {"Left","Right","Non"};
  92
  93         struct symbol {
  94                 struct text struct_name;
  95                 int isref;
  96                 enum assoc assoc;
  97                 unsigned short precedence;
  98                 ## symbol fields
  99         };
 100         struct production {
 101                 unsigned short precedence;
 102                 enum assoc assoc;
 103                 ## production fields
 104         };
 105         struct grammar {
 106                 ## grammar fields
 107         };
 108
 109 The strings reported by `mdcode` and `scanner` are `struct text` which have
 110 length rather than being null terminated.  To help with printing and
 111 comparing we define `text_is` and `prtxt`, which should possibly go in
 112 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 113 which might contain control characters.
 114
 115 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 116 because that is what we need to detect tags.
 117
 118 ###### functions
 119         static int text_is(struct text t, char *s)
 120         {
 121                 return (strlen(s) == t.len &&
 122                         strncmp(s, t.txt, t.len) == 0);
 123         }
 124         static void prtxt(struct text t)
 125         {
 126                 printf("%.*s", t.len, t.txt);
 127         }
 128
 129         static int strip_tag(struct text *t, char *tag)
 130         {
 131                 int skip = strlen(tag) + 1;
 132                 if (skip >= t->len ||
 133                     strncmp(t->txt, tag, skip-1) != 0 ||
 134                     t->txt[skip-1] != ':')
 135                         return 0;
 136                 while (skip < t->len && t->txt[skip] == ' ')
 137                         skip++;
 138                 t->len -= skip;
 139                 t->txt += skip;
 140                 return 1;
 141         }
 142
 143 ### Symbols
 144
 145 Productions are comprised primarily of symbols - terminal and
 146 non-terminal.  We do not make a syntactic distinction between the two,
 147 though non-terminals must be identifiers.  Non-terminal symbols are
 148 those which appear in the head of a production, terminal symbols are
 149 those which don't.  There are also "virtual" symbols used for precedence
 150 marking discussed later, and sometimes we won't know what type a symbol
 151 is yet.
 152
 153 ###### forward declarations
 154         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 155         char *symtypes = "UVTN";
 156 ###### symbol fields
 157         enum symtype type;
 158
 159 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 160 table of known symbols and the resulting parser will report them as
 161 `TK_reserved + N`.  A small set of identifiers are reserved for the
 162 different token types that `scanner` can report.
 163
 164 ###### declarations
 165         static char *reserved_words[] = {
 166                 [TK_error]        = "ERROR",
 167                 [TK_number]       = "NUMBER",
 168                 [TK_ident]        = "IDENTIFIER",
 169                 [TK_mark]         = "MARK",
 170                 [TK_string]       = "STRING",
 171                 [TK_multi_string] = "MULTI_STRING",
 172                 [TK_in]           = "IN",
 173                 [TK_out]          = "OUT",
 174                 [TK_newline]      = "NEWLINE",
 175                 [TK_eof]          = "$eof",
 176         };
 177 ###### symbol fields
 178         short num;
 179
 180 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 181 recognised.  The former is automatically expected at the end of the text
 182 being parsed. The latter are always ignored by the parser.
 183
 184 All of these symbols are stored in a simple symbol table.  We use the
 185 `struct text` from `mdcode` to store the name and link them together
 186 into a sorted list using an insertion sort.
 187
 188 We don't have separate `find` and `insert` functions as any symbol we
 189 find needs to be remembered.  We simply expect `find` to always return a
 190 symbol, but its type might be `Unknown`.
 191
 192 ###### includes
 193         #include <string.h>
 194
 195 ###### symbol fields
 196         struct text name;
 197         struct symbol *next;
 198
 199 ###### grammar fields
 200         struct symbol *syms;
 201         int num_syms;
 202
 203 ###### functions
 204         static int text_cmp(struct text a, struct text b)
 205         {
 206                 int len = a.len;
 207                 if (a.len > b.len)
 208                         len = b.len;
 209                 int cmp = strncmp(a.txt, b.txt, len);
 210                 if (cmp)
 211                         return cmp;
 212                 else
 213                         return a.len - b.len;
 214         }
 215
 216         static struct symbol *sym_find(struct grammar *g, struct text s)
 217         {
 218                 struct symbol **l = &g->syms;
 219                 struct symbol *n;
 220                 int cmp = 1;
 221
 222                 while (*l &&
 223                         (cmp = text_cmp((*l)->name, s)) < 0)
 224                                 l = & (*l)->next;
 225                 if (cmp == 0)
 226                         return *l;
 227                 n = calloc(1, sizeof(*n));
 228                 n->name = s;
 229                 n->type = Unknown;
 230                 n->next = *l;
 231                 n->num = -1;
 232                 *l = n;
 233                 return n;
 234         }
 235
 236         static void symbols_init(struct grammar *g)
 237         {
 238                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 239                 int i;
 240                 for (i = 0; i < entries; i++) {
 241                         struct text t;
 242                         struct symbol *s;
 243                         t.txt = reserved_words[i];
 244                         if (!t.txt)
 245                                 continue;
 246                         t.len = strlen(t.txt);
 247                         s = sym_find(g, t);
 248                         s->type = Terminal;
 249                         s->num = i;
 250                 }
 251         }
 252
 253 ### Data types and precedence.
 254
 255 Data type specification and precedence specification are both
 256 introduced by a dollar sign at the start of the line.  If the next
 257 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 258 precedence, otherwise it specifies a data type.
 259
 260 The data type name is simply stored and applied to the head of all
 261 subsequent productions.  It must be the name of a structure optionally
 262 preceded by an asterisk which means a reference or "pointer".  So
 263 `$expression` maps to `struct expression` and `$*statement` maps to
 264 `struct statement *`.
 265
 266 Any productions given before the first data type declaration will have
 267 no data type associated with them and can carry no information.  In
 268 order to allow other non-terminals to have no type, the data type
 269 `$void` can be given.  This does *not* mean that `struct void` will be
 270 used, but rather than no type will be associated with future
 271 non-terminals.
 272
 273 The precedence line must contain a list of symbols - typically
 274 terminal symbols, but not necessarily.  It can only contain symbols
 275 that have not been seen yet, so precedence declaration must precede
 276 the productions that they affect.
 277
 278 A precedence line may also contain a "virtual" symbol which is an
 279 identifier preceded by `$$`.  Virtual terminals carry precedence
 280 information but are not included in the grammar.  A production can
 281 declare that it inherits the precedence of a given virtual symbol.
 282
 283 This use for `$$` precludes it from being used as a symbol in the
 284 described language.  Two other symbols: `${` and `}$` are also
 285 unavailable.
 286
 287 Each new precedence line introduces a new precedence level and
 288 declares how it associates.  This level is stored in each symbol
 289 listed and may be inherited by any production which uses the symbol.  A
 290 production inherits from the last symbol which has a precedence.
 291
 292 ###### grammar fields
 293         struct text current_type;
 294         int type_isref;
 295         int prec_levels;
 296
 297 ###### declarations
 298         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 299         static const char *known[] = { "$$", "${", "}$" };
 300
 301 ###### functions
 302         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 303         {
 304                 struct token t = token_next(ts);
 305                 char *err;
 306                 enum assoc assoc;
 307                 int found;
 308
 309                 if (t.num != TK_ident) {
 310                         err = "type or assoc expected after '$'";
 311                         goto abort;
 312                 }
 313                 if (text_is(t.txt, "LEFT"))
 314                         assoc = Left;
 315                 else if (text_is(t.txt, "RIGHT"))
 316                         assoc = Right;
 317                 else if (text_is(t.txt, "NON"))
 318                         assoc = Non;
 319                 else {
 320                         g->current_type = t.txt;
 321                         g->type_isref = isref;
 322                         if (text_is(t.txt, "void"))
 323                                 g->current_type.txt = NULL;
 324                         t = token_next(ts);
 325                         if (t.num != TK_newline) {
 326                                 err = "Extra tokens after type name";
 327                                 goto abort;
 328                         }
 329                         return NULL;
 330                 }
 331
 332                 if (isref) {
 333                         err = "$* cannot be followed by a precedence";
 334                         goto abort;
 335                 }
 336
 337                 // This is a precedence line, need some symbols.
 338                 found = 0;
 339                 g->prec_levels += 1;
 340                 t = token_next(ts);
 341                 while (t.num != TK_newline) {
 342                         enum symtype type = Terminal;
 343                         struct symbol *s;
 344                         if (t.num == TK_virtual) {
 345                                 type = Virtual;
 346                                 t = token_next(ts);
 347                                 if (t.num != TK_ident) {
 348                                         err = "$$ must be followed by a word";
 349                                         goto abort;
 350                                 }
 351                         } else if (t.num != TK_ident &&
 352                                    t.num != TK_mark) {
 353                                 err = "Illegal token in precedence line";
 354                                 goto abort;
 355                         }
 356                         s = sym_find(g, t.txt);
 357                         if (s->type != Unknown) {
 358                                 err = "Symbols in precedence line must not already be known.";
 359                                 goto abort;
 360                         }
 361                         s->type = type;
 362                         s->precedence = g->prec_levels;
 363                         s->assoc = assoc;
 364                         found += 1;
 365                 }
 366                 if (found == 0)
 367                         err = "No symbols given on precedence line";
 368                         goto abort;
 369                 return NULL;
 370         abort:
 371                 while (t.num != TK_newline && t.num != TK_eof)
 372                         t = token_next(ts);
 373                 return err;
 374         }
 375
 376 ### Productions
 377
 378 A production either starts with an identifier which is the head
 379 non-terminal, or a vertical bar (`|`) in which case this production
 380 uses the same head as the previous one.  The identifier must be
 381 followed by a `->` mark.  All productions for a given non-terminal must
 382 be grouped together with the `nonterminal ->` given only once.
 383
 384 After this a (possibly empty) sequence of identifiers and marks form
 385 the body of the production.  A virtual symbol may be given after the
 386 body by preceding it with `$$`.  If a virtual symbol is given, the
 387 precedence of the production is that for the virtual symbol.  If none
 388 is given, the precedence is inherited from the last symbol in the
 389 production which has a precedence specified.
 390
 391 After the optional precedence may come the `${` mark.  This indicates
 392 the start of a code fragment.  If present, this must be on the same
 393 line as the start of the production.
 394
 395 All of the text from the `${` through to the matching `}$` is
 396 collected and forms the code-fragment for the production.  It must all
 397 be in one `code_node` of the literate code.  The `}$` must be
 398 at the end of a line.
 399
 400 Text in the code fragment will undergo substitutions where `$N` for
 401 some numeric `N` will be replaced with a variable holding the parse
 402 information for the particular symbol in the production.  `$0` is the
 403 head of the production, `$1` is the first symbol of the body, etc.
 404 The type of `$N` for a terminal symbol is `struct token`.  For
 405 a non-terminal, it is whatever has been declared for that symbol.
 406
 407 While building productions we will need to add to an array which needs to
 408 grow dynamically.
 409
 410 ###### functions
 411         static void array_add(void *varray, int *cnt, void *new)
 412         {
 413                 void ***array = varray;
 414                 int current = 0;
 415                 const int step = 8;
 416                 current = ((*cnt-1) | (step-1))+1;
 417                 if (*cnt == current) {
 418                         /* must grow */
 419                         current += step;
 420                         *array = realloc(*array, current * sizeof(void*));
 421                 }
 422                 (*array)[*cnt] = new;
 423                 (*cnt) += 1;
 424         }
 425
 426 Collecting the code fragment simply involves reading tokens until we
 427 hit the end token `}$`, and noting the character position of the start and
 428 the end.
 429
 430 ###### functions
 431         static struct text collect_code(struct token_state *state,
 432                                         struct token start)
 433         {
 434                 struct text code;
 435                 struct token t;
 436                 code.txt = start.txt.txt + start.txt.len;
 437                 do
 438                         t = token_next(state);
 439                 while (t.node == start.node &&
 440                        t.num != TK_close && t.num != TK_error &&
 441                        t.num != TK_eof);
 442                 if (t.num == TK_close && t.node == start.node)
 443                         code.len = t.txt.txt - code.txt;
 444                 else
 445                         code.txt = NULL;
 446                 return code;
 447         }
 448
 449 Now we have all the bit we need to parse a full production.
 450
 451 ###### includes
 452         #include <memory.h>
 453
 454 ###### grammar fields
 455         struct production **productions;
 456         int production_count;
 457
 458 ###### production fields
 459         struct symbol  *head;
 460         struct symbol **body;
 461         int             body_size;
 462         struct text     code;
 463
 464 ###### symbol fields
 465         int first_production;
 466
 467 ###### functions
 468         static char *parse_production(struct grammar *g,
 469                                       struct symbol *head,
 470                                       struct token_state *state)
 471         {
 472                 /* Head has already been parsed. */
 473                 struct token tk;
 474                 char *err;
 475                 struct production p, *pp;
 476
 477                 memset(&p, 0, sizeof(p));
 478                 p.head = head;
 479                 tk = token_next(state);
 480                 while (tk.num == TK_ident || tk.num == TK_mark) {
 481                         struct symbol *bs = sym_find(g, tk.txt);
 482                         if (bs->type == Unknown)
 483                                 bs->type = Terminal;
 484                         if (bs->type == Virtual) {
 485                                 err = "Virtual symbol not permitted in production";
 486                                 goto abort;
 487                         }
 488                         if (bs->precedence) {
 489                                 p.precedence = bs->precedence;
 490                                 p.assoc = bs->assoc;
 491                         }
 492                         array_add(&p.body, &p.body_size, bs);
 493                         tk = token_next(state);
 494                 }
 495                 if (tk.num == TK_virtual) {
 496                         struct symbol *vs;
 497                         tk = token_next(state);
 498                         if (tk.num != TK_ident) {
 499                                 err = "word required after $$";
 500                                 goto abort;
 501                         }
 502                         vs = sym_find(g, tk.txt);
 503                         if (vs->type != Virtual) {
 504                                 err = "symbol after $$ must be virtual";
 505                                 goto abort;
 506                         }
 507                         p.precedence = vs->precedence;
 508                         p.assoc = vs->assoc;
 509                         tk = token_next(state);
 510                 }
 511                 if (tk.num == TK_open) {
 512                         p.code = collect_code(state, tk);
 513                         if (p.code.txt == NULL) {
 514                                 err = "code fragment not closed properly";
 515                                 goto abort;
 516                         }
 517                         tk = token_next(state);
 518                 }
 519                 if (tk.num != TK_newline && tk.num != TK_eof) {
 520                         err = "stray tokens at end of line";
 521                         goto abort;
 522                 }
 523                 pp = malloc(sizeof(*pp));
 524                 *pp = p;
 525                 array_add(&g->productions, &g->production_count, pp);
 526                 return NULL;
 527         abort:
 528                 while (tk.num != TK_newline && tk.num != TK_eof)
 529                         tk = token_next(state);
 530                 return err;
 531         }
 532
 533 With the ability to parse production and dollar-lines, we have nearly all
 534 that we need to parse a grammar from a `code_node`.
 535
 536 The head of the first production will effectively be the `start` symbol of
 537 the grammar.  However it won't _actually_ be so.  Processing the grammar is
 538 greatly simplified if the real start symbol only has a single production,
 539 and expects `$eof` as the final terminal.  So when we find the first
 540 explicit production we insert an extra production as production zero which
 541 looks like
 542
 543 ###### Example: production 0
 544         $start -> START $eof
 545
 546 where `START` is the first non-terminal given.
 547
 548 ###### create production zero
 549         struct production *p = calloc(1,sizeof(*p));
 550         struct text start = {"$start",6};
 551         struct text eof = {"$eof",4};
 552         p->head = sym_find(g, start);
 553         p->head->type = Nonterminal;
 554         array_add(&p->body, &p->body_size, head);
 555         array_add(&p->body, &p->body_size, sym_find(g, eof));
 556         p->head->first_production = g->production_count;
 557         array_add(&g->productions, &g->production_count, p);
 558
 559 Now we are ready to read in the grammar.
 560
 561 ###### grammar_read
 562         static struct grammar *grammar_read(struct code_node *code)
 563         {
 564                 struct token_config conf = {
 565                         .word_start = "",
 566                         .word_cont = "",
 567                         .words_marks = known,
 568                         .known_count = sizeof(known)/sizeof(known[0]),
 569                         .number_chars = "",
 570                         .ignored = (1 << TK_line_comment)
 571                                  | (1 << TK_block_comment)
 572                                  | (0 << TK_number)
 573                                  | (1 << TK_string)
 574                                  | (1 << TK_multi_string)
 575                                  | (1 << TK_in)
 576                                  | (1 << TK_out),
 577                 };
 578
 579                 struct token_state *state = token_open(code, &conf);
 580                 struct token tk;
 581                 struct symbol *head = NULL;
 582                 struct grammar *g;
 583                 char *err = NULL;
 584
 585                 g = calloc(1, sizeof(*g));
 586                 symbols_init(g);
 587
 588                 for (tk = token_next(state); tk.num != TK_eof;
 589                      tk = token_next(state)) {
 590                         if (tk.num == TK_newline)
 591                                 continue;
 592                         if (tk.num == TK_ident) {
 593                                 // new non-terminal
 594                                 head = sym_find(g, tk.txt);
 595                                 if (head->type == Nonterminal)
 596                                         err = "This non-terminal has already be used.";
 597                                 else if (head->type == Virtual)
 598                                         err = "Virtual symbol not permitted in head of production";
 599                                 else {
 600                                         head->type = Nonterminal;
 601                                         head->struct_name = g->current_type;
 602                                         head->isref = g->type_isref;
 603                                         if (g->production_count == 0) {
 604                                                 ## create production zero
 605                                         }
 606                                         head->first_production = g->production_count;
 607                                         tk = token_next(state);
 608                                         if (tk.num == TK_mark &&
 609                                             text_is(tk.txt, "->"))
 610                                                 err = parse_production(g, head, state);
 611                                         else
 612                                                 err = "'->' missing in production";
 613                                 }
 614                         } else if (tk.num == TK_mark
 615                                    && text_is(tk.txt, "|")) {
 616                                 // another production for same non-term
 617                                 if (head)
 618                                         err = parse_production(g, head, state);
 619                                 else
 620                                         err = "First production must have a head";
 621                         } else if (tk.num == TK_mark
 622                                    && text_is(tk.txt, "$")) {
 623                                 err = dollar_line(state, g, 0);
 624                         } else if (tk.num == TK_mark
 625                                    && text_is(tk.txt, "$*")) {
 626                                 err = dollar_line(state, g, 1);
 627                         } else {
 628                                 err = "Unrecognised token at start of line.";
 629                         }
 630                         if (err)
 631                                 goto abort;
 632                 }
 633                 token_close(state);
 634                 return g;
 635         abort:
 636                 fprintf(stderr, "Error at line %d: %s\n",
 637                         tk.line, err);
 638                 token_close(state);
 639                 free(g);
 640                 return NULL;
 641         }
 642
 643 ## Analysing the grammar
 644
 645 The central task in analysing the grammar is to determine a set of
 646 states to drive the parsing process.  This will require finding
 647 various sets of symbols and of "items".  Some of these sets will need
 648 to attach information to each element in the set, so they are more
 649 like maps.
 650
 651 Each "item" may have a 'look-ahead' or `LA` set associated with
 652 it.  Many of these will be the same as each other.  To avoid memory
 653 wastage, and to simplify some comparisons of sets, these sets will be
 654 stored in a list of unique sets, each assigned a number.
 655
 656 Once we have the data structures in hand to manage these sets and
 657 lists, we can start setting the 'nullable' flag, build the 'FIRST'
 658 sets, and then create the item sets which define the various states.
 659
 660 ### Symbol sets.
 661
 662 Though we don't only store symbols in these sets, they are the main
 663 things we store, so they are called symbol sets or "symsets".
 664
 665 A symset has a size, an array of shorts, and an optional array of data
 666 values, which are also shorts.  If the array of data is not present,
 667 we store an impossible pointer, as `NULL` is used to indicate that no
 668 memory has been allocated yet;
 669
 670 ###### declarations
 671         struct symset {
 672                 short cnt;
 673                 unsigned short *syms, *data;
 674         };
 675         #define NO_DATA ((unsigned short *)1)
 676         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 677         const struct symset INIT_DATASET = { 0, NULL, NULL };
 678
 679 The arrays of shorts are allocated in blocks of 8 and are kept sorted
 680 by using an insertion sort.  We don't explicitly record the amount of
 681 allocated space as it can be derived directly from the current `cnt` using
 682 `((cnt - 1) | 7) + 1`.
 683
 684 ###### functions
 685         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 686         {
 687                 int i;
 688                 int current = ((s->cnt-1) | 7) + 1;
 689                 if (current == s->cnt) {
 690                         current += 8;
 691                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 692                         if (s->data != NO_DATA)
 693                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 694                 }
 695                 i = s->cnt;
 696                 while (i > 0 && s->syms[i-1] > key) {
 697                         s->syms[i] = s->syms[i-1];
 698                         if (s->data != NO_DATA)
 699                                 s->data[i] = s->data[i-1];
 700                         i--;
 701                 }
 702                 s->syms[i] = key;
 703                 if (s->data != NO_DATA)
 704                         s->data[i] = val;
 705                 s->cnt += 1;
 706         }
 707
 708 Finding a symbol (or item) in a `symset` uses a simple binary search.
 709 We return the index where the value was found (so data can be accessed),
 710 or `-1` to indicate failure.
 711
 712         static int symset_find(struct symset *ss, unsigned short key)
 713         {
 714                 int lo = 0;
 715                 int hi = ss->cnt;
 716
 717                 if (hi == 0)
 718                         return -1;
 719                 while (lo + 1 < hi) {
 720                         int mid = (lo + hi) / 2;
 721                         if (ss->syms[mid] <= key)
 722                                 lo = mid;
 723                         else
 724                                 hi = mid;
 725                 }
 726                 if (ss->syms[lo] == key)
 727                         return lo;
 728                 return -1;
 729         }
 730
 731 We will often want to form the union of two symsets.  When we do, we
 732 will often want to know if anything changed (as they might mean there
 733 is more work to do).  So `symset_union` reports whether anything was
 734 added to the first set.  We use a slow quadratic approach as these
 735 sets don't really get very big.  If profiles shows this to be a problem is
 736 can be optimised later.
 737
 738         static int symset_union(struct symset *a, struct symset *b)
 739         {
 740                 int i;
 741                 int added = 0;
 742                 for (i = 0; i < b->cnt; i++)
 743                         if (symset_find(a, b->syms[i]) < 0) {
 744                                 unsigned short data = 0;
 745                                 if (b->data != NO_DATA)
 746                                         data = b->data[i];
 747                                 symset_add(a, b->syms[i], data);
 748                                 added++;
 749                         }
 750                 return added;
 751         }
 752
 753 And of course we must be able to free a symset.
 754
 755         static void symset_free(struct symset ss)
 756         {
 757                 free(ss.syms);
 758                 if (ss.data != NO_DATA)
 759                         free(ss.data);
 760         }
 761
 762 ### Symset Storage
 763
 764 Some symsets are simply stored somewhere appropriate in the `grammar`
 765 data structure, others needs a bit of indirection.  This applies
 766 particularly to "LA" sets.  These will be paired with "items" in an
 767 "itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
 768 stored in the `data` field, so we need a mapping from a small (short)
 769 number to an LA `symset`.
 770
 771 As mentioned earlier this involves creating a list of unique symsets.
 772
 773 For now, we just use a linear list sorted by insertion.  A skiplist
 774 would be more efficient and may be added later.
 775
 776 ###### declarations
 777
 778         struct setlist {
 779                 struct symset ss;
 780                 int num;
 781                 struct setlist *next;
 782         };
 783
 784 ###### grammar fields
 785         struct setlist *sets;
 786         int nextset;
 787
 788 ###### functions
 789
 790         static int ss_cmp(struct symset a, struct symset b)
 791         {
 792                 int i;
 793                 int diff = a.cnt - b.cnt;
 794                 if (diff)
 795                         return diff;
 796                 for (i = 0; i < a.cnt; i++) {
 797                         diff = (int)a.syms[i] - (int)b.syms[i];
 798                         if (diff)
 799                                 return diff;
 800                 }
 801                 return 0;
 802         }
 803
 804         static int save_set(struct grammar *g, struct symset ss)
 805         {
 806                 struct setlist **sl = &g->sets;
 807                 int cmp = 1;
 808                 struct setlist *s;
 809
 810                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 811                         sl = & (*sl)->next;
 812                 if (cmp == 0) {
 813                         symset_free(ss);
 814                         return (*sl)->num;
 815                 }
 816
 817                 s = malloc(sizeof(*s));
 818                 s->ss = ss;
 819                 s->num = g->nextset;
 820                 g->nextset += 1;
 821                 s->next = *sl;
 822                 *sl = s;
 823                 return s->num;
 824         }
 825
 826 Finding a set by number is currently performed by a simple linear search.
 827 If this turns out to hurt performance, we can store the sets in a dynamic
 828 array like the productions.
 829
 830         static struct symset set_find(struct grammar *g, int num)
 831         {
 832                 struct setlist *sl = g->sets;
 833                 while (sl && sl->num != num)
 834                         sl = sl->next;
 835                 return sl->ss;
 836         }
 837
 838
 839 ### Setting `nullable`
 840
 841 We set `nullable` on the head symbol for any production for which all
 842 the body symbols (if any) are nullable.  As this is a recursive
 843 definition, any change in the `nullable` setting means that we need to
 844 re-evaluate where it needs to be set.
 845
 846 We simply loop around performing the same calculations until no more
 847 changes happen.
 848
 849 ###### symbol fields
 850         int nullable;
 851
 852 ###### functions
 853         static void set_nullable(struct grammar *g)
 854         {
 855                 int check_again = 1;
 856                 while (check_again) {
 857                         int p;
 858                         check_again = 0;
 859                         for (p = 0; p < g->production_count; p++) {
 860                                 struct production *pr = g->productions[p];
 861                                 int s;
 862
 863                                 if (pr->head->nullable)
 864                                         continue;
 865                                 for (s = 0; s < pr->body_size; s++)
 866                                         if (! pr->body[s]->nullable)
 867                                                 break;
 868                                 if (s == pr->body_size) {
 869                                         pr->head->nullable = 1;
 870                                         check_again = 1;
 871                                 }
 872                         }
 873                 }
 874         }
 875
 876 ### Setting `can_eol`
 877
 878 In order to be able to ignore newline tokens when not relevant, but
 879 still include them in the parse when needed, we will need to know
 880 which states can start a "line-like" section of code.  We ignore
 881 newlines when there is an indent since the most recent start of a
 882 line-like section.
 883
 884 To know what is line-like, we first need to know which symbols can end
 885 a line-like section, which is precisely those which can end with a
 886 newline token.  These symbols don't necessarily alway end with a
 887 newline, but they can.  Hence they are not described as "lines" but
 888 only "line-like".
 889
 890 Clearly the `TK_newline` token can end with a newline.  Any symbol
 891 which is the head of a production that contains a line-ending symbol
 892 followed only by nullable symbols is also a line-ending symbol.  We
 893 use a new field `can_eol` to record this attribute of symbols, and
 894 compute it in a repetitive manner similar to `set_nullable`.
 895
 896 ###### symbol fields
 897         int can_eol;
 898
 899 ###### functions
 900         static void set_can_eol(struct grammar *g)
 901         {
 902                 int check_again = 1;
 903                 g->symtab[TK_newline]->can_eol = 1;
 904                 while (check_again) {
 905                         int p;
 906                         check_again = 0;
 907                         for (p = 0; p < g->production_count; p++) {
 908                                 struct production *pr = g->productions[p];
 909                                 int s;
 910
 911                                 if (pr->head->can_eol)
 912                                         continue;
 913
 914                                 for (s = pr->body_size - 1; s >= 0; s--) {
 915                                         if (pr->body[s]->can_eol) {
 916                                                 pr->head->can_eol = 1;
 917                                                 check_again = 1;
 918                                                 break;
 919                                         }
 920                                         if (!pr->body[s]->nullable)
 921                                                 break;
 922                                 }
 923                         }
 924                 }
 925         }
 926
 927 ### Building the `first` sets
 928
 929 When calculating what can follow a particular non-terminal, we will need to
 930 know what the "first" terminal in any subsequent non-terminal might be.  So
 931 we calculate the `first` set for every non-terminal and store them in an
 932 array.  We don't bother recording the "first" set for terminals as they are
 933 trivial.
 934
 935 As the "first" for one symbol might depend on the "first" of another,
 936 we repeat the calculations until no changes happen, just like with
 937 `set_nullable`.  This makes use of the fact that `symset_union`
 938 reports if any change happens.
 939
 940 The core of this which finds the "first" of part of a production body
 941 will be reused for computing the "follow" sets, so we split it out
 942 into a separate function.
 943
 944 ###### grammar fields
 945         struct symset *first;
 946
 947 ###### functions
 948
 949         static int add_first(struct production *pr, int start,
 950                              struct symset *target, struct grammar *g,
 951                              int *to_end)
 952         {
 953                 int s;
 954                 int changed = 0;
 955                 for (s = start; s < pr->body_size; s++) {
 956                         struct symbol *bs = pr->body[s];
 957                         if (bs->type == Terminal) {
 958                                 if (symset_find(target, bs->num) < 0) {
 959                                         symset_add(target, bs->num, 0);
 960                                         changed = 1;
 961                                 }
 962                                 break;
 963                         } else if (symset_union(target, &g->first[bs->num]))
 964                                 changed = 1;
 965                         if (!bs->nullable)
 966                                 break;
 967                 }
 968                 if (to_end)
 969                         *to_end = (s == pr->body_size);
 970                 return changed;
 971         }
 972
 973         static void build_first(struct grammar *g)
 974         {
 975                 int check_again = 1;
 976                 int s;
 977                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 978                 for (s = 0; s < g->num_syms; s++)
 979                         g->first[s] = INIT_SYMSET;
 980
 981                 while (check_again) {
 982                         int p;
 983                         check_again = 0;
 984                         for (p = 0; p < g->production_count; p++) {
 985                                 struct production *pr = g->productions[p];
 986                                 struct symset *head = &g->first[pr->head->num];
 987
 988                                 if (add_first(pr, 0, head, g, NULL))
 989                                         check_again = 1;
 990                         }
 991                 }
 992         }
 993
 994 ### Building the `follow` sets.
 995
 996 There are two different situations when we will want to generate "follow"
 997 sets.  If we are doing an SLR analysis, we want to generate a single
 998 "follow" set for each non-terminal in the grammar.  That is what is
 999 happening here.  If we are doing an LALR or LR analysis we will want
1000 to generate a separate "LA" set for each item.  We do that later
1001 in state generation.
1002
1003 There are two parts to generating a "follow" set.  Firstly we look at
1004 every place that any non-terminal appears in the body of any
1005 production, and we find the set of possible "first" symbols after
1006 there.  This is done using `add_first` above and only needs to be done
1007 once as the "first" sets are now stable and will not change.
1008
1009 ###### follow code
1010
1011         for (p = 0; p < g->production_count; p++) {
1012                 struct production *pr = g->productions[p];
1013                 int b;
1014
1015                 for (b = 0; b < pr->body_size - 1; b++) {
1016                         struct symbol *bs = pr->body[b];
1017                         if (bs->type == Terminal)
1018                                 continue;
1019                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1020                 }
1021         }
1022
1023 The second part is to add the "follow" set of the head of a production
1024 to the "follow" sets of the final symbol in the production, and any
1025 other symbol which is followed only by `nullable` symbols.  As this
1026 depends on "follow" itself we need to repeatedly perform the process
1027 until no further changes happen.
1028
1029 ###### follow code
1030
1031         for (again = 0, p = 0;
1032              p < g->production_count;
1033              p < g->production_count-1
1034                 ? p++ : again ? (again = 0, p = 0)
1035                               : p++) {
1036                 struct production *pr = g->productions[p];
1037                 int b;
1038
1039                 for (b = pr->body_size - 1; b >= 0; b--) {
1040                         struct symbol *bs = pr->body[b];
1041                         if (bs->type == Terminal)
1042                                 break;
1043                         if (symset_union(&g->follow[bs->num],
1044                                          &g->follow[pr->head->num]))
1045                                 again = 1;
1046                         if (!bs->nullable)
1047                                 break;
1048                 }
1049         }
1050
1051 We now just need to create and initialise the `follow` list to get a
1052 complete function.
1053
1054 ###### grammar fields
1055         struct symset *follow;
1056
1057 ###### functions
1058         static void build_follow(struct grammar *g)
1059         {
1060                 int s, again, p;
1061                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1062                 for (s = 0; s < g->num_syms; s++)
1063                         g->follow[s] = INIT_SYMSET;
1064                 ## follow code
1065         }
1066
1067 ### Building itemsets and states
1068
1069 There are three different levels of detail that can be chosen for
1070 building the itemsets and states for the LR grammar.  They are:
1071
1072 1. LR(0) or SLR(1), where no look-ahead is considered.
1073 2. LALR(1) where we build look-ahead sets with each item and merge
1074    the LA sets when we find two paths to the same "kernel" set of items.
1075 3. LR(1) where different look-ahead for any item in the set means
1076    a different state must be created.
1077
1078 ###### forward declarations
1079         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1080
1081 We need to be able to look through existing states to see if a newly
1082 generated state already exists.  For now we use a simple sorted linked
1083 list.
1084
1085 An item is a pair of numbers: the production number and the position of
1086 "DOT", which is an index into the body of the production.
1087 As the numbers are not enormous we can combine them into a single "short"
1088 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1089 production number (so 4000 productions with maximum of 15 symbols in the
1090 body).
1091
1092 Comparing the itemsets will be a little different to comparing symsets
1093 as we want to do the lookup after generating the "kernel" of an
1094 itemset, so we need to ignore the offset=zero items which are added during
1095 completion.
1096
1097 To facilitate this, we modify the "DOT" number so that "0" sorts to
1098 the end of the list in the symset, and then only compare items before
1099 the first "0".
1100
1101 ###### declarations
1102         static inline unsigned short item_num(int production, int index)
1103         {
1104                 return production | ((31-index) << 11);
1105         }
1106         static inline int item_prod(unsigned short item)
1107         {
1108                 return item & 0x7ff;
1109         }
1110         static inline int item_index(unsigned short item)
1111         {
1112                 return (31-(item >> 11)) & 0x1f;
1113         }
1114
1115 For LR(1) analysis we need to compare not just the itemset in a state
1116 but also the LA sets.  As we assign each unique LA set a number, we
1117 can just compare the symset and the data values together.
1118
1119 ###### functions
1120         static int itemset_cmp(struct symset a, struct symset b,
1121                                enum grammar_type type)
1122         {
1123                 int i;
1124                 int av, bv;
1125
1126                 for (i = 0;
1127                      i < a.cnt && i < b.cnt &&
1128                      item_index(a.syms[i]) > 0 &&
1129                      item_index(b.syms[i]) > 0;
1130                      i++) {
1131                         int diff = a.syms[i] - b.syms[i];
1132                         if (diff)
1133                                 return diff;
1134                         if (type == LR1) {
1135                                 diff = a.data[i] - b.data[i];
1136                                 if (diff)
1137                                         return diff;
1138                         }
1139                 }
1140                 if (i == a.cnt || item_index(a.syms[i]) == 0)
1141                         av = -1;
1142                 else
1143                         av = a.syms[i];
1144                 if (i == b.cnt || item_index(b.syms[i]) == 0)
1145                         bv = -1;
1146                 else
1147                         bv = b.syms[i];
1148                 if (av - bv)
1149                         return av - bv;
1150                 if (type < LR1 || av == -1)
1151                         return 0;
1152                 return
1153                         a.data[i] - b.data[i];
1154         }
1155
1156 And now we can build the list of itemsets.  The lookup routine returns
1157 both a success flag and a pointer to where in the list an insert
1158 should happen, so we don't need to search a second time.
1159
1160 ###### declarations
1161         struct itemset {
1162                 struct itemset *next;
1163                 short state;
1164                 struct symset items;
1165                 struct symset go_to;
1166                 char completed;
1167                 char starts_line;
1168         };
1169
1170 ###### grammar fields
1171         struct itemset *items;
1172         int states;
1173
1174 ###### functions
1175         static int itemset_find(struct grammar *g, struct itemset ***where,
1176                                 struct symset kernel, enum grammar_type type)
1177         {
1178                 struct itemset **ip;
1179
1180                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1181                         struct itemset *i = *ip;
1182                         int diff;
1183                         diff = itemset_cmp(i->items, kernel, type);
1184                         if (diff < 0)
1185                                 continue;
1186                         if (diff > 0)
1187                                 break;
1188                         /* found */
1189                         *where = ip;
1190                         return 1;
1191                 }
1192                 *where = ip;
1193                 return 0;
1194         }
1195
1196 Adding an itemset may require merging the LA sets if LALR analysis is
1197 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1198 clear the `completed` flag so that the dependants of this itemset will be
1199 recalculated and their LA sets updated.
1200
1201 `add_itemset` must consume the symsets it is passed, either by adding
1202 them to a data structure, of freeing them.
1203
1204         static int add_itemset(struct grammar *g, struct symset ss,
1205                                enum grammar_type type, int starts_line)
1206         {
1207                 struct itemset **where, *is;
1208                 int i;
1209                 int found = itemset_find(g, &where, ss, type);
1210                 if (!found) {
1211                         is = calloc(1, sizeof(*is));
1212                         is->state = g->states;
1213                         g->states += 1;
1214                         is->items = ss;
1215                         is->next = *where;
1216                         is->go_to = INIT_DATASET;
1217                         is->starts_line = starts_line;
1218                         *where = is;
1219                         return is->state;
1220                 }
1221                 is = *where;
1222                 if (type != LALR) {
1223                         symset_free(ss);
1224                         return is->state;
1225                 }
1226                 for (i = 0; i < ss.cnt; i++) {
1227                         struct symset temp = INIT_SYMSET, s;
1228                         if (ss.data[i] == is->items.data[i])
1229                                 continue;
1230                         s = set_find(g, is->items.data[i]);
1231                         symset_union(&temp, &s);
1232                         s = set_find(g, ss.data[i]);
1233                         if (symset_union(&temp, &s)) {
1234                                 is->items.data[i] = save_set(g, temp);
1235                                 is->completed = 0;
1236                         } else
1237                                 symset_free(temp);
1238                 }
1239                 symset_free(ss);
1240                 return is->state;
1241         }
1242
1243 #### The build
1244
1245 To build all the itemsets, we first insert the initial itemset made
1246 from production zero, complete each itemset, and then generate new
1247 itemsets from old until no new ones can be made.
1248
1249 Completing an itemset means finding all the items where "DOT" is followed by
1250 a nonterminal and adding "DOT=0" items for every production from that
1251 non-terminal - providing each item hasn't already been added.
1252
1253 If LA sets are needed, the LA set for each new item is found using
1254 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
1255 be supplemented by the LA set for the item which produce the new item.
1256
1257 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1258 is used in the next stage.
1259
1260 NOTE: precedence handling should happen here - I haven't written this yet
1261 though.
1262
1263 ###### complete itemset
1264         for (i = 0; i < is->items.cnt; i++) {
1265                 int p = item_prod(is->items.syms[i]);
1266                 int bs = item_index(is->items.syms[i]);
1267                 struct production *pr = g->productions[p];
1268                 int p2;
1269                 struct symbol *s;
1270                 struct symset LA = INIT_SYMSET;
1271                 unsigned short sn = 0;
1272
1273                 if (bs == pr->body_size)
1274                         continue;
1275                 s = pr->body[bs];
1276                 if (symset_find(&done, s->num) < 0)
1277                         symset_add(&done, s->num, 0);
1278                 if (s->type != Nonterminal)
1279                         continue;
1280                 again = 1;
1281                 if (type >= LALR) {
1282                         // Need the LA set.
1283                         int to_end;
1284                         add_first(pr, bs+1, &LA, g, &to_end);
1285                         if (to_end) {
1286                                 struct symset ss = set_find(g, is->items.data[i]);
1287                                 symset_union(&LA, &ss);
1288                         }
1289                         sn = save_set(g, LA);
1290                         LA = set_find(g, sn);
1291                 }
1292
1293                 /* Add productions for this symbol */
1294                 for (p2 = s->first_production;
1295                      p2 < g->production_count &&
1296                       g->productions[p2]->head == s;
1297                      p2++) {
1298                         int itm = item_num(p2, 0);
1299                         int pos = symset_find(&is->items, itm);
1300                         if (pos < 0) {
1301                                 symset_add(&is->items, itm, sn);
1302                                 /* Will have re-ordered, so start
1303                                  * from beginning again */
1304                                 i = -1;
1305                         } else if (type >= LALR) {
1306                                 struct symset ss = set_find(g, is->items.data[pos]);
1307                                 struct symset tmp = INIT_SYMSET;
1308
1309                                 symset_union(&tmp, &ss);
1310                                 if (symset_union(&tmp, &LA)) {
1311                                         is->items.data[pos] = save_set(g, tmp);
1312                                         i = -1;
1313                                 }else
1314                                         symset_free(tmp);
1315                         }
1316                 }
1317         }
1318
1319 For each symbol we found (and placed in `done`) we collect all the items for
1320 which this symbol is next, and create a new itemset with "DOT" advanced over
1321 the symbol.  This is then added to the collection of itemsets (or merged
1322 with a pre-existing itemset).
1323
1324 ###### derive itemsets
1325         // Now we have a completed itemset, so we need to
1326         // compute all the 'next' itemsets and create them
1327         // if they don't exist.
1328         for (i = 0; i < done.cnt; i++) {
1329                 int j;
1330                 unsigned short state;
1331                 int starts_line = 0;
1332                 struct symbol *sym = g->symtab[done.syms[i]];
1333                 struct symset newitemset = INIT_SYMSET;
1334                 if (type >= LALR)
1335                         newitemset = INIT_DATASET;
1336
1337                 if (sym->can_eol ||
1338                     (sym->nullable && is->starts_line))
1339                         starts_line = 1;
1340                 for (j = 0; j < is->items.cnt; j++) {
1341                         int itm = is->items.syms[j];
1342                         int p = item_prod(itm);
1343                         int bp = item_index(itm);
1344                         struct production *pr = g->productions[p];
1345                         unsigned short la = 0;
1346                         int pos;
1347
1348                         if (bp == pr->body_size)
1349                                 continue;
1350                         if (pr->body[bp] != sym)
1351                                 continue;
1352                         if (type >= LALR)
1353                                 la = is->items.data[j];
1354                         pos = symset_find(&newitemset, pr->head->num);
1355                         if (pos < 0)
1356                                 symset_add(&newitemset, item_num(p, bp+1), la);
1357                         else if (type >= LALR) {
1358                                 // Need to merge la set.
1359                                 int la2 = newitemset.data[pos];
1360                                 if (la != la2) {
1361                                         struct symset ss = set_find(g, la2);
1362                                         struct symset LA = INIT_SYMSET;
1363                                         symset_union(&LA, &ss);
1364                                         ss = set_find(g, la);
1365                                         if (symset_union(&LA, &ss))
1366                                                 newitemset.data[pos] = save_set(g, LA);
1367                                         else
1368                                                 symset_free(LA);
1369                                 }
1370                         }
1371                 }
1372                 state = add_itemset(g, newitemset, type, starts_line);
1373                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1374                         symset_add(&is->go_to, done.syms[i], state);
1375         }
1376
1377 All that is left is to crate the initial itemset from production zero, and
1378 with `TK_eof` as the LA set.
1379
1380 ###### functions
1381         static void build_itemsets(struct grammar *g, enum grammar_type type)
1382         {
1383                 struct symset first = INIT_SYMSET;
1384                 struct itemset *is;
1385                 int again;
1386                 unsigned short la = 0;
1387                 if (type >= LALR) {
1388                         // LA set just has eof
1389                         struct symset eof = INIT_SYMSET;
1390                         symset_add(&eof, TK_eof, 0);
1391                         la = save_set(g, eof);
1392                         first = INIT_DATASET;
1393                 }
1394                 // production 0, offset 0 (with no data)
1395                 symset_add(&first, item_num(0, 0), la);
1396                 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1397                 for (again = 0, is = g->items;
1398                      is;
1399                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
1400                         int i;
1401                         struct symset done = INIT_SYMSET;
1402                         if (is->completed)
1403                                 continue;
1404                         is->completed = 1;
1405                         again = 1;
1406                         ## complete itemset
1407                         ## derive itemsets
1408                         symset_free(done);
1409                 }
1410         }
1411
1412 ### Completing the analysis.
1413
1414 The exact process of analysis depends on which level was requested.  For
1415 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1416 `LR1` we need first, but not follow.  For `SLR` we need both.
1417
1418 We don't build the "action" tables that you might expect as the parser
1419 doesn't use them.  They will be calculated to some extent if needed for
1420 a report.
1421
1422 Once we have built everything we allocate arrays for the two lists:
1423 symbols and itemsets.  This allows more efficient access during reporting.
1424 The symbols are grouped as terminals and non-terminals and we record the
1425 changeover point in `first_nonterm`.
1426
1427 ###### grammar fields
1428         struct symbol **symtab;
1429         struct itemset **statetab;
1430         int first_nonterm;
1431
1432 ###### grammar_analyse
1433
1434         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1435         {
1436                 struct symbol *s;
1437                 struct itemset *is;
1438                 int snum = TK_reserved;
1439                 for (s = g->syms; s; s = s->next)
1440                         if (s->num < 0 && s->type == Terminal) {
1441                                 s->num = snum;
1442                                 snum++;
1443                         }
1444                 g->first_nonterm = snum;
1445                 for (s = g->syms; s; s = s->next)
1446                         if (s->num < 0) {
1447                                 s->num = snum;
1448                                 snum++;
1449                         }
1450                 g->num_syms = snum;
1451                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1452                 for (s = g->syms; s; s = s->next)
1453                         g->symtab[s->num] = s;
1454
1455                 set_nullable(g);
1456                 set_can_eol(g);
1457                 if (type >= SLR)
1458                         build_first(g);
1459
1460                 if (type == SLR)
1461                         build_follow(g);
1462
1463                 build_itemsets(g, type);
1464
1465                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1466                 for (is = g->items; is ; is = is->next)
1467                         g->statetab[is->state] = is;
1468         }
1469
1470 ## Reporting on the Grammar
1471
1472 The purpose of the report is to give the grammar developer insight into
1473 how the grammar parser will work.  It is basically a structured dump of
1474 all the tables that have been generated, plus a description of any conflicts.
1475
1476 ###### grammar_report
1477         static int grammar_report(struct grammar *g, enum grammar_type type)
1478         {
1479                 report_symbols(g);
1480                 if (g->follow)
1481                         report_follow(g);
1482                 report_itemsets(g);
1483                 return report_conflicts(g, type);
1484         }
1485
1486 Firstly we have the complete list of symbols, together with the
1487 "FIRST" set if that was generated.  We add a mark to each symbol to
1488 show if it can end in a newline (`>`), or if it is nullable (`.`).
1489
1490 ###### functions
1491
1492         static void report_symbols(struct grammar *g)
1493         {
1494                 int n;
1495                 if (g->first)
1496                         printf("SYMBOLS + FIRST:\n");
1497                 else
1498                         printf("SYMBOLS:\n");
1499
1500                 for (n = 0; n < g->num_syms; n++) {
1501                         struct symbol *s = g->symtab[n];
1502                         if (!s)
1503                                 continue;
1504
1505                         printf(" %c%c%3d%c: ",
1506                                s->nullable ? '.':' ',
1507                                s->can_eol ? '>':' ',
1508                                s->num, symtypes[s->type]);
1509                         prtxt(s->name);
1510                         if (s->precedence)
1511                                 printf(" (%d%s)", s->precedence,
1512                                        assoc_names[s->assoc]);
1513
1514                         if (g->first && s->type == Nonterminal) {
1515                                 int j;
1516                                 char c = ':';
1517                                 for (j = 0; j < g->first[n].cnt; j++) {
1518                                         printf("%c ", c);
1519                                         c = ',';
1520                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1521                                 }
1522                         }
1523                         printf("\n");
1524                 }
1525         }
1526
1527 Then we have the follow sets if they were computed.
1528
1529         static void report_follow(struct grammar *g)
1530         {
1531                 int n;
1532                 printf("FOLLOW:\n");
1533                 for (n = 0; n < g->num_syms; n++)
1534                         if (g->follow[n].cnt) {
1535                                 int j;
1536                                 char c = ':';
1537                                 printf("  ");
1538                                 prtxt(g->symtab[n]->name);
1539                                 for (j = 0; j < g->follow[n].cnt; j++) {
1540                                         printf("%c ", c);
1541                                         c = ',';
1542                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1543                                 }
1544                                 printf("\n");
1545                         }
1546         }
1547
1548 And finally the item sets.  These include the GO TO tables and, for
1549 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1550 it up a bit.  First the items, with production number and associativity.
1551
1552         static void report_item(struct grammar *g, int itm)
1553         {
1554                 int p = item_prod(itm);
1555                 int dot = item_index(itm);
1556                 struct production *pr = g->productions[p];
1557                 int i;
1558
1559                 printf("    ");
1560                 prtxt(pr->head->name);
1561                 printf(" ->");
1562                 for (i = 0; i < pr->body_size; i++) {
1563                         printf(" %s", (dot == i ? ". ": ""));
1564                         prtxt(pr->body[i]->name);
1565                 }
1566                 if (dot == pr->body_size)
1567                         printf(" .");
1568                 printf(" [%d]", p);
1569                 if (pr->precedence)
1570                         printf(" (%d%s)", pr->precedence,
1571                                assoc_names[pr->assoc]);
1572                 printf("\n");
1573         }
1574
1575 The LA sets which are (possibly) reported with each item:
1576
1577         static void report_la(struct grammar *g, int lanum)
1578         {
1579                 struct symset la = set_find(g, lanum);
1580                 int i;
1581                 char c = ':';
1582
1583                 printf("        LOOK AHEAD(%d)", lanum);
1584                 for (i = 0; i < la.cnt; i++) {
1585                         printf("%c ", c);
1586                         c = ',';
1587                         prtxt(g->symtab[la.syms[i]]->name);
1588                 }
1589                 printf("\n");
1590         }
1591
1592 Then the go to sets:
1593
1594
1595         static void report_goto(struct grammar *g, struct symset gt)
1596         {
1597                 int i;
1598                 printf("    GOTO:\n");
1599
1600                 for (i = 0; i < gt.cnt; i++) {
1601                         printf("      ");
1602                         prtxt(g->symtab[gt.syms[i]]->name);
1603                         printf(" -> %d\n", gt.data[i]);
1604                 }
1605         }
1606
1607 Now we can report all the item sets complete with items, LA sets, and GO TO.
1608
1609         static void report_itemsets(struct grammar *g)
1610         {
1611                 int s;
1612                 printf("ITEM SETS(%d)\n", g->states);
1613                 for (s = 0; s < g->states; s++) {
1614                         int j;
1615                         struct itemset *is = g->statetab[s];
1616                         printf("  Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1617                         for (j = 0; j < is->items.cnt; j++) {
1618                                 report_item(g, is->items.syms[j]);
1619                                 if (is->items.data != NO_DATA)
1620                                         report_la(g, is->items.data[j]);
1621                         }
1622                         report_goto(g, is->go_to);
1623                 }
1624         }
1625
1626 ### Reporting conflicts
1627
1628 Conflict detection varies a lot among different analysis levels.  However
1629 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1630 LR1 are also similar as they have FOLLOW or LA sets.
1631
1632 ###### functions
1633
1634         ## conflict functions
1635
1636         static int report_conflicts(struct grammar *g, enum grammar_type type)
1637         {
1638                 int cnt = 0;
1639                 printf("Conflicts:\n");
1640                 if (type < SLR)
1641                         cnt = conflicts_lr0(g, type);
1642                 else
1643                         cnt = conflicts_slr(g, type);
1644                 if (cnt == 0)
1645                         printf(" - no conflicts\n");
1646                 return cnt;
1647         }
1648
1649 LR0 conflicts are any state which have both a reducible item and
1650 a shiftable item, or two reducible items.
1651
1652 LR05 conflicts only occurs if two possibly reductions exist,
1653 as shifts always over-ride reductions.
1654
1655 ###### conflict functions
1656         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1657         {
1658                 int i;
1659                 int cnt = 0;
1660                 for (i = 0; i < g->states; i++) {
1661                         struct itemset *is = g->statetab[i];
1662                         int last_reduce = -1;
1663                         int prev_reduce = -1;
1664                         int last_shift = -1;
1665                         int j;
1666                         if (!is)
1667                                 continue;
1668                         for (j = 0; j < is->items.cnt; j++) {
1669                                 int itm = is->items.syms[j];
1670                                 int p = item_prod(itm);
1671                                 int bp = item_index(itm);
1672                                 struct production *pr = g->productions[p];
1673
1674                                 if (bp == pr->body_size) {
1675                                         prev_reduce = last_reduce;
1676                                         last_reduce = j;
1677                                         continue;
1678                                 }
1679                                 if (pr->body[bp]->type == Terminal)
1680                                         last_shift = j;
1681                         }
1682                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1683                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1684                                 report_item(g, is->items.syms[last_shift]);
1685                                 report_item(g, is->items.syms[last_reduce]);
1686                                 cnt++;
1687                         }
1688                         if (prev_reduce >= 0) {
1689                                 printf("  State %d has 2 (or more) reducible items\n", i);
1690                                 report_item(g, is->items.syms[prev_reduce]);
1691                                 report_item(g, is->items.syms[last_reduce]);
1692                                 cnt++;
1693                         }
1694                 }
1695                 return cnt;
1696         }
1697
1698 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1699 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1700 in the source of the look ahead set.
1701
1702 We build two datasets to reflect the "action" table: one which maps
1703 terminals to items where that terminal could be shifted and another
1704 which maps terminals to items that could be reduced when the terminal
1705 is in look-ahead.  We report when we get conflicts between the two.
1706
1707         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1708         {
1709                 int i;
1710                 int cnt = 0;
1711
1712                 for (i = 0; i < g->states; i++) {
1713                         struct itemset *is = g->statetab[i];
1714                         struct symset shifts = INIT_DATASET;
1715                         struct symset reduce = INIT_DATASET;
1716                         int j;
1717                         if (!is)
1718                                 continue;
1719                         /* First collect the shifts */
1720                         for (j = 0; j < is->items.cnt; j++) {
1721                                 unsigned short itm = is->items.syms[j];
1722                                 int p = item_prod(itm);
1723                                 int bp = item_index(itm);
1724                                 struct production *pr = g->productions[p];
1725
1726                                 if (bp < pr->body_size &&
1727                                     pr->body[bp]->type == Terminal) {
1728                                         /* shiftable */
1729                                         int sym = pr->body[bp]->num;
1730                                         if (symset_find(&shifts, sym) < 0)
1731                                                 symset_add(&shifts, sym, itm);
1732                                 }
1733                         }
1734                         /* Now look for reduction and conflicts */
1735                         for (j = 0; j < is->items.cnt; j++) {
1736                                 unsigned short itm = is->items.syms[j];
1737                                 int p = item_prod(itm);
1738                                 int bp = item_index(itm);
1739                                 struct production *pr = g->productions[p];
1740
1741                                 if (bp < pr->body_size)
1742                                         continue;
1743                                 /* reducible */
1744                                 struct symset la;
1745                                 if (type == SLR)
1746                                         la = g->follow[pr->head->num];
1747                                 else
1748                                         la = set_find(g, is->items.data[j]);
1749                                 int k;
1750                                 for (k = 0; k < la.cnt; k++) {
1751                                         int pos = symset_find(&shifts, la.syms[k]);
1752                                         if (pos >= 0) {
1753                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1754                                                 prtxt(g->symtab[la.syms[k]]->name);
1755                                                 printf(":\n");
1756                                                 report_item(g, shifts.data[pos]);
1757                                                 report_item(g, itm);
1758                                                 cnt++;
1759                                         }
1760                                         pos = symset_find(&reduce, la.syms[k]);
1761                                         if (pos < 0) {
1762                                                 symset_add(&reduce, la.syms[k], itm);
1763                                                 continue;
1764                                         }
1765                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1766                                         prtxt(g->symtab[la.syms[k]]->name);
1767                                         printf(":\n");
1768                                         report_item(g, itm);
1769                                         report_item(g, reduce.data[pos]);
1770                                         cnt++;
1771                                 }
1772                         }
1773                         symset_free(shifts);
1774                         symset_free(reduce);
1775                 }
1776                 return cnt;
1777         }
1778
1779
1780 ## Generating the parser
1781
1782 The exported part of the parser is the `parse_XX` function, where the name
1783 `XX` is based on the name of the parser files.
1784
1785 This takes a `code_node`, a partially initialized `token_config`, and an
1786 optional `FILE` to send tracing to.  The `token_config` gets the list of
1787 known words added and then is used with the `code_node` to initialize the
1788 scanner.
1789
1790 `parse_XX` then calls the library function `parser_run` to actually complete
1791 the parse.  This needs the `states` table and function to call the various
1792 pieces of code provided in the grammar file, so they are generated first.
1793
1794 ###### parser_generate
1795
1796         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1797         {
1798                 gen_known(f, g);
1799                 gen_non_term(f, g);
1800                 gen_goto(f, g);
1801                 gen_states(f, g);
1802                 gen_reduce(f, g, file);
1803                 gen_free(f, g);
1804
1805                 fprintf(f, "#line 0 \"gen_parser\"\n");
1806                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1807                         name);
1808                 fprintf(f, "{\n");
1809                 fprintf(f, "\tstruct token_state *tokens;\n");
1810                 fprintf(f, "\tconfig->words_marks = known;\n");
1811                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1812                 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1813                 fprintf(f, "\ttokens = token_open(code, config);\n");
1814                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1815                 fprintf(f, "\ttoken_close(tokens);\n");
1816                 fprintf(f, "\treturn rv;\n");
1817                 fprintf(f, "}\n\n");
1818         }
1819
1820 ### Known words table
1821
1822 The known words table is simply an array of terminal symbols.
1823 The table of nonterminals used for tracing is a similar array.
1824
1825 ###### functions
1826
1827         static void gen_known(FILE *f, struct grammar *g)
1828         {
1829                 int i;
1830                 fprintf(f, "#line 0 \"gen_known\"\n");
1831                 fprintf(f, "static const char *known[] = {\n");
1832                 for (i = TK_reserved;
1833                      i < g->num_syms && g->symtab[i]->type == Terminal;
1834                      i++)
1835                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1836                                 g->symtab[i]->name.txt);
1837                 fprintf(f, "};\n\n");
1838         }
1839
1840         static void gen_non_term(FILE *f, struct grammar *g)
1841         {
1842                 int i;
1843                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1844                 fprintf(f, "static const char *non_term[] = {\n");
1845                 for (i = TK_reserved;
1846                      i < g->num_syms;
1847                      i++)
1848                         if (g->symtab[i]->type == Nonterminal)
1849                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1850                                         g->symtab[i]->name.txt);
1851                 fprintf(f, "};\n\n");
1852         }
1853
1854 ### States and the goto tables.
1855
1856 For each state we record the goto table, the reducible production if
1857 there is one, or a symbol to shift for error recovery.
1858 Some of the details of the reducible production are stored in the
1859 `do_reduce` function to come later.  Here we store the production number,
1860 the body size (useful for stack management) and the resulting symbol (useful
1861 for knowing how to free data later).
1862
1863 The go to table is stored in a simple array of `sym` and corresponding
1864 `state`.
1865
1866 ###### exported types
1867
1868         struct lookup {
1869                 short sym;
1870                 short state;
1871         };
1872         struct state {
1873                 short go_to_cnt;
1874                 const struct lookup * go_to;
1875                 short reduce_prod;
1876                 short reduce_size;
1877                 short reduce_sym;
1878                 short shift_sym;
1879                 short starts_line;
1880         };
1881
1882
1883 ###### functions
1884
1885         static void gen_goto(FILE *f, struct grammar *g)
1886         {
1887                 int i;
1888                 fprintf(f, "#line 0 \"gen_goto\"\n");
1889                 for (i = 0; i < g->states; i++) {
1890                         int j;
1891                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1892                                 i);
1893                         struct symset gt = g->statetab[i]->go_to;
1894                         for (j = 0; j < gt.cnt; j++)
1895                                 fprintf(f, "\t{ %d, %d },\n",
1896                                         gt.syms[j], gt.data[j]);
1897                         fprintf(f, "};\n");
1898                 }
1899         }
1900
1901 ###### functions
1902
1903         static void gen_states(FILE *f, struct grammar *g)
1904         {
1905                 int i;
1906                 fprintf(f, "#line 0 \"gen_states\"\n");
1907                 fprintf(f, "static const struct state states[] = {\n");
1908                 for (i = 0; i < g->states; i++) {
1909                         struct itemset *is = g->statetab[i];
1910                         int j, prod = -1, prod_len;
1911                         int shift_sym = -1;
1912                         int shift_len = 0, shift_remain = 0;
1913                         for (j = 0; j < is->items.cnt; j++) {
1914                                 int itm = is->items.syms[j];
1915                                 int p = item_prod(itm);
1916                                 int bp = item_index(itm);
1917                                 struct production *pr = g->productions[p];
1918
1919                                 if (bp < pr->body_size) {
1920                                         if (shift_sym < 0 ||
1921                                             (shift_len == bp && shift_remain > pr->body_size - bp)) {
1922                                                 shift_sym = pr->body[bp]->num;
1923                                                 shift_len = bp;
1924                                                 shift_remain = pr->body_size - bp;
1925                                         }
1926                                         continue;
1927                                 }
1928                                 /* This is what we reduce */
1929                                 if (prod < 0 || prod_len < pr->body_size) {
1930                                         prod = p;
1931                                         prod_len = pr->body_size;
1932                                 }
1933                         }
1934
1935                         if (prod >= 0)
1936                                 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1937                                         i, is->go_to.cnt, i, prod,
1938                                         g->productions[prod]->body_size,
1939                                         g->productions[prod]->head->num,
1940                                         is->starts_line);
1941                         else
1942                                 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1943                                         i, is->go_to.cnt, i, shift_sym,
1944                                         is->starts_line);
1945                 }
1946                 fprintf(f, "};\n\n");
1947         }
1948
1949 ### The `do_reduce` function and the code
1950
1951 When the parser engine decides to reduce a production, it calls `do_reduce`.
1952 This has two functions.
1953
1954 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1955 production being reduced.  Secondly it runs the code that was included with
1956 the production if any.
1957
1958 This code needs to be able to store data somewhere.  Rather than requiring
1959 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1960 `do_reduce` return the size to be saved.
1961
1962 The code fragment requires translation when written out.  Any `$N` needs to
1963 be converted to a reference either to that buffer (if `$0`) or to the
1964 structure returned by a previous reduction.  These pointers need to be cast
1965 to the appropriate type for each access.  All this is handling in
1966 `gen_code`.
1967
1968
1969 ###### functions
1970
1971         static void gen_code(struct production *p, FILE *f, struct grammar *g)
1972         {
1973                 char *c;
1974                 fprintf(f, "\t\t\t");
1975                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1976                         int n;
1977                         if (*c != '$') {
1978                                 fputc(*c, f);
1979                                 if (*c == '\n')
1980                                         fputs("\t\t\t", f);
1981                                 continue;
1982                         }
1983                         c++;
1984                         if (*c < '0' || *c > '9') {
1985                                 fputc(*c, f);
1986                                 continue;
1987                         }
1988                         n = *c - '0';
1989                         while (c[1] >= '0' && c[1] <= '9') {
1990                                 c += 1;
1991                                 n = n * 10 + *c - '0';
1992                         }
1993                         if (n == 0)
1994                                 fprintf(f, "(*(struct %.*s*%s)ret)",
1995                                         p->head->struct_name.len,
1996                                         p->head->struct_name.txt,
1997                                         p->head->isref ? "*":"");
1998                         else if (n > p->body_size)
1999                                 fprintf(f, "$%d", n);
2000                         else if (p->body[n-1]->type == Terminal)
2001                                 fprintf(f, "(*(struct token *)body[%d])",
2002                                         n-1);
2003                         else if (p->body[n-1]->struct_name.txt == NULL)
2004                                 fprintf(f, "$%d", n);
2005                         else
2006                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2007                                         p->body[n-1]->struct_name.len,
2008                                         p->body[n-1]->struct_name.txt,
2009                                         p->body[n-1]->isref ? "*":"", n-1);
2010                 }
2011                 fputs("\n", f);
2012         }
2013
2014 ###### functions
2015
2016         static void gen_reduce(FILE *f, struct grammar *g, char *file)
2017         {
2018                 int i;
2019                 fprintf(f, "#line 0 \"gen_reduce\"\n");
2020                 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2021                 fprintf(f, "{\n");
2022                 fprintf(f, "\tint ret_size = 0;\n");
2023
2024                 fprintf(f, "\tswitch(prod) {\n");
2025                 for (i = 0; i < g->production_count; i++) {
2026                         struct production *p = g->productions[i];
2027                         fprintf(f, "\tcase %d:\n", i);
2028
2029                         if (p->code.txt)
2030                                 gen_code(p, f, g);
2031
2032                         if (p->head->struct_name.txt)
2033                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2034                                         p->head->struct_name.len,
2035                                         p->head->struct_name.txt,
2036                                         p->head->isref ? "*":"");
2037
2038                         fprintf(f, "\t\tbreak;\n");
2039                 }
2040                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2041         }
2042
2043 ### `do_free`
2044
2045 As each non-terminal can potentially cause a different type of data
2046 structure to be allocated and filled in, we need to be able to free it when
2047 done.
2048
2049 It is particularly important to have fine control over freeing during error
2050 recovery where individual stack frames might need to be freed.
2051
2052 For this, the grammar author is required to defined a `free_XX` function for
2053 each structure that is used by a non-terminal.  `do_free` will call whichever
2054 is appropriate given a symbol number, and will call `free` (as is
2055 appropriate for tokens on any terminal symbol.
2056
2057 ###### functions
2058
2059         static void gen_free(FILE *f, struct grammar *g)
2060         {
2061                 int i;
2062
2063                 fprintf(f, "#line 0 \"gen_free\"\n");
2064                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2065                 fprintf(f, "{\n");
2066                 fprintf(f, "\tif (!asn) return;\n");
2067                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2068                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2069                 fprintf(f, "\tswitch(sym) {\n");
2070
2071                 for (i = 0; i < g->num_syms; i++) {
2072                         struct symbol *s = g->symtab[i];
2073                         if (!s ||
2074                             s->type != Nonterminal ||
2075                             s->struct_name.txt == NULL)
2076                                 continue;
2077
2078                         fprintf(f, "\tcase %d:\n", s->num);
2079                         if (s->isref)
2080                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2081                                         s->struct_name.len,
2082                                         s->struct_name.txt);
2083                         else
2084                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2085                                         s->struct_name.len,
2086                                         s->struct_name.txt);
2087                         fprintf(f, "\t\tbreak;\n");
2088                 }
2089                 fprintf(f, "\t}\n}\n\n");
2090         }
2091
2092 ## The main routine.
2093
2094 There are three key parts to the "main" function of parsergen: processing
2095 the arguments, loading the grammar file, and dealing with the grammar.
2096
2097 ### Argument processing.
2098
2099 Command line options allow the selection of analysis mode, name of output
2100 file, and whether or not a report should be generated.  By default we create
2101 a report only if no code output was requested.
2102
2103 The `parse_XX` function name uses the basename of the output file which
2104 should not have a suffix (`.c`).  `.c` is added to the given name for the
2105 code, and `.h` is added for the header (if header text is specifed in the
2106 grammar file).
2107
2108 ###### includes
2109         #include <getopt.h>
2110
2111 ###### declarations
2112         static const struct option long_options[] = {
2113                 { "LR0",        0, NULL, '0' },
2114                 { "LR05",       0, NULL, '5' },
2115                 { "SLR",        0, NULL, 'S' },
2116                 { "LALR",       0, NULL, 'L' },
2117                 { "LR1",        0, NULL, '1' },
2118                 { "tag",        1, NULL, 't' },
2119                 { "report",     0, NULL, 'R' },
2120                 { "output",     1, NULL, 'o' },
2121                 { NULL,         0, NULL, 0   }
2122         };
2123         const char *options = "05SL1t:Ro:";
2124
2125 ###### process arguments
2126         int opt;
2127         char *outfile = NULL;
2128         char *infile;
2129         char *name;
2130         char *tag = NULL;
2131         int report = 1;
2132         enum grammar_type type = LR05;
2133         while ((opt = getopt_long(argc, argv, options,
2134                                   long_options, NULL)) != -1) {
2135                 switch(opt) {
2136                 case '0':
2137                         type = LR0; break;
2138                 case '5':
2139                         type = LR05; break;
2140                 case 'S':
2141                         type = SLR; break;
2142                 case 'L':
2143                         type = LALR; break;
2144                 case '1':
2145                         type = LR1; break;
2146                 case 'R':
2147                         report = 2; break;
2148                 case 'o':
2149                         outfile = optarg; break;
2150                 case 't':
2151                         tag = optarg; break;
2152                 default:
2153                         fprintf(stderr, "Usage: parsergen ...\n");
2154                         exit(1);
2155                 }
2156         }
2157         if (optind < argc)
2158                 infile = argv[optind++];
2159         else {
2160                 fprintf(stderr, "No input file given\n");
2161                 exit(1);
2162         }
2163         if (outfile && report == 1)
2164                 report = 0;
2165         name = outfile;
2166         if (name && strchr(name, '/'))
2167                 name = strrchr(name, '/')+1;
2168
2169         if (optind < argc) {
2170                 fprintf(stderr, "Excess command line arguments\n");
2171                 exit(1);
2172         }
2173
2174 ### Loading the grammar file
2175
2176 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2177 map it.
2178
2179 One we have extracted the code (with `mdcode`) we expect to find three
2180 sections: header, code, and grammar.  Anything else that is not
2181 excluded by the `--tag` option is an error.
2182
2183 "header" and "code" are optional, though it is hard to build a working
2184 parser with neither. "grammar" must be provided.
2185
2186 ###### includes
2187         #include <fcntl.h>
2188         #include <sys/mman.h>
2189         #include <errno.h>
2190
2191 ###### functions
2192         static int errs;
2193         static void pr_err(char *msg)
2194         {
2195                 errs++;
2196                 fprintf(stderr, "%s\n", msg);
2197         }
2198
2199 ###### load file
2200         struct section *table;
2201         int fd;
2202         int len;
2203         char *file;
2204         fd = open(infile, O_RDONLY);
2205         if (fd < 0) {
2206                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2207                         infile, strerror(errno));
2208                 exit(1);
2209         }
2210         len = lseek(fd, 0, 2);
2211         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2212         table = code_extract(file, file+len, pr_err);
2213
2214         struct code_node *hdr = NULL;
2215         struct code_node *code = NULL;
2216         struct code_node *gram = NULL;
2217         for (s = table; s; s = s->next) {
2218                 struct text sec = s->section;
2219                 if (tag && !strip_tag(&sec, tag))
2220                         continue;
2221                 if (text_is(sec, "header"))
2222                         hdr = s->code;
2223                 else if (text_is(sec, "code"))
2224                         code = s->code;
2225                 else if (text_is(sec, "grammar"))
2226                         gram = s->code;
2227                 else {
2228                         fprintf(stderr, "Unknown content section: %.*s\n",
2229                                 s->section.len, s->section.txt);
2230                         rv |= 2;
2231                 }
2232         }
2233
2234 ### Processing the input
2235
2236 As we need to append an extention to a filename and then open it for
2237 writing, and we need to do this twice, it helps to have a separate function.
2238
2239 ###### functions
2240
2241         static FILE *open_ext(char *base, char *ext)
2242         {
2243                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2244                 FILE *f;
2245                 strcat(strcpy(fn, base), ext);
2246                 f = fopen(fn, "w");
2247                 free(fn);
2248                 return f;
2249         }
2250
2251 If we can read the grammar, then we analyse and optionally report on it.  If
2252 the report finds conflicts we will exit with an error status.
2253
2254 ###### process input
2255         struct grammar *g = NULL;
2256         if (gram == NULL) {
2257                 fprintf(stderr, "No grammar section provided\n");
2258                 rv |= 2;
2259         } else {
2260                 g = grammar_read(gram);
2261                 if (!g) {
2262                         fprintf(stderr, "Failure to parse grammar\n");
2263                         rv |= 2;
2264                 }
2265         }
2266         if (g) {
2267                 grammar_analyse(g, type);
2268                 if (report)
2269                         if (grammar_report(g, type))
2270                                 rv |= 1;
2271         }
2272
2273 If a headers section is defined, we write it out and include a declaration
2274 for the `parse_XX` function so it can be used from separate code.
2275
2276         if (rv == 0 && hdr && outfile) {
2277                 FILE *f = open_ext(outfile, ".h");
2278                 if (f) {
2279                         code_node_print(f, hdr, infile);
2280                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2281                                 name);
2282                         fclose(f);
2283                 } else {
2284                         fprintf(stderr, "Cannot create %s.h\n",
2285                                 outfile);
2286                         rv |= 4;
2287                 }
2288         }
2289
2290 And if all goes well and an output file was provided, we create the `.c`
2291 file with the code section (if any) and the parser tables and function.
2292
2293         if (rv == 0 && outfile) {
2294                 FILE *f = open_ext(outfile, ".c");
2295                 if (f) {
2296                         if (code)
2297                                 code_node_print(f, code, infile);
2298                         gen_parser(f, g, infile, name);
2299                         fclose(f);
2300                 } else {
2301                         fprintf(stderr, "Cannot create %s.c\n",
2302                                 outfile);
2303                         rv |= 4;
2304                 }
2305         }
2306
2307 And that about wraps it up.  We need to set the locale so that UTF-8 is
2308 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2309
2310 ###### File: parsergen.mk
2311         parsergen : parsergen.o libscanner.o libmdcode.o
2312                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2313
2314 ###### includes
2315         #include <locale.h>
2316
2317 ###### main
2318
2319         int main(int argc, char *argv[])
2320         {
2321                 struct section *s;
2322                 int rv = 0;
2323
2324                 setlocale(LC_ALL,"");
2325
2326                 ## process arguments
2327                 ## load file
2328                 ## process input
2329
2330                 return rv;
2331         }
2332
2333 ## The SHIFT/REDUCE parser
2334
2335 Having analysed the grammar and generated all the tables, we only need the
2336 shift/reduce engine to bring it all together.
2337
2338 ### Goto table lookup
2339
2340 The parser generator has nicely provided us with goto tables sorted by
2341 symbol number.  We need a binary search function to find a symbol in the
2342 table.
2343
2344 ###### parser functions
2345
2346         static int search(const struct state *l, int sym)
2347         {
2348                 int lo = 0;
2349                 int hi = l->go_to_cnt;
2350
2351                 if (hi == 0)
2352                         return -1;
2353                 while (lo + 1 < hi) {
2354                         int mid = (lo + hi) / 2;
2355                         if (l->go_to[mid].sym <= sym)
2356                                 lo = mid;
2357                         else
2358                                 hi = mid;
2359                 }
2360                 if (l->go_to[lo].sym == sym)
2361                         return l->go_to[lo].state;
2362                 else
2363                         return -1;
2364         }
2365
2366 ### The state stack.
2367
2368 The core data structure for the parser is the stack.  This tracks all the
2369 symbols that have been recognised or partially recognised.
2370
2371 The stack usually won't grow very large - maybe a few tens of entries.  So
2372 we dynamically resize and array as required but never bother to shrink it
2373 down again.
2374
2375 We keep the stack as two separate allocations.  One, `asn_stack` stores the
2376 "abstract syntax nodes" which are created by each reduction.  When we call
2377 `do_reduce` we need to pass an array of the `asn`s of the body of the
2378 production, and by keeping a separate `asn` stack, we can just pass a
2379 pointer into this stack.
2380
2381 The other allocation stores all other stack fields of which there are six.
2382 The `state` is the most important one and guides the parsing process.  The
2383 `sym` is nearly unnecessary.  However when we want to free entries from the
2384 `asn_stack`, it helps to know what type they are so we can call the right
2385 freeing function.  The symbol leads us to the right free function through
2386 `do_free`.
2387
2388 The `indents` count and the `starts_indented` flag track the line
2389 indents in the symbol.  These are used to allow indent information to
2390 guide parsing and error recovery.
2391
2392 `newline_permitted` keeps track of whether newlines should be ignored
2393 or not, and `starts_line` records if this state stated on a newline.
2394
2395 As well as the stack of frames we have a `next` frame which is
2396 assembled from the incoming token and other information prior to
2397 pushing it onto the stack.
2398
2399 ###### parser functions
2400
2401         struct parser {
2402                 struct frame {
2403                         short state;
2404                         short sym;
2405                         short starts_indented;
2406                         short indents;
2407                         short newline_permitted;
2408                 } *stack, next;
2409                 void **asn_stack;
2410                 int stack_size;
2411                 int tos;
2412         };
2413
2414 #### Shift and pop
2415
2416 Two operations are needed on the stack - shift (which is like push) and pop.
2417
2418 Shift applies not only to terminals but also to non-terminals.  When we
2419 reduce a production we will pop off entries corresponding to the body
2420 symbols, then push on an item for the head of the production.  This last is
2421 exactly the same process as shifting in a terminal so we use the same
2422 function for both.
2423
2424 To simplify other code we arrange for `shift` to fail if there is no `goto`
2425 state for the symbol.  This is useful in basic parsing due to our design
2426 that we shift when we can, and reduce when we cannot.  So the `shift`
2427 function reports if it could.
2428
2429 So `shift` finds the next state.  If that succeed it extends the allocations
2430 if needed and pushes all the information onto the stacks.
2431
2432 ###### parser functions
2433
2434         static int shift(struct parser *p,
2435                          void *asn,
2436                          const struct state states[])
2437         {
2438                 // Push an entry onto the stack
2439                 int newstate = search(&states[p->next.state], p->next.sym);
2440                 if (newstate < 0)
2441                         return 0;
2442                 if (p->tos >= p->stack_size) {
2443                         p->stack_size += 10;
2444                         p->stack = realloc(p->stack, p->stack_size
2445                                            * sizeof(p->stack[0]));
2446                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2447                                            * sizeof(p->asn_stack[0]));
2448                 }
2449                 p->stack[p->tos] = p->next;
2450                 p->asn_stack[p->tos] = asn;
2451                 p->tos++;
2452                 p->next.state = newstate;
2453                 p->next.indents = 0;
2454                 p->next.starts_indented = 0;
2455                 // if new state doesn't start a line, we inherit newline_permitted status
2456                 if (states[newstate].starts_line)
2457                         p->next.newline_permitted = 1;
2458                 return 1;
2459         }
2460
2461 `pop` simply moves the top of stack (`tos`) back down the required amount
2462 and frees any `asn` entries that need to be freed.  It is called _after_ we
2463 reduce a production, just before we `shift` the nonterminal in.
2464
2465 ###### parser functions
2466
2467         static void pop(struct parser *p, int num,
2468                         void(*do_free)(short sym, void *asn))
2469         {
2470                 int i;
2471                 p->tos -= num;
2472                 for (i = 0; i < num; i++) {
2473                         p->next.indents += p->stack[p->tos+i].indents;
2474                         do_free(p->stack[p->tos+i].sym,
2475                                 p->asn_stack[p->tos+i]);
2476                 }
2477
2478                 if (num) {
2479                         p->next.state = p->stack[p->tos].state;
2480                         p->next.starts_indented = p->stack[p->tos].starts_indented;
2481                         p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2482                         if (p->next.indents > p->next.starts_indented)
2483                                 p->next.newline_permitted = 0;
2484                 }
2485         }
2486
2487 ### Memory allocation
2488
2489 The `scanner` returns tokens in a local variable - we want them in allocated
2490 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2491 a reduce is in a large buffer.  Both of these require some allocation and
2492 copying, hence `memdup` and `tokcopy`.
2493
2494 ###### parser includes
2495         #include <memory.h>
2496
2497 ###### parser functions
2498
2499         void *memdup(void *m, int len)
2500         {
2501                 void *ret;
2502                 if (len == 0)
2503                         return NULL;
2504                 ret = malloc(len);
2505                 memcpy(ret, m, len);
2506                 return ret;
2507         }
2508
2509         static struct token *tok_copy(struct token tk)
2510         {
2511                 struct token *new = malloc(sizeof(*new));
2512                 *new = tk;
2513                 return new;
2514         }
2515
2516 ### The heart of the parser.
2517
2518 Now we have the parser.  If we can shift, we do.  If not and we can reduce
2519 we do.  If the production we reduced was production zero, then we have
2520 accepted the input and can finish.
2521
2522 We return whatever `asn` was returned by reducing production zero.
2523
2524 If we can neither shift nor reduce we have an error to handle.  We pop
2525 single entries off the stack until we can shift the `TK_error` symbol, then
2526 drop input tokens until we find one we can shift into the new error state.
2527
2528 When we find `TK_in` and `TK_out` tokens which report indents we need
2529 to handle them directly as the grammar cannot express what we want to
2530 do with them.
2531
2532 `TK_in` tokens are easy: we simply update the `next` stack frame to
2533 record how many indents there are and that the next token started with
2534 an indent.
2535
2536 `TK_out` tokens must either be counted off against any pending indent,
2537 or must force reductions until there is a pending indent which isn't
2538 at the start of a production.
2539
2540 `TK_newline` tokens are ignored precisely if there has been an indent
2541 since the last state which could have been at the start of a line.
2542
2543 ###### parser includes
2544         #include "parser.h"
2545 ###### parser_run
2546         void *parser_run(struct token_state *tokens,
2547                          const struct state states[],
2548                          int (*do_reduce)(int, void**, void*),
2549                          void (*do_free)(short, void*),
2550                          FILE *trace, const char *non_term[], int knowns)
2551         {
2552                 struct parser p = { 0 };
2553                 struct token *tk = NULL;
2554                 int accepted = 0;
2555                 void *ret;
2556
2557                 p.next.newline_permitted = states[0].starts_line;
2558                 while (!accepted) {
2559                         struct token *err_tk;
2560                         if (!tk)
2561                                 tk = tok_copy(token_next(tokens));
2562                         p.next.sym = tk->num;
2563                         if (trace)
2564                                 parser_trace(trace, &p, tk, states, non_term, knowns);
2565
2566                         if (p.next.sym == TK_in) {
2567                                 p.next.starts_indented = 1;
2568                                 p.next.indents = 1;
2569                                 free(tk);
2570                                 tk = NULL;
2571                                 continue;
2572                         }
2573                         if (p.next.sym == TK_out) {
2574                                 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2575                                     (p.stack[p.tos-1].indents == 1 &&
2576                                      states[p.next.state].reduce_size > 1)) {
2577                                         p.stack[p.tos-1].indents -= 1;
2578                                         if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2579                                                 // no internal indent any more, reassess 'newline_permitted'
2580                                                 if (states[p.stack[p.tos-1].state].starts_line)
2581                                                         p.stack[p.tos-1].newline_permitted = 1;
2582                                                 else if (p.tos > 1)
2583                                                         p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2584                                         }
2585                                         free(tk);
2586                                         tk = NULL;
2587                                         continue;
2588                                 }
2589                                 // fall through and force a REDUCE (as 'shift'
2590                                 // will fail).
2591                         }
2592                         if (p.next.sym == TK_newline) {
2593                                 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2594                                         free(tk);
2595                                         tk = NULL;
2596                                         continue;
2597                                 }
2598                         }
2599                         if (shift(&p, tk, states)) {
2600                                 tk = NULL;
2601                                 continue;
2602                         }
2603                         if (states[p.next.state].reduce_prod >= 0) {
2604                                 void **body;
2605                                 int prod = states[p.next.state].reduce_prod;
2606                                 int size = states[p.next.state].reduce_size;
2607                                 int bufsize;
2608                                 static char buf[16*1024];
2609                                 p.next.sym = states[p.next.state].reduce_sym;
2610
2611                                 body = p.asn_stack +
2612                                         (p.tos - states[p.next.state].reduce_size);
2613
2614                                 bufsize = do_reduce(prod, body, buf);
2615
2616                                 pop(&p, size, do_free);
2617                                 shift(&p, memdup(buf, bufsize), states);
2618                                 if (prod == 0)
2619                                         accepted = 1;
2620                                 continue;
2621                         }
2622                         if (tk->num == TK_out) {
2623                                 // Indent problem - synthesise tokens to get us
2624                                 // out of here.
2625                                 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2626                                 p.next.sym = states[p.next.state].shift_sym;
2627                                 shift(&p, tok_copy(*tk), states);
2628                                 // FIXME need to report this error somehow
2629                                 continue;
2630                         }
2631                         /* Error. We walk up the stack until we
2632                          * find a state which will accept TK_error.
2633                          * We then shift in TK_error and see what state
2634                          * that takes us too.
2635                          * Then we discard input tokens until
2636                          * we find one that is acceptable.
2637                          */
2638
2639                         err_tk = tok_copy(*tk);
2640                         p.next.sym = TK_error;
2641                         while (shift(&p, err_tk, states) == 0
2642                                && p.tos > 0)
2643                                 // discard this state
2644                                 pop(&p, 1, do_free);
2645                         if (p.tos == 0) {
2646                                 free(err_tk);
2647                                 // no state accepted TK_error
2648                                 break;
2649                         }
2650                         while (search(&states[p.next.state], tk->num) < 0 &&
2651                                tk->num != TK_eof) {
2652                                 free(tk);
2653                                 tk = tok_copy(token_next(tokens));
2654                                 if (tk->num == TK_in)
2655                                         p.next.indents += 1;
2656                                 if (tk->num == TK_out) {
2657                                         if (p.next.indents == 0)
2658                                                 break;
2659                                         p.next.indents -= 1;
2660                                 }
2661                         }
2662                         if (p.tos == 0 && tk->num == TK_eof)
2663                                 break;
2664                 }
2665                 free(tk);
2666                 if (accepted)
2667                         ret = p.asn_stack[0];
2668                 else
2669                         pop(&p, p.tos, do_free);
2670                 free(p.asn_stack);
2671                 free(p.stack);
2672                 return ret;
2673         }
2674
2675 ###### exported functions
2676         void *parser_run(struct token_state *tokens,
2677                          const struct state states[],
2678                          int (*do_reduce)(int, void**, void*),
2679                          void (*do_free)(short, void*),
2680                          FILE *trace, const char *non_term[], int knowns);
2681
2682 ### Tracing
2683
2684 Being able to visualize the parser in action can be invaluable when
2685 debugging the parser code, or trying to understand how the parse of a
2686 particular grammar progresses.  The stack contains all the important
2687 state, so just printing out the stack every time around the parse loop
2688 can make it possible to see exactly what is happening.
2689
2690 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2691 are easily deduced from the change between consecutive lines, and the
2692 details of each state can be found by cross referencing the states list
2693 in the stack with the "report" that parsergen can generate.
2694
2695 For terminal symbols, we just dump the token.  For non-terminals we
2696 print the name of the symbol.  The look ahead token is reported at the
2697 end inside square brackets.
2698
2699 ###### parser functions
2700
2701         static char *reserved_words[] = {
2702                 [TK_error]        = "ERROR",
2703                 [TK_in]           = "IN",
2704                 [TK_out]          = "OUT",
2705                 [TK_newline]      = "NEWLINE",
2706                 [TK_eof]          = "$eof",
2707         };
2708         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2709         {
2710                 fprintf(trace, "(%d", f->state);
2711                 if (f->indents)
2712                         fprintf(trace, "%c%d", f->starts_indented?':':'.',
2713                                 f->indents);
2714                 if (states[f->state].starts_line)
2715                         fprintf(trace, "s");
2716                 if (f->newline_permitted)
2717                         fprintf(trace, "n");
2718                 fprintf(trace, ") ");
2719         }
2720
2721         void parser_trace(FILE *trace, struct parser *p,
2722                           struct token *tk, const struct state states[],
2723                           const char *non_term[], int knowns)
2724         {
2725                 int i;
2726                 for (i = 0; i < p->tos; i++) {
2727                         int sym = p->stack[i].sym;
2728                         parser_trace_state(trace, &p->stack[i], states);
2729                         if (sym < TK_reserved &&
2730                             reserved_words[sym] != NULL)
2731                                 fputs(reserved_words[sym], trace);
2732                         else if (sym < TK_reserved + knowns) {
2733                                 struct token *t = p->asn_stack[i];
2734                                 text_dump(trace, t->txt, 20);
2735                         } else
2736                                 fputs(non_term[sym - TK_reserved - knowns],
2737                                       trace);
2738                         fputs(" ", trace);
2739                 }
2740                 parser_trace_state(trace, &p->next, states);
2741                 fprintf(trace, " [");
2742                 if (tk->num < TK_reserved &&
2743                     reserved_words[tk->num] != NULL)
2744                         fputs(reserved_words[tk->num], trace);
2745                 else
2746                         text_dump(trace, tk->txt, 20);
2747                 fputs("]\n", trace);
2748         }
2749
2750 # A Worked Example
2751
2752 The obvious example for a parser is a calculator.
2753
2754 As `scanner` provides number parsing function using `libgmp` is it not much
2755 work to perform arbitrary rational number calculations.
2756
2757 This calculator takes one expression, or an equality test per line.  The
2758 results are printed and if any equality test fails, the program exits with
2759 an error.
2760
2761 ###### File: parsergen.mk
2762         calc.c calc.h : parsergen parsergen.mdc
2763                 ./parsergen --tag calc -o calc parsergen.mdc
2764         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2765                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2766
2767 # calc: header
2768
2769         #include "number.h"
2770         // what do we use for a demo-grammar?  A calculator of course.
2771         struct number {
2772                 mpq_t val;
2773                 char tail[2];
2774                 int err;
2775         };
2776
2777 # calc: code
2778
2779         #include <stdlib.h>
2780         #include <unistd.h>
2781         #include <fcntl.h>
2782         #include <sys/mman.h>
2783         #include <stdio.h>
2784         #include <malloc.h>
2785         #include <gmp.h>
2786         #include "mdcode.h"
2787         #include "scanner.h"
2788         #include "number.h"
2789         #include "parser.h"
2790
2791         #include "calc.h"
2792
2793         static void free_number(struct number *n)
2794         {
2795                 mpq_clear(n->val);
2796                 free(n);
2797         }
2798
2799         int main(int argc, char *argv[])
2800         {
2801                 int fd = open(argv[1], O_RDONLY);
2802                 int len = lseek(fd, 0, 2);
2803                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2804                 struct section *s = code_extract(file, file+len, NULL);
2805                 struct token_config config = {
2806                         .ignored = (1 << TK_line_comment)
2807                                  | (1 << TK_block_comment)
2808                                  | (1 << TK_in)
2809                                  | (1 << TK_out),
2810                         .number_chars = ".,_+-",
2811                         .word_start = "",
2812                         .word_cont = "",
2813                 };
2814                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2815                 exit(0);
2816         }
2817
2818 # calc: grammar
2819
2820         Session -> Session Line
2821                 | Line
2822
2823         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2824                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2825                                         gmp_printf("  or as a decimal: %Fg\n", fl);
2826                                         mpf_clear(fl);
2827                                         }
2828                                      }$
2829                 | Expression = Expression NEWLINE ${
2830                         if (mpq_equal($1.val, $3.val))
2831                                 gmp_printf("Both equal %Qd\n", $1.val);
2832                         else {
2833                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
2834                                         $1.val, $3.val);
2835                                 exit(1);
2836                         }
2837                 }$
2838                 | NEWLINE ${ printf("Blank line\n"); }$
2839                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2840
2841         $number
2842         Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2843                 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2844                 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2845
2846         Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2847                 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2848                 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2849
2850         Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2851                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$