1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h` files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
61 If the `--tag` option is given, then any top level header that doesn't
62 start with the tag is ignored, and the tag is stripped from the rest. So
64 means that the three needed sections must be `Foo: header`, `Foo: code`,
68 [scanner]: scanner.html
74 ###### parser includes
78 The grammar contains production sets, precedence/associativity
79 declarations, and data type declarations. These are all parsed with
80 _ad hoc_ parsing as we don't have a parser generator yet.
82 The precedence and associativity can be set for each production, but
83 can be inherited from symbols. The data type is potentially defined
84 for each non-terminal and describes what C structure is used to store
85 information about each symbol.
88 enum assoc {Left, Right, Non};
89 char *assoc_names[] = {"Left","Right","Non"};
92 struct text struct_name;
94 unsigned short precedence;
98 unsigned short precedence;
106 The strings reported by `mdcode` and `scanner` are `struct text` which have
107 length rather than being null terminated. To help with printing and
108 comparing we define `text_is` and `prtxt`, which should possibly go in
109 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
110 which might contain control characters.
112 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
113 because that is what we need to detect tags.
116 static int text_is(struct text t, char *s)
118 return (strlen(s) == t.len &&
119 strncmp(s, t.txt, t.len) == 0);
121 static void prtxt(struct text t)
123 printf("%.*s", t.len, t.txt);
126 static int strip_tag(struct text *t, char *tag)
128 int skip = strlen(tag) + 1;
129 if (skip >= t->len ||
130 strncmp(t->txt, tag, skip-1) != 0 ||
131 t->txt[skip-1] != ':')
133 while (skip < t->len && t->txt[skip] == ' ')
142 Productions are comprised primarily of symbols - terminal and
143 non-terminal. We do not make a syntactic distinction between the two,
144 though non-terminals must be identifiers. Non-terminal symbols are
145 those which appear in the head of a production, terminal symbols are
146 those which don't. There are also "virtual" symbols used for precedence
147 marking discussed later, and sometimes we won't know what type a symbol
150 ###### forward declarations
151 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
152 char *symtypes = "UVTN";
156 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
157 table of known symbols and the resulting parser will report them as
158 `TK_reserved + N`. A small set of identifiers are reserved for the
159 different token types that `scanner` can report.
162 static char *reserved_words[] = {
163 [TK_error] = "ERROR",
164 [TK_number] = "NUMBER",
165 [TK_ident] = "IDENTIFIER",
167 [TK_string] = "STRING",
168 [TK_multi_string] = "MULTI_STRING",
171 [TK_newline] = "NEWLINE",
177 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
178 recognised. The former is automatically expected at the end of the text
179 being parsed. The latter are always ignored by the parser.
181 All of these symbols are stored in a simple symbol table. We use the
182 `struct text` from `mdcode` to store the name and link them together
183 into a sorted list using an insertion sort.
185 We don't have separate `find` and `insert` functions as any symbol we
186 find needs to be remembered. We simply expect `find` to always return a
187 symbol, but its type might be `Unknown`.
196 ###### grammar fields
201 static int text_cmp(struct text a, struct text b)
206 int cmp = strncmp(a.txt, b.txt, len);
210 return a.len - b.len;
213 static struct symbol *sym_find(struct grammar *g, struct text s)
215 struct symbol **l = &g->syms;
220 (cmp = text_cmp((*l)->name, s)) < 0)
224 n = calloc(1, sizeof(*n));
233 static void symbols_init(struct grammar *g)
235 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
237 for (i = 0; i < entries; i++) {
240 t.txt = reserved_words[i];
243 t.len = strlen(t.txt);
250 ### Data types and precedence.
252 Data type specification and precedence specification are both
253 introduced by a dollar sign at the start of the line. If the next
254 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
255 precedence, otherwise it specifies a data type.
257 The data type name is simply stored and applied to the head of all
258 subsequent productions. It must be the name of a structure, so `$expr`
259 maps to `struct expr`.
261 Any productions given before the first data type will have no data type
262 and can carry no information. In order to allow other non-terminals to
263 have no type, the data type `$void` can be given. This does *not* mean
264 that `struct void` will be used, but rather that no type will be
265 associated with future non-terminals.
267 The precedence line must contain a list of symbols - typically
268 terminal symbols, but not necessarily. It can only contain symbols
269 that have not been seen yet, so precedence declaration must precede
270 the productions that they affect.
272 A precedence line may also contain a "virtual" symbol which is an
273 identifier preceded by `$$`. Virtual terminals carry precedence
274 information but are not included in the grammar. A production can
275 declare that it inherits the precedence of a given virtual symbol.
277 This use for `$$` precludes it from being used as a symbol in the
278 described language. Two other symbols: `${` and `}$` are also
281 Each new precedence line introduces a new precedence level and
282 declares how it associates. This level is stored in each symbol
283 listed and may be inherited by any production which uses the symbol. A
284 production inherits from the last symbol which has a precedence.
286 ###### grammar fields
287 struct text current_type;
291 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
292 static const char *known[] = { "$$", "${", "}$" };
295 static char *dollar_line(struct token_state *ts, struct grammar *g)
297 struct token t = token_next(ts);
302 if (t.num != TK_ident) {
303 err = "type or assoc expected after '$'";
306 if (text_is(t.txt, "LEFT"))
308 else if (text_is(t.txt, "RIGHT"))
310 else if (text_is(t.txt, "NON"))
313 g->current_type = t.txt;
314 if (text_is(t.txt, "void"))
315 g->current_type.txt = NULL;
317 if (t.num != TK_newline) {
318 err = "Extra tokens after type name";
324 // This is a precedence line, need some symbols.
328 while (t.num != TK_newline) {
329 enum symtype type = Terminal;
331 if (t.num == TK_virtual) {
334 if (t.num != TK_ident) {
335 err = "$$ must be followed by a word";
338 } else if (t.num != TK_ident &&
340 err = "Illegal token in precedence line";
343 s = sym_find(g, t.txt);
344 if (s->type != Unknown) {
345 err = "Symbols in precedence line must not already be known.";
349 s->precedence = g->prec_levels;
354 err = "No symbols given on precedence line";
358 while (t.num != TK_newline && t.num != TK_eof)
365 A production either starts with an identifier which is the head
366 non-terminal, or a vertical bar (`|`) in which case this production
367 uses the same head as the previous one. The identifier must be
368 followed by a `->` mark. All productions for a given non-terminal must
369 be grouped together with the `nonterminal ->` given only once.
371 After this a (possibly empty) sequence of identifiers and marks form
372 the body of the production. A virtual symbol may be given after the
373 body by preceding it with `$$`. If a virtual symbol is given, the
374 precedence of the production is that for the virtual symbol. If none
375 is given, the precedence is inherited from the last symbol in the
376 production which has a precedence specified.
378 After the optional precedence may come the `${` mark. This indicates
379 the start of a code fragment. If present, this must be on the same
380 line as the start of the production.
382 All of the text from the `${` through to the matching `}$` is
383 collected and forms the code-fragment for the production. It must all
384 be in one `code_node` of the literate code. The `}$` must be
385 at the end of a line.
387 Text in the code fragment will undergo substitutions where `$N` for
388 some numeric `N` will be replaced with a variable holding the parse
389 information for the particular symbol in the production. `$0` is the
390 head of the production, `$1` is the first symbol of the body, etc.
391 The type of `$N` for a terminal symbol is `struct token`. For
392 a non-terminal, it is whatever has been declared for that symbol.
394 While building productions we will need to add to an array which needs to
398 static void array_add(void *varray, int *cnt, void *new)
400 void ***array = varray;
403 current = ((*cnt-1) | (step-1))+1;
404 if (*cnt == current) {
407 *array = realloc(*array, current * sizeof(void*));
409 (*array)[*cnt] = new;
413 Collecting the code fragment simply involves reading tokens until we
414 hit the end token `}$`, and noting the character position of the start and
418 static struct text collect_code(struct token_state *state,
423 code.txt = start.txt.txt + start.txt.len;
425 t = token_next(state);
426 while (t.node == start.node &&
427 t.num != TK_close && t.num != TK_error &&
429 if (t.num == TK_close && t.node == start.node)
430 code.len = t.txt.txt - code.txt;
436 Now we have all the bits we need to parse a full production.
441 ###### grammar fields
442 struct production **productions;
443 int production_count;
445 ###### production fields
447 struct symbol **body;
452 int first_production;
455 static char *parse_production(struct grammar *g,
457 struct token_state *state)
459 /* Head has already been parsed. */
462 struct production p, *pp;
464 memset(&p, 0, sizeof(p));
466 tk = token_next(state);
467 while (tk.num == TK_ident || tk.num == TK_mark) {
468 struct symbol *bs = sym_find(g, tk.txt);
469 if (bs->type == Unknown)
471 if (bs->type == Virtual) {
472 err = "Virtual symbol not permitted in production";
475 if (bs->precedence) {
476 p.precedence = bs->precedence;
479 array_add(&p.body, &p.body_size, bs);
480 tk = token_next(state);
482 if (tk.num == TK_virtual) {
484 tk = token_next(state);
485 if (tk.num != TK_ident) {
486 err = "word required after $$";
489 vs = sym_find(g, tk.txt);
490 if (vs->type != Virtual) {
491 err = "symbol after $$ must be virtual";
494 p.precedence = vs->precedence;
496 tk = token_next(state);
498 if (tk.num == TK_open) {
499 p.code = collect_code(state, tk);
500 if (p.code.txt == NULL) {
501 err = "code fragment not closed properly";
504 tk = token_next(state);
506 if (tk.num != TK_newline && tk.num != TK_eof) {
507 err = "stray tokens at end of line";
510 pp = malloc(sizeof(*pp));
512 array_add(&g->productions, &g->production_count, pp);
515 while (tk.num != TK_newline && tk.num != TK_eof)
516 tk = token_next(state);
520 With the ability to parse production and dollar-lines, we have nearly all
521 that we need to parse a grammar from a `code_node`.
523 The head of the first production will effectively be the `start` symbol of
524 the grammar. However it won't _actually_ be so. Processing the grammar is
525 greatly simplified if the real start symbol only has a single production,
526 and expects `$eof` as the final terminal. So when we find the first
527 explicit production we insert an extra production as production zero which
530 ###### Example: production 0
533 where `START` is the first non-terminal given.
535 ###### create production zero
536 struct production *p = calloc(1,sizeof(*p));
537 struct text start = {"$start",6};
538 struct text eof = {"$eof",4};
539 p->head = sym_find(g, start);
540 p->head->type = Nonterminal;
541 array_add(&p->body, &p->body_size, head);
542 array_add(&p->body, &p->body_size, sym_find(g, eof));
543 g->start = p->head->num;
544 p->head->first_production = g->production_count;
545 array_add(&g->productions, &g->production_count, p);
547 Now we are ready to read in the grammar.
549 ###### grammar fields
550 int start; // the 'start' symbol.
553 static struct grammar *grammar_read(struct code_node *code)
555 struct token_config conf = {
558 .words_marks = known,
559 .known_count = sizeof(known)/sizeof(known[0]),
561 .ignored = (1 << TK_line_comment)
562 | (1 << TK_block_comment)
565 | (1 << TK_multi_string)
570 struct token_state *state = token_open(code, &conf);
572 struct symbol *head = NULL;
576 g = calloc(1, sizeof(*g));
579 for (tk = token_next(state); tk.num != TK_eof;
580 tk = token_next(state)) {
581 if (tk.num == TK_newline)
583 if (tk.num == TK_ident) {
585 head = sym_find(g, tk.txt);
586 if (head->type == Nonterminal)
587 err = "This non-terminal has already be used.";
588 else if (head->type == Virtual)
589 err = "Virtual symbol not permitted in head of production";
591 head->type = Nonterminal;
592 head->struct_name = g->current_type;
594 ## create production zero
596 head->first_production = g->production_count;
597 tk = token_next(state);
598 if (tk.num == TK_mark &&
599 text_is(tk.txt, "->"))
600 err = parse_production(g, head, state);
602 err = "'->' missing in production";
604 } else if (tk.num == TK_mark
605 && text_is(tk.txt, "|")) {
606 // another production for same non-term
608 err = parse_production(g, head, state);
610 err = "First production must have a head";
611 } else if (tk.num == TK_mark
612 && text_is(tk.txt, "$")) {
613 err = dollar_line(state, g);
615 err = "Unrecognised token at start of line.";
623 fprintf(stderr, "Error at line %d: %s\n",
630 ## Analysing the grammar
632 The central task in analysing the grammar is to determine a set of
633 states to drive the parsing process. This will require finding
634 various sets of symbols and of "items". Some of these sets will need
635 to attach information to each element in the set, so they are more
638 Each "item" may have a 'look-ahead' or `LA` set associated with
639 it. Many of these will be the same as each other. To avoid memory
640 wastage, and to simplify some comparisons of sets, these sets will be
641 stored in a list of unique sets, each assigned a number.
643 Once we have the data structures in hand to manage these sets and
644 lists, we can start setting the 'nullable' flag, build the 'FIRST'
645 sets, and then create the item sets which define the various states.
649 Though we don't only store symbols in these sets, they are the main
650 things we store, so they are called symbol sets or "symsets".
652 A symset has a size, an array of shorts, and an optional array of data
653 values, which are also shorts. If the array of data is not present,
654 we store an impossible pointer, as `NULL` is used to indicate that no
655 memory has been allocated yet.
660 unsigned short *syms, *data;
662 #define NO_DATA ((unsigned short *)1)
663 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
664 const struct symset INIT_DATASET = { 0, NULL, NULL };
666 The arrays of shorts are allocated in blocks of 8 and are kept sorted
667 by using an insertion sort. We don't explicitly record the amount of
668 allocated space as it can be derived directly from the current `cnt` using
669 `((cnt - 1) | 7) + 1`.
672 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
675 int current = ((s->cnt-1) | 7) + 1;
676 if (current == s->cnt) {
678 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
679 if (s->data != NO_DATA)
680 s->data = realloc(s->data, sizeof(*s->data) * current);
683 while (i > 0 && s->syms[i-1] > key) {
684 s->syms[i] = s->syms[i-1];
685 if (s->data != NO_DATA)
686 s->data[i] = s->data[i-1];
690 if (s->data != NO_DATA)
695 Finding a symbol (or item) in a `symset` uses a simple binary search.
696 We return the index where the value was found (so data can be accessed),
697 or `-1` to indicate failure.
699 static int symset_find(struct symset *ss, unsigned short key)
706 while (lo + 1 < hi) {
707 int mid = (lo + hi) / 2;
708 if (ss->syms[mid] <= key)
713 if (ss->syms[lo] == key)
718 We will often want to form the union of two symsets. When we do, we
719 will often want to know if anything changed (as that might mean there
720 is more work to do). So `symset_union` reports whether anything was
721 added to the first set. We use a slow quadratic approach as these
722 sets don't really get very big. If profiling shows this to be a problem it
723 can be optimised later.
725 static int symset_union(struct symset *a, struct symset *b)
729 for (i = 0; i < b->cnt; i++)
730 if (symset_find(a, b->syms[i]) < 0) {
731 unsigned short data = 0;
732 if (b->data != NO_DATA)
734 symset_add(a, b->syms[i], data);
740 And of course we must be able to free a symset.
742 static void symset_free(struct symset ss)
745 if (ss.data != NO_DATA)
751 Some symsets are simply stored somewhere appropriate in the `grammar`
752 data structure, others need a bit of indirection. This applies
753 particularly to "LA" sets. These will be paired with "items" in an
754 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
755 stored in the `data` field, so we need a mapping from a small (short)
756 number to an LA `symset`.
758 As mentioned earlier this involves creating a list of unique symsets.
760 For now, we just use a linear list sorted by insertion. A skiplist
761 would be more efficient and may be added later.
768 struct setlist *next;
771 ###### grammar fields
772 struct setlist *sets;
777 static int ss_cmp(struct symset a, struct symset b)
780 int diff = a.cnt - b.cnt;
783 for (i = 0; i < a.cnt; i++) {
784 diff = (int)a.syms[i] - (int)b.syms[i];
791 static int save_set(struct grammar *g, struct symset ss)
793 struct setlist **sl = &g->sets;
797 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
804 s = malloc(sizeof(*s));
813 Finding a set by number is currently performed by a simple linear search.
814 If this turns out to hurt performance, we can store the sets in a dynamic
815 array like the productions.
817 static struct symset set_find(struct grammar *g, int num)
819 struct setlist *sl = g->sets;
820 while (sl && sl->num != num)
826 ### Setting `nullable`
828 We set `nullable` on the head symbol for any production for which all
829 the body symbols (if any) are nullable. As this is a recursive
830 definition, any change in the `nullable` setting means that we need to
831 re-evaluate where it needs to be set.
833 We simply loop around performing the same calculations until no more
840 static void set_nullable(struct grammar *g)
843 while (check_again) {
846 for (p = 0; p < g->production_count; p++) {
847 struct production *pr = g->productions[p];
850 if (pr->head->nullable)
852 for (s = 0; s < pr->body_size; s++)
853 if (! pr->body[s]->nullable)
855 if (s == pr->body_size) {
856 pr->head->nullable = 1;
863 ### Setting `can_eol`
865 In order to be able to ignore newline tokens when not relevant, but
866 still include them in the parse when needed, we will need to know
867 which states can start a "line-like" section of code. We ignore
868 newlines when there is an indent since the most recent start of a
871 To know what is line-like, we first need to know which symbols can end
872 a line-like section, which is precisely those which can end with a
873 newline token. These symbols don't necessarily always end with a
874 newline, but they can. Hence they are not described as "lines" but
877 Clearly the `TK_newline` token can end with a newline. Any symbol
878 which is the head of a production that contains a line-ending symbol
879 followed only by nullable symbols is also a line-ending symbol. We
880 use a new field `can_eol` to record this attribute of symbols, and
881 compute it in a repetitive manner similar to `set_nullable`.
887 static void set_can_eol(struct grammar *g)
890 g->symtab[TK_newline]->can_eol = 1;
891 while (check_again) {
894 for (p = 0; p < g->production_count; p++) {
895 struct production *pr = g->productions[p];
898 if (pr->head->can_eol)
901 for (s = pr->body_size - 1; s >= 0; s--) {
902 if (pr->body[s]->can_eol) {
903 pr->head->can_eol = 1;
907 if (!pr->body[s]->nullable)
914 ### Building the `first` sets
916 When calculating what can follow a particular non-terminal, we will need to
917 know what the "first" terminal in any subsequent non-terminal might be. So
918 we calculate the `first` set for every non-terminal and store them in an
919 array. We don't bother recording the "first" set for terminals as they are
922 As the "first" for one symbol might depend on the "first" of another,
923 we repeat the calculations until no changes happen, just like with
924 `set_nullable`. This makes use of the fact that `symset_union`
925 reports if any change happens.
927 The core of this which finds the "first" of part of a production body
928 will be reused for computing the "follow" sets, so we split it out
929 into a separate function.
931 ###### grammar fields
932 struct symset *first;
936 static int add_first(struct production *pr, int start,
937 struct symset *target, struct grammar *g,
942 for (s = start; s < pr->body_size; s++) {
943 struct symbol *bs = pr->body[s];
944 if (bs->type == Terminal) {
945 if (symset_find(target, bs->num) < 0) {
946 symset_add(target, bs->num, 0);
950 } else if (symset_union(target, &g->first[bs->num]))
956 *to_end = (s == pr->body_size);
960 static void build_first(struct grammar *g)
964 g->first = calloc(g->num_syms, sizeof(g->first[0]));
965 for (s = 0; s < g->num_syms; s++)
966 g->first[s] = INIT_SYMSET;
968 while (check_again) {
971 for (p = 0; p < g->production_count; p++) {
972 struct production *pr = g->productions[p];
973 struct symset *head = &g->first[pr->head->num];
975 if (add_first(pr, 0, head, g, NULL))
981 ### Building the `follow` sets.
983 There are two different situations when we will want to generate "follow"
984 sets. If we are doing an SLR analysis, we want to generate a single
985 "follow" set for each non-terminal in the grammar. That is what is
986 happening here. If we are doing an LALR or LR analysis we will want
987 to generate a separate "LA" set for each item. We do that later
990 There are two parts to generating a "follow" set. Firstly we look at
991 every place that any non-terminal appears in the body of any
992 production, and we find the set of possible "first" symbols after
993 there. This is done using `add_first` above and only needs to be done
994 once as the "first" sets are now stable and will not change.
998 for (p = 0; p < g->production_count; p++) {
999 struct production *pr = g->productions[p];
1002 for (b = 0; b < pr->body_size - 1; b++) {
1003 struct symbol *bs = pr->body[b];
1004 if (bs->type == Terminal)
1006 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1010 The second part is to add the "follow" set of the head of a production
1011 to the "follow" sets of the final symbol in the production, and any
1012 other symbol which is followed only by `nullable` symbols. As this
1013 depends on "follow" itself we need to repeatedly perform the process
1014 until no further changes happen.
1018 for (again = 0, p = 0;
1019 p < g->production_count;
1020 p < g->production_count-1
1021 ? p++ : again ? (again = 0, p = 0)
1023 struct production *pr = g->productions[p];
1026 for (b = pr->body_size - 1; b >= 0; b--) {
1027 struct symbol *bs = pr->body[b];
1028 if (bs->type == Terminal)
1030 if (symset_union(&g->follow[bs->num],
1031 &g->follow[pr->head->num]))
1038 We now just need to create and initialise the `follow` list to get a
1041 ###### grammar fields
1042 struct symset *follow;
1045 static void build_follow(struct grammar *g)
1048 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1049 for (s = 0; s < g->num_syms; s++)
1050 g->follow[s] = INIT_SYMSET;
1054 ### Building itemsets and states
1056 There are three different levels of detail that can be chosen for
1057 building the itemsets and states for the LR grammar. They are:
1059 1. LR(0) or SLR(1), where no look-ahead is considered.
1060 2. LALR(1) where we build look-ahead sets with each item and merge
1061 the LA sets when we find two paths to the same "kernel" set of items.
1062 3. LR(1) where different look-ahead for any item in the set means
1063 a different state must be created.
1065 ###### forward declarations
1066 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1068 We need to be able to look through existing states to see if a newly
1069 generated state already exists. For now we use a simple sorted linked
1072 An item is a pair of numbers: the production number and the position of
1073 "DOT", which is an index into the body of the production.
1074 As the numbers are not enormous we can combine them into a single "short"
1075 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1076 production number (so 4000 productions with maximum of 15 symbols in the
1079 Comparing the itemsets will be a little different to comparing symsets
1080 as we want to do the lookup after generating the "kernel" of an
1081 itemset, so we need to ignore the offset=zero items which are added during
1084 To facilitate this, we modify the "DOT" number so that "0" sorts to
1085 the end of the list in the symset, and then only compare items before
1089 static inline unsigned short item_num(int production, int index)
1091 return production | ((31-index) << 11);
1093 static inline int item_prod(unsigned short item)
1095 return item & 0x7ff;
1097 static inline int item_index(unsigned short item)
1099 return (31-(item >> 11)) & 0x1f;
1102 For LR(1) analysis we need to compare not just the itemset in a state
1103 but also the LA sets. As we assign each unique LA set a number, we
1104 can just compare the symset and the data values together.
1107 static int itemset_cmp(struct symset a, struct symset b,
1108 enum grammar_type type)
1114 i < a.cnt && i < b.cnt &&
1115 item_index(a.syms[i]) > 0 &&
1116 item_index(b.syms[i]) > 0;
1118 int diff = a.syms[i] - b.syms[i];
1122 diff = a.data[i] - b.data[i];
1127 if (i == a.cnt || item_index(a.syms[i]) == 0)
1131 if (i == b.cnt || item_index(b.syms[i]) == 0)
1137 if (type < LR1 || av == -1)
1140 a.data[i] - b.data[i];
1143 And now we can build the list of itemsets. The lookup routine returns
1144 both a success flag and a pointer to where in the list an insert
1145 should happen, so we don't need to search a second time.
1149 struct itemset *next;
1151 struct symset items;
1152 struct symset go_to;
1157 ###### grammar fields
1158 struct itemset *items;
1162 static int itemset_find(struct grammar *g, struct itemset ***where,
1163 struct symset kernel, enum grammar_type type)
1165 struct itemset **ip;
1167 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1168 struct itemset *i = *ip;
1170 diff = itemset_cmp(i->items, kernel, type);
1183 Adding an itemset may require merging the LA sets if LALR analysis is
1184 happening. If any new LA set adds symbols that weren't in the old LA set, we
1185 clear the `completed` flag so that the dependants of this itemset will be
1186 recalculated and their LA sets updated.
1188 `add_itemset` must consume the symsets it is passed, either by adding
1189 them to a data structure, or freeing them.
1191 static int add_itemset(struct grammar *g, struct symset ss,
1192 enum grammar_type type, int starts_line)
1194 struct itemset **where, *is;
1196 int found = itemset_find(g, &where, ss, type);
1198 is = calloc(1, sizeof(*is));
1199 is->state = g->states;
1203 is->go_to = INIT_DATASET;
1204 is->starts_line = starts_line;
1213 for (i = 0; i < ss.cnt; i++) {
1214 struct symset temp = INIT_SYMSET, s;
1215 if (ss.data[i] == is->items.data[i])
1217 s = set_find(g, is->items.data[i]);
1218 symset_union(&temp, &s);
1219 s = set_find(g, ss.data[i]);
1220 if (symset_union(&temp, &s)) {
1221 is->items.data[i] = save_set(g, temp);
1232 To build all the itemsets, we first insert the initial itemset made from the
1233 start symbol, complete each itemset, and then generate new itemsets from old
1234 until no new ones can be made.
1236 Completing an itemset means finding all the items where "DOT" is followed by
1237 a nonterminal and adding "DOT=0" items for every production from that
1238 non-terminal - providing each item hasn't already been added.
1240 If LA sets are needed, the LA set for each new item is found using
1241 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1242 be supplemented by the LA set for the item which produces the new item.
1244 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1245 is used in the next stage.
1247 NOTE: precedence handling should happen here - I haven't written this yet
1250 ###### complete itemset
1251 for (i = 0; i < is->items.cnt; i++) {
1252 int p = item_prod(is->items.syms[i]);
1253 int bs = item_index(is->items.syms[i]);
1254 struct production *pr = g->productions[p];
1257 struct symset LA = INIT_SYMSET;
1258 unsigned short sn = 0;
1260 if (bs == pr->body_size)
1263 if (symset_find(&done, s->num) < 0)
1264 symset_add(&done, s->num, 0);
1265 if (s->type != Nonterminal)
1271 add_first(pr, bs+1, &LA, g, &to_end);
1273 struct symset ss = set_find(g, is->items.data[i]);
1274 symset_union(&LA, &ss);
1276 sn = save_set(g, LA);
1277 LA = set_find(g, sn);
1280 /* Add productions for this symbol */
1281 for (p2 = s->first_production;
1282 p2 < g->production_count &&
1283 g->productions[p2]->head == s;
1285 int itm = item_num(p2, 0);
1286 int pos = symset_find(&is->items, itm);
1288 symset_add(&is->items, itm, sn);
1289 /* Will have re-ordered, so start
1290 * from beginning again */
1292 } else if (type >= LALR) {
1293 struct symset ss = set_find(g, is->items.data[pos]);
1294 struct symset tmp = INIT_SYMSET;
1296 symset_union(&tmp, &ss);
1297 if (symset_union(&tmp, &LA)) {
1298 is->items.data[pos] = save_set(g, tmp);
1306 For each symbol we found (and placed in `done`) we collect all the items for
1307 which this symbol is next, and create a new itemset with "DOT" advanced over
1308 the symbol. This is then added to the collection of itemsets (or merged
1309 with a pre-existing itemset).
1311 ###### derive itemsets
1312 // Now we have a completed itemset, so we need to
1313 // compute all the 'next' itemsets and create them
1314 // if they don't exist.
1315 for (i = 0; i < done.cnt; i++) {
1317 unsigned short state;
1318 int starts_line = 0;
1319 struct symbol *sym = g->symtab[done.syms[i]];
1320 struct symset newitemset = INIT_SYMSET;
1322 newitemset = INIT_DATASET;
1325 (sym->nullable && is->starts_line))
1327 for (j = 0; j < is->items.cnt; j++) {
1328 int itm = is->items.syms[j];
1329 int p = item_prod(itm);
1330 int bp = item_index(itm);
1331 struct production *pr = g->productions[p];
1332 unsigned short la = 0;
1335 if (bp == pr->body_size)
1337 if (pr->body[bp] != sym)
1340 la = is->items.data[j];
1341 pos = symset_find(&newitemset, pr->head->num);
1343 symset_add(&newitemset, item_num(p, bp+1), la);
1344 else if (type >= LALR) {
1345 // Need to merge la set.
1346 int la2 = newitemset.data[pos];
1348 struct symset ss = set_find(g, la2);
1349 struct symset LA = INIT_SYMSET;
1350 symset_union(&LA, &ss);
1351 ss = set_find(g, la);
1352 if (symset_union(&LA, &ss))
1353 newitemset.data[pos] = save_set(g, LA);
1359 state = add_itemset(g, newitemset, type, starts_line);
1360 if (symset_find(&is->go_to, done.syms[i]) < 0)
1361 symset_add(&is->go_to, done.syms[i], state);
1364 All that is left is to create the initial itemset from production zero, and
1365 with `TK_eof` as the LA set.
1368 static void build_itemsets(struct grammar *g, enum grammar_type type)
1370 struct symset first = INIT_SYMSET;
1373 unsigned short la = 0;
1375 // LA set just has eof
1376 struct symset eof = INIT_SYMSET;
1377 symset_add(&eof, TK_eof, 0);
1378 la = save_set(g, eof);
1379 first = INIT_DATASET;
1381 // production 0, offset 0 (with no data)
1382 symset_add(&first, item_num(0, 0), la);
1383 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1384 for (again = 0, is = g->items;
1386 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1388 struct symset done = INIT_SYMSET;
1399 ### Completing the analysis.
1401 The exact process of analysis depends on which level was requested. For
1402 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1403 `LR1` we need first, but not follow. For `SLR` we need both.
1405 We don't build the "action" tables that you might expect as the parser
1406 doesn't use them. They will be calculated to some extent if needed for
1409 Once we have built everything we allocate arrays for the two lists:
1410 symbols and itemsets. This allows more efficient access during reporting.
1411 The symbols are grouped as terminals and non-terminals and we record the
1412 changeover point in `first_nonterm`.
1414 ###### grammar fields
1415 struct symbol **symtab;
1416 struct itemset **statetab;
1419 ###### grammar_analyse
1421 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1425 int snum = TK_reserved;
1426 for (s = g->syms; s; s = s->next)
1427 if (s->num < 0 && s->type == Terminal) {
1431 g->first_nonterm = snum;
1432 for (s = g->syms; s; s = s->next)
1438 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1439 for (s = g->syms; s; s = s->next)
1440 g->symtab[s->num] = s;
1450 build_itemsets(g, type);
1452 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1453 for (is = g->items; is ; is = is->next)
1454 g->statetab[is->state] = is;
1457 ## Reporting on the Grammar
1459 The purpose of the report is to give the grammar developer insight into
1460 how the grammar parser will work. It is basically a structured dump of
1461 all the tables that have been generated, plus a description of any conflicts.
1463 ###### grammar_report
1464 static int grammar_report(struct grammar *g, enum grammar_type type)
1470 return report_conflicts(g, type);
1473 Firstly we have the complete list of symbols, together with the "FIRST"
1474 set if that was generated.
1478 static void report_symbols(struct grammar *g)
1482 printf("SYMBOLS + FIRST:\n");
1484 printf("SYMBOLS:\n");
1486 for (n = 0; n < g->num_syms; n++) {
1487 struct symbol *s = g->symtab[n];
1491 printf(" %c%c%3d%c: ",
1492 s->nullable ? '.':' ',
1493 s->can_eol ? '>':' ',
1494 s->num, symtypes[s->type]);
1497 printf(" (%d%s)", s->precedence,
1498 assoc_names[s->assoc]);
1500 if (g->first && s->type == Nonterminal) {
1503 for (j = 0; j < g->first[n].cnt; j++) {
1506 prtxt(g->symtab[g->first[n].syms[j]]->name);
1513 Then we have the follow sets, if they were computed.
1515 static void report_follow(struct grammar *g)
1518 printf("FOLLOW:\n");
1519 for (n = 0; n < g->num_syms; n++)
1520 if (g->follow[n].cnt) {
1524 prtxt(g->symtab[n]->name);
1525 for (j = 0; j < g->follow[n].cnt; j++) {
1528 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1534 And finally the item sets. These include the GO TO tables and, for
1535 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1536 it up a bit. First the items, with production number and associativity.
1538 static void report_item(struct grammar *g, int itm)
1540 int p = item_prod(itm);
1541 int dot = item_index(itm);
1542 struct production *pr = g->productions[p];
1546 prtxt(pr->head->name);
1548 for (i = 0; i < pr->body_size; i++) {
1549 printf(" %s", (dot == i ? ". ": ""));
1550 prtxt(pr->body[i]->name);
1552 if (dot == pr->body_size)
1556 printf(" (%d%s)", pr->precedence,
1557 assoc_names[pr->assoc]);
1561 The LA sets which are (possibly) reported with each item:
1563 static void report_la(struct grammar *g, int lanum)
1565 struct symset la = set_find(g, lanum);
1569 printf(" LOOK AHEAD(%d)", lanum);
1570 for (i = 0; i < la.cnt; i++) {
1573 prtxt(g->symtab[la.syms[i]]->name);
1578 Then the go to sets:
1581 static void report_goto(struct grammar *g, struct symset gt)
1586 for (i = 0; i < gt.cnt; i++) {
1588 prtxt(g->symtab[gt.syms[i]]->name);
1589 printf(" -> %d\n", gt.data[i]);
1593 Now we can report all the item sets complete with items, LA sets, and GO TO.
1595 static void report_itemsets(struct grammar *g)
1598 printf("ITEM SETS(%d)\n", g->states);
1599 for (s = 0; s < g->states; s++) {
1601 struct itemset *is = g->statetab[s];
1602 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1603 for (j = 0; j < is->items.cnt; j++) {
1604 report_item(g, is->items.syms[j]);
1605 if (is->items.data != NO_DATA)
1606 report_la(g, is->items.data[j]);
1608 report_goto(g, is->go_to);
1612 ### Reporting conflicts
1614 Conflict detection varies a lot among different analysis levels. However
1615 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1616 LR1 are also similar as they have FOLLOW or LA sets.
1620 ## conflict functions
1622 static int report_conflicts(struct grammar *g, enum grammar_type type)
1625 printf("Conflicts:\n");
1627 cnt = conflicts_lr0(g, type);
1629 cnt = conflicts_slr(g, type);
1631 printf(" - no conflicts\n");
1635 LR0 conflicts occur in any state which has both a reducible item and
1638 LR05 conflicts only occur if two possible reductions exist,
1639 as shifts always over-ride reductions.
1641 ###### conflict functions
1642 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1646 for (i = 0; i < g->states; i++) {
1647 struct itemset *is = g->statetab[i];
1648 int last_reduce = -1;
1649 int prev_reduce = -1;
1650 int last_shift = -1;
1654 for (j = 0; j < is->items.cnt; j++) {
1655 int itm = is->items.syms[j];
1656 int p = item_prod(itm);
1657 int bp = item_index(itm);
1658 struct production *pr = g->productions[p];
1660 if (bp == pr->body_size) {
1661 prev_reduce = last_reduce;
1665 if (pr->body[bp]->type == Terminal)
1668 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1669 printf(" State %d has both SHIFT and REDUCE:\n", i);
1670 report_item(g, is->items.syms[last_shift]);
1671 report_item(g, is->items.syms[last_reduce]);
1674 if (prev_reduce >= 0) {
1675 printf(" State %d has 2 (or more) reducible items\n", i);
1676 report_item(g, is->items.syms[prev_reduce]);
1677 report_item(g, is->items.syms[last_reduce]);
1684 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1685 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1686 in the source of the look ahead set.
1688 We build a dataset mapping terminal to item for possible SHIFTs and then
1689 another for possible REDUCE operations. We report when we get conflicts
1692 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1697 for (i = 0; i < g->states; i++) {
1698 struct itemset *is = g->statetab[i];
1699 struct symset shifts = INIT_DATASET;
1700 struct symset reduce = INIT_DATASET;
1704 /* First collect the shifts */
1705 for (j = 0; j < is->items.cnt; j++) {
1706 unsigned short itm = is->items.syms[j];
1707 int p = item_prod(itm);
1708 int bp = item_index(itm);
1709 struct production *pr = g->productions[p];
1711 if (bp < pr->body_size &&
1712 pr->body[bp]->type == Terminal) {
1714 int sym = pr->body[bp]->num;
1715 if (symset_find(&shifts, sym) < 0)
1716 symset_add(&shifts, sym, itm);
1719 /* Now look for reduction and conflicts */
1720 for (j = 0; j < is->items.cnt; j++) {
1721 unsigned short itm = is->items.syms[j];
1722 int p = item_prod(itm);
1723 int bp = item_index(itm);
1724 struct production *pr = g->productions[p];
1726 if (bp < pr->body_size)
1731 la = g->follow[pr->head->num];
1733 la = set_find(g, is->items.data[j]);
1735 for (k = 0; k < la.cnt; k++) {
1736 int pos = symset_find(&shifts, la.syms[k]);
1738 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1739 prtxt(g->symtab[la.syms[k]]->name);
1741 report_item(g, shifts.data[pos]);
1742 report_item(g, itm);
1745 pos = symset_find(&reduce, la.syms[k]);
1747 symset_add(&reduce, la.syms[k], itm);
1750 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1751 prtxt(g->symtab[la.syms[k]]->name);
1753 report_item(g, itm);
1754 report_item(g, reduce.data[pos]);
1758 symset_free(shifts);
1759 symset_free(reduce);
1765 ## Generating the parser
1767 The exported part of the parser is the `parse_XX` function, where the name
1768 `XX` is based on the name of the parser files.
1770 This takes a `code_node`, a partially initialized `token_config`, and an
1771 optional `FILE` to send tracing to. The `token_config` gets the list of
1772 known words added and then is used with the `code_node` to initialize the
1775 `parse_XX` then calls the library function `parser_run` to actually complete
1776 the parse. This needs the `states` table and function to call the various
1777 pieces of code provided in the grammar file, so they are generated first.
1779 ###### parser_generate
1781 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1787 gen_reduce(f, g, file);
1790 fprintf(f, "#line 0 \"gen_parser\"\n");
1791 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1794 fprintf(f, "\tstruct token_state *tokens;\n");
1795 fprintf(f, "\tconfig->words_marks = known;\n");
1796 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1797 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1798 fprintf(f, "\ttokens = token_open(code, config);\n");
1799 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1800 fprintf(f, "\ttoken_close(tokens);\n");
1801 fprintf(f, "\treturn rv;\n");
1802 fprintf(f, "}\n\n");
1805 ### Table words table
1807 The table of known words is simply an array of terminal symbols.
1808 The table of nonterminals used for tracing is a similar array.
1812 static void gen_known(FILE *f, struct grammar *g)
1815 fprintf(f, "#line 0 \"gen_known\"\n");
1816 fprintf(f, "static const char *known[] = {\n");
1817 for (i = TK_reserved;
1818 i < g->num_syms && g->symtab[i]->type == Terminal;
1820 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1821 g->symtab[i]->name.txt);
1822 fprintf(f, "};\n\n");
1825 static void gen_non_term(FILE *f, struct grammar *g)
1828 fprintf(f, "#line 0 \"gen_non_term\"\n");
1829 fprintf(f, "static const char *non_term[] = {\n");
1830 for (i = TK_reserved;
1833 if (g->symtab[i]->type == Nonterminal)
1834 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1835 g->symtab[i]->name.txt);
1836 fprintf(f, "};\n\n");
1839 ### States and the goto tables.
1841 For each state we record the goto table, the reducible production if
1842 there is one, or a symbol to shift for error recovery.
1843 Some of the details of the reducible production are stored in the
1844 `do_reduce` function to come later. Here we store the production number,
1845 the body size (useful for stack management) and the resulting symbol (useful
1846 for knowing how to free data later).
1848 The go to table is stored in a simple array of `sym` and corresponding
1851 ###### exported types
1859 const struct lookup * go_to;
1870 static void gen_goto(FILE *f, struct grammar *g)
1873 fprintf(f, "#line 0 \"gen_goto\"\n");
1874 for (i = 0; i < g->states; i++) {
1876 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1878 struct symset gt = g->statetab[i]->go_to;
1879 for (j = 0; j < gt.cnt; j++)
1880 fprintf(f, "\t{ %d, %d },\n",
1881 gt.syms[j], gt.data[j]);
1888 static void gen_states(FILE *f, struct grammar *g)
1891 fprintf(f, "#line 0 \"gen_states\"\n");
1892 fprintf(f, "static const struct state states[] = {\n");
1893 for (i = 0; i < g->states; i++) {
1894 struct itemset *is = g->statetab[i];
1895 int j, prod = -1, prod_len;
1897 int shift_len = 0, shift_remain = 0;
1898 for (j = 0; j < is->items.cnt; j++) {
1899 int itm = is->items.syms[j];
1900 int p = item_prod(itm);
1901 int bp = item_index(itm);
1902 struct production *pr = g->productions[p];
1904 if (bp < pr->body_size) {
1905 if (shift_sym < 0 ||
1906 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1907 shift_sym = pr->body[bp]->num;
1909 shift_remain = pr->body_size - bp;
1913 /* This is what we reduce */
1914 if (prod < 0 || prod_len < pr->body_size) {
1916 prod_len = pr->body_size;
1921 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1922 i, is->go_to.cnt, i, prod,
1923 g->productions[prod]->body_size,
1924 g->productions[prod]->head->num,
1927 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1928 i, is->go_to.cnt, i, shift_sym,
1931 fprintf(f, "};\n\n");
1934 ### The `do_reduce` function and the code
1936 When the parser engine decides to reduce a production, it calls `do_reduce`.
1937 This has two functions.
1939 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1940 production being reduced. Secondly it runs the code that was included with
1941 the production if any.
1943 This code needs to be able to store data somewhere. Rather than requiring
1944 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1945 `do_reduce` return the size to be saved.
1947 The code fragment requires translation when written out. Any `$N` needs to
1948 be converted to a reference either to that buffer (if `$0`) or to the
1949 structure returned by a previous reduction. These pointer need to be cast
1950 to the appropriate type for each access. All this is handled in
1956 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1959 fprintf(f, "\t\t\t");
1960 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1969 if (*c < '0' || *c > '9') {
1974 while (c[1] >= '0' && c[1] <= '9') {
1976 n = n * 10 + *c - '0';
1979 fprintf(f, "(*(struct %.*s*)ret)",
1980 p->head->struct_name.len,
1981 p->head->struct_name.txt);
1982 else if (n > p->body_size)
1983 fprintf(f, "$%d", n);
1984 else if (p->body[n-1]->type == Terminal)
1985 fprintf(f, "(*(struct token *)body[%d])",
1987 else if (p->body[n-1]->struct_name.txt == NULL)
1988 fprintf(f, "$%d", n);
1990 fprintf(f, "(*(struct %.*s*)body[%d])",
1991 p->body[n-1]->struct_name.len,
1992 p->body[n-1]->struct_name.txt, n-1);
1999 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2002 fprintf(f, "#line 0 \"gen_reduce\"\n");
2003 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2005 fprintf(f, "\tint ret_size = 0;\n");
2007 fprintf(f, "\tswitch(prod) {\n");
2008 for (i = 0; i < g->production_count; i++) {
2009 struct production *p = g->productions[i];
2010 fprintf(f, "\tcase %d:\n", i);
2015 if (p->head->struct_name.txt)
2016 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
2017 p->head->struct_name.len,
2018 p->head->struct_name.txt);
2020 fprintf(f, "\t\tbreak;\n");
2022 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2027 As each non-terminal can potentially cause a different type of data
2028 structure to be allocated and filled in, we need to be able to free it when
2031 It is particularly important to have fine control over freeing during error
2032 recovery where individual stack frames might need to be freed.
2034 For this, the grammar author is required to define a `free_XX` function for
2035 each structure that is used by a non-terminal. `do_free` will call whichever
2036 is appropriate given a symbol number, and will call `free` (as is
2037 appropriate for tokens) on any terminal symbol.
2041 static void gen_free(FILE *f, struct grammar *g)
2045 fprintf(f, "#line 0 \"gen_free\"\n");
2046 fprintf(f, "static void do_free(short sym, void *asn)\n");
2048 fprintf(f, "\tif (!asn) return;\n");
2049 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2050 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2051 fprintf(f, "\tswitch(sym) {\n");
2053 for (i = 0; i < g->num_syms; i++) {
2054 struct symbol *s = g->symtab[i];
2056 s->type != Nonterminal ||
2057 s->struct_name.txt == NULL)
2060 fprintf(f, "\tcase %d:\n", s->num);
2061 fprintf(f, "\t\tfree_%.*s(asn);\n",
2063 s->struct_name.txt);
2064 fprintf(f, "\t\tbreak;\n");
2066 fprintf(f, "\t}\n}\n\n");
2069 ## The main routine.
2071 There are three key parts to the "main" function of parsergen: processing
2072 the arguments, loading the grammar file, and dealing with the grammar.
2074 ### Argument processing.
2076 Command line options allow the selection of analysis mode, name of output
2077 file, and whether or not a report should be generated. By default we create
2078 a report only if no code output was requested.
2080 The `parse_XX` function name uses the basename of the output file which
2081 should not have a suffix (`.c`). `.c` is added to the given name for the
2082 code, and `.h` is added for the header (if header text is specified in the
2089 static const struct option long_options[] = {
2090 { "LR0", 0, NULL, '0' },
2091 { "LR05", 0, NULL, '5' },
2092 { "SLR", 0, NULL, 'S' },
2093 { "LALR", 0, NULL, 'L' },
2094 { "LR1", 0, NULL, '1' },
2095 { "tag", 1, NULL, 't' },
2096 { "report", 0, NULL, 'R' },
2097 { "output", 1, NULL, 'o' },
2098 { NULL, 0, NULL, 0 }
2100 const char *options = "05SL1t:Ro:";
2102 ###### process arguments
2104 char *outfile = NULL;
2109 enum grammar_type type = LR05;
2110 while ((opt = getopt_long(argc, argv, options,
2111 long_options, NULL)) != -1) {
2126 outfile = optarg; break;
2128 tag = optarg; break;
2130 fprintf(stderr, "Usage: parsergen ...\n");
2135 infile = argv[optind++];
2137 fprintf(stderr, "No input file given\n");
2140 if (outfile && report == 1)
2143 if (name && strchr(name, '/'))
2144 name = strrchr(name, '/')+1;
2146 if (optind < argc) {
2147 fprintf(stderr, "Excess command line arguments\n");
2151 ### Loading the grammar file
2153 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2156 Once we have extracted the code (with `mdcode`) we expect to find three
2157 sections: header, code, and grammar. Anything else is an error.
2159 "header" and "code" are optional, though it is hard to build a working
2160 parser with neither. "grammar" must be provided.
2164 #include <sys/mman.h>
2169 static void pr_err(char *msg)
2172 fprintf(stderr, "%s\n", msg);
2176 struct section *table;
2180 fd = open(infile, O_RDONLY);
2182 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2183 infile, strerror(errno));
2186 len = lseek(fd, 0, 2);
2187 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2188 table = code_extract(file, file+len, pr_err);
2190 struct code_node *hdr = NULL;
2191 struct code_node *code = NULL;
2192 struct code_node *gram = NULL;
2193 for (s = table; s; s = s->next) {
2194 struct text sec = s->section;
2195 if (tag && !strip_tag(&sec, tag))
2197 if (text_is(sec, "header"))
2199 else if (text_is(sec, "code"))
2201 else if (text_is(sec, "grammar"))
2204 fprintf(stderr, "Unknown content section: %.*s\n",
2205 s->section.len, s->section.txt);
2210 ### Processing the input
2212 As we need to append an extension to a filename and then open it for
2213 writing, and we need to do this twice, it helps to have a separate function.
2217 static FILE *open_ext(char *base, char *ext)
2219 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2221 strcat(strcpy(fn, base), ext);
2227 If we can read the grammar, then we analyse and optionally report on it. If
2228 the report finds conflicts we will exit with an error status.
2230 ###### process input
2231 struct grammar *g = NULL;
2233 fprintf(stderr, "No grammar section provided\n");
2236 g = grammar_read(gram);
2238 fprintf(stderr, "Failure to parse grammar\n");
2243 grammar_analyse(g, type);
2245 if (grammar_report(g, type))
2249 If a headers section is defined, we write it out and include a declaration
2250 for the `parse_XX` function so it can be used from separate code.
2252 if (rv == 0 && hdr && outfile) {
2253 FILE *f = open_ext(outfile, ".h");
2255 code_node_print(f, hdr, infile);
2256 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2260 fprintf(stderr, "Cannot create %s.h\n",
2266 And if all goes well and an output file was provided, we create the `.c`
2267 file with the code section (if any) and the parser tables and function.
2269 if (rv == 0 && outfile) {
2270 FILE *f = open_ext(outfile, ".c");
2273 code_node_print(f, code, infile);
2274 gen_parser(f, g, infile, name);
2277 fprintf(stderr, "Cannot create %s.c\n",
2283 And that about wraps it up. We need to set the locale so that UTF-8 is
2284 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2286 ###### File: parsergen.mk
2287 parsergen : parsergen.o libscanner.o libmdcode.o
2288 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2295 int main(int argc, char *argv[])
2300 setlocale(LC_ALL,"");
2302 ## process arguments
2309 ## The SHIFT/REDUCE parser
2311 Having analysed the grammar and generated all the table, we only need the
2312 shift/reduce engine to bring it all together.
2314 ### Goto table lookup
2316 The parser generator has nicely provided us with goto tables sorted by
2317 symbol number. We need a binary search function to find a symbol in the
2320 ###### parser functions
2322 static int search(const struct state *l, int sym)
2325 int hi = l->go_to_cnt;
2329 while (lo + 1 < hi) {
2330 int mid = (lo + hi) / 2;
2331 if (l->go_to[mid].sym <= sym)
2336 if (l->go_to[lo].sym == sym)
2337 return l->go_to[lo].state;
2342 ### The state stack.
2344 The core data structure for the parser is the stack. This tracks all the
2345 symbols that have been recognised or partially recognised.
2347 The stack usually won't grow very large - maybe a few tens of entries. So
2348 we dynamically resize an array as required but never bother to shrink it
2351 We keep the stack as two separate allocations. One, `asn_stack` stores the
2352 "abstract syntax nodes" which are created by each reduction. When we call
2353 `do_reduce` we need to pass an array of the `asn`s of the body of the
2354 production, and by keeping a separate `asn` stack, we can just pass a
2355 pointer into this stack.
2357 The other allocation stores all other stack fields of which there are four.
2358 The `state` is the most important one and guides the parsing process. The
2359 `sym` is nearly unnecessary. However when we want to free entries from the
2360 `asn_stack`, it helps to know what type they are so we can call the right
2361 freeing function. The symbol leads us to the right free function through
2364 The `indents` count and the `starts_indented` flag track the line
2365 indents in the symbol. These are used to allow indent information to
2366 guide parsing and error recovery.
2368 As well as the stack of frames we have a `next` frame which is
2369 assembled from the incoming token and other information prior to
2370 pushing it onto the stack.
2372 ###### parser functions
2378 short starts_indented;
2380 short newline_permitted;
2389 Two operations are needed on the stack - shift (which is like push) and pop.
2391 Shift applies not only to terminals but also to non-terminals. When we
2392 reduce a production we will pop off entries corresponding to the body
2393 symbols, then push on an item for the head of the production. This last is
2394 exactly the same process as shifting in a terminal so we use the same
2397 To simplify other code we arrange for `shift` to fail if there is no `goto`
2398 state for the symbol. This is useful in basic parsing due to our design
2399 that we shift when we can, and reduce when we cannot. So the `shift`
2400 function reports if it could.
2402 So `shift` finds the next state. If that succeeds it extends the allocations
2403 if needed and pushes all the information onto the stacks.
2405 ###### parser functions
2407 static int shift(struct parser *p,
2409 const struct state states[])
2411 // Push an entry onto the stack
2412 int newstate = search(&states[p->next.state], p->next.sym);
2415 if (p->tos >= p->stack_size) {
2416 p->stack_size += 10;
2417 p->stack = realloc(p->stack, p->stack_size
2418 * sizeof(p->stack[0]));
2419 p->asn_stack = realloc(p->asn_stack, p->stack_size
2420 * sizeof(p->asn_stack[0]));
2422 p->stack[p->tos] = p->next;
2423 p->asn_stack[p->tos] = asn;
2425 p->next.state = newstate;
2426 p->next.indents = 0;
2427 p->next.starts_indented = 0;
2428 // if new state doesn't start a line, we inherit newline_permitted status
2429 if (states[newstate].starts_line)
2430 p->next.newline_permitted = 1;
2434 `pop` simply moves the top of stack (`tos`) back down the required amount
2435 and frees any `asn` entries that need to be freed. It is called _after_ we
2436 reduce a production, just before we `shift` the nonterminal in.
2438 ###### parser functions
2440 static void pop(struct parser *p, int num,
2441 void(*do_free)(short sym, void *asn))
2445 for (i = 0; i < num; i++) {
2446 p->next.indents += p->stack[p->tos+i].indents;
2447 do_free(p->stack[p->tos+i].sym,
2448 p->asn_stack[p->tos+i]);
2452 p->next.state = p->stack[p->tos].state;
2453 p->next.starts_indented = p->stack[p->tos].starts_indented;
2454 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2455 if (p->next.indents > p->next.starts_indented)
2456 p->next.newline_permitted = 0;
2460 ### Memory allocation
2462 The `scanner` returns tokens in a local variable - we want them in allocated
2463 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2464 a reduce is in a large buffer. Both of these require some allocation and
2465 copying, hence `memdup` and `tokcopy`.
2467 ###### parser includes
2470 ###### parser functions
2472 void *memdup(void *m, int len)
2478 memcpy(ret, m, len);
2482 static struct token *tok_copy(struct token tk)
2484 struct token *new = malloc(sizeof(*new));
2489 ### The heart of the parser.
2491 Now we have the parser. If we can shift, we do. If not and we can reduce
2492 we do. If the production we reduced was production zero, then we have
2493 accepted the input and can finish.
2495 We return whatever `asn` was returned by reducing production zero.
2497 If we can neither shift nor reduce we have an error to handle. We pop
2498 single entries off the stack until we can shift the `TK_error` symbol, then
2499 drop input tokens until we find one we can shift into the new error state.
2501 When we find `TK_in` and `TK_out` tokens which report indents we need
2502 to handle them directly as the grammar cannot express what we want to
2505 `TK_in` tokens are easy: we simply update the `next` stack frame to
2506 record how many indents there are and that the next token started with
2509 `TK_out` tokens must either be counted off against any pending indent,
2510 or must force reductions until there is a pending indent which isn't
2511 at the start of a production.
2513 ###### parser includes
2516 void *parser_run(struct token_state *tokens,
2517 const struct state states[],
2518 int (*do_reduce)(int, void**, void*),
2519 void (*do_free)(short, void*),
2520 FILE *trace, const char *non_term[], int knowns)
2522 struct parser p = { 0 };
2523 struct token *tk = NULL;
2527 p.next.newline_permitted = states[0].starts_line;
2529 struct token *err_tk;
2531 tk = tok_copy(token_next(tokens));
2532 p.next.sym = tk->num;
2534 parser_trace(trace, &p, tk, states, non_term, knowns);
2536 if (p.next.sym == TK_in) {
2537 p.next.starts_indented = 1;
2543 if (p.next.sym == TK_out) {
2544 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2545 (p.stack[p.tos-1].indents == 1 &&
2546 states[p.next.state].reduce_size > 1)) {
2547 p.stack[p.tos-1].indents -= 1;
2548 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2549 // no internal indent any more, reassess 'newline_permitted'
2550 if (states[p.stack[p.tos-1].state].starts_line)
2551 p.stack[p.tos-1].newline_permitted = 1;
2553 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2559 // fall through and force a REDUCE (as 'shift'
2562 if (p.next.sym == TK_newline) {
2563 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2569 if (shift(&p, tk, states)) {
2573 if (states[p.next.state].reduce_prod >= 0) {
2575 int prod = states[p.next.state].reduce_prod;
2576 int size = states[p.next.state].reduce_size;
2578 static char buf[16*1024];
2579 p.next.sym = states[p.next.state].reduce_sym;
2581 body = p.asn_stack +
2582 (p.tos - states[p.next.state].reduce_size);
2584 bufsize = do_reduce(prod, body, buf);
2586 pop(&p, size, do_free);
2587 shift(&p, memdup(buf, bufsize), states);
2592 if (tk->num == TK_out) {
2593 // Indent problem - synthesise tokens to get us
2595 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2596 p.next.sym = states[p.next.state].shift_sym;
2597 shift(&p, tok_copy(*tk), states);
2598 // FIXME need to report this error somehow
2601 /* Error. We walk up the stack until we
2602 * find a state which will accept TK_error.
2603 * We then shift in TK_error and see what state
2604 * that takes us too.
2605 * Then we discard input tokens until
2606 * we find one that is acceptable.
2609 err_tk = tok_copy(*tk);
2610 p.next.sym = TK_error;
2611 while (shift(&p, err_tk, states) == 0
2613 // discard this state
2614 pop(&p, 1, do_free);
2617 // no state accepted TK_error
2620 while (search(&states[p.next.state], tk->num) < 0 &&
2621 tk->num != TK_eof) {
2623 tk = tok_copy(token_next(tokens));
2624 if (tk->num == TK_in)
2625 p.next.indents += 1;
2626 if (tk->num == TK_out) {
2627 if (p.next.indents == 0)
2629 p.next.indents -= 1;
2632 if (p.tos == 0 && tk->num == TK_eof)
2637 ret = p.asn_stack[0];
2639 pop(&p, p.tos, do_free);
2645 ###### exported functions
2646 void *parser_run(struct token_state *tokens,
2647 const struct state states[],
2648 int (*do_reduce)(int, void**, void*),
2649 void (*do_free)(short, void*),
2650 FILE *trace, const char *non_term[], int knowns);
2654 Being able to visualize the parser in action can be invaluable when
2655 debugging the parser code, or trying to understand how the parse of a
2656 particular grammar progresses. The stack contains all the important
2657 state, so just printing out the stack every time around the parse loop
2658 can make it possible to see exactly what is happening.
2660 This doesn't explicitly show each SHIFT and REDUCE action. However they
2661 are easily deduced from the change between consecutive lines, and the
2662 details of each state can be found by cross referencing the states list
2663 in the stack with the "report" that parsergen can generate.
2665 For terminal symbols, we just dump the token. For non-terminals we
2666 print the name of the symbol. The look ahead token is reported at the
2667 end inside square brackets.
2669 ###### parser functions
2671 static char *reserved_words[] = {
2672 [TK_error] = "ERROR",
2675 [TK_newline] = "NEWLINE",
2678 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2680 fprintf(trace, "(%d", f->state);
2682 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2684 if (states[f->state].starts_line)
2685 fprintf(trace, "s");
2686 if (f->newline_permitted)
2687 fprintf(trace, "n");
2688 fprintf(trace, ") ");
2691 void parser_trace(FILE *trace, struct parser *p,
2692 struct token *tk, const struct state states[],
2693 const char *non_term[], int knowns)
2696 for (i = 0; i < p->tos; i++) {
2697 int sym = p->stack[i].sym;
2698 parser_trace_state(trace, &p->stack[i], states);
2699 if (sym < TK_reserved &&
2700 reserved_words[sym] != NULL)
2701 fputs(reserved_words[sym], trace);
2702 else if (sym < TK_reserved + knowns) {
2703 struct token *t = p->asn_stack[i];
2704 text_dump(trace, t->txt, 20);
2706 fputs(non_term[sym - TK_reserved - knowns],
2710 parser_trace_state(trace, &p->next, states);
2711 fprintf(trace, " [");
2712 if (tk->num < TK_reserved &&
2713 reserved_words[tk->num] != NULL)
2714 fputs(reserved_words[tk->num], trace);
2716 text_dump(trace, tk->txt, 20);
2717 fputs("]\n", trace);
2722 The obvious example for a parser is a calculator.
2724 As `scanner` provides a number parsing function using `libgmp`, it is not much
2725 work to perform arbitrary rational number calculations.
2727 This calculator takes one expression, or an equality test per line. The
2728 results are printed and, if any equality test fails, the program exits with
2731 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2732 better approach, but as the grammar file must have 3 components I need
2733 something like this.
2735 ###### File: parsergen.mk
# Build the demo calculator: generate calc.c from the grammar with the
# freshly built parsergen, then link it against the parser, scanner,
# mdcode and number libraries (ICU and GMP are needed by the scanner's
# unicode handling and the rational-number arithmetic respectively).
2736 calc.c : parsergen calc.cgm
2737 ./parsergen -o calc calc.cgm
2738 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2739 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2747 // what do we use for a demo-grammar? A calculator of course.
2761 #include <sys/mman.h>
2766 #include "scanner.h"
2772 static void free_number(struct number *n)
2778 int main(int argc, char *argv[])
2780 int fd = open(argv[1], O_RDONLY);
2781 int len = lseek(fd, 0, 2);
2782 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2783 struct section *s = code_extract(file, file+len, NULL);
2784 struct token_config config = {
2785 .ignored = (1 << TK_line_comment)
2786 | (1 << TK_block_comment)
2789 .number_chars = ".,_+-",
2793 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2801 Session -> Session Line
2804 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2805 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2806 gmp_printf(" or as a decimal: %Fg\n", fl);
2810 | Expression = Expression NEWLINE ${
2811 if (mpq_equal($1.val, $3.val))
2812 gmp_printf("Both equal %Qd\n", $1.val);
2814 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2819 | NEWLINE ${ printf("Blank line\n"); }$
2820 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2823 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2824 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2825 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2827 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2828 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2829 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2831 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2832 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$