1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of a parser.
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h`. files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
69 [scanner]: scanner.html
75 ###### parser includes
79 The grammar contains production sets, precedence/associativity
80 declarations, and data type declarations. These are all parsed with
81 _ad hoc_ parsing as we don't have a parser generator yet.
83 The precedence and associativity can be set for each production, but
84 can be inherited from symbols. The data type is potentially defined
85 for each non-terminal and describes what C structure is used to store
86 information about each symbol.
89 enum assoc {Left, Right, Non};
90 char *assoc_names[] = {"Left","Right","Non"};
93 struct text struct_name;
95 unsigned short precedence;
99 unsigned short precedence;
107 The strings reported by `mdcode` and `scanner` are `struct text` which have
108 length rather than being null terminated. To help with printing and
109 comparing we define `text_is` and `prtxt`, which should possibly go in
110 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
111 which might contain control characters.
113 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
114 because that is what we need to detect tags.
117 static int text_is(struct text t, char *s)
119 return (strlen(s) == t.len &&
120 strncmp(s, t.txt, t.len) == 0);
122 static void prtxt(struct text t)
124 printf("%.*s", t.len, t.txt);
127 static int strip_tag(struct text *t, char *tag)
129 int skip = strlen(tag) + 1;
130 if (skip >= t->len ||
131 strncmp(t->txt, tag, skip-1) != 0 ||
132 t->txt[skip-1] != ':')
134 while (skip < t->len && t->txt[skip] == ' ')
143 Productions are comprised primarily of symbols - terminal and
144 non-terminal. We do not make a syntactic distinction between the two,
145 though non-terminals must be identifiers. Non-terminal symbols are
146 those which appear in the head of a production, terminal symbols are
147 those which don't. There are also "virtual" symbols used for precedence
148 marking discussed later, and sometimes we won't know what type a symbol is until the whole grammar has been read.
151 ###### forward declarations
152 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
153 char *symtypes = "UVTN";
157 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
158 table of known symbols and the resulting parser will report them as
159 `TK_reserved + N`. A small set of identifiers are reserved for the
160 different token types that `scanner` can report.
163 static char *reserved_words[] = {
164 [TK_error] = "ERROR",
165 [TK_number] = "NUMBER",
166 [TK_ident] = "IDENTIFIER",
168 [TK_string] = "STRING",
169 [TK_multi_string] = "MULTI_STRING",
172 [TK_newline] = "NEWLINE",
178 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
179 recognised. The former is automatically expected at the end of the text
180 being parsed. The latter are always ignored by the parser.
182 All of these symbols are stored in a simple symbol table. We use the
183 `struct text` from `mdcode` to store the name and link them together
184 into a sorted list using an insertion sort.
186 We don't have separate `find` and `insert` functions as any symbol we
187 find needs to be remembered. We simply expect `find` to always return a
188 symbol, but its type might be `Unknown`.
197 ###### grammar fields
202 static int text_cmp(struct text a, struct text b)
207 int cmp = strncmp(a.txt, b.txt, len);
211 return a.len - b.len;
214 static struct symbol *sym_find(struct grammar *g, struct text s)
216 struct symbol **l = &g->syms;
221 (cmp = text_cmp((*l)->name, s)) < 0)
225 n = calloc(1, sizeof(*n));
234 static void symbols_init(struct grammar *g)
236 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
238 for (i = 0; i < entries; i++) {
241 t.txt = reserved_words[i];
244 t.len = strlen(t.txt);
251 ### Data types and precedence.
253 Data type specification and precedence specification are both
254 introduced by a dollar sign at the start of the line. If the next
255 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
256 precedence, otherwise it specifies a data type.
258 The data type name is simply stored and applied to the head of all
259 subsequent productions. It must be the name of a structure, so `$expr`
260 maps to `struct expr`.
262 Any productions given before the first data type will have no data type
263 and can carry no information. In order to allow other non-terminals to
264 have no type, the data type `$void` can be given. This does *not* mean
265 that `struct void` will be used, but rather that no type will be
266 associated with future non-terminals.
268 The precedence line must contain a list of symbols - typically
269 terminal symbols, but not necessarily. It can only contain symbols
270 that have not been seen yet, so precedence declaration must precede
271 the productions that they affect.
273 A precedence line may also contain a "virtual" symbol which is an
274 identifier preceded by `$$`. Virtual terminals carry precedence
275 information but are not included in the grammar. A production can
276 declare that it inherits the precedence of a given virtual symbol.
278 This use for `$$` precludes it from being used as a symbol in the
279 described language. Two other symbols: `${` and `}$` are also
282 Each new precedence line introduces a new precedence level and
283 declares how it associates. This level is stored in each symbol
284 listed and may be inherited by any production which uses the symbol. A
285 production inherits from the last symbol which has a precedence.
287 ###### grammar fields
288 struct text current_type;
292 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
293 static const char *known[] = { "$$", "${", "}$" };
296 static char *dollar_line(struct token_state *ts, struct grammar *g)
298 struct token t = token_next(ts);
303 if (t.num != TK_ident) {
304 err = "type or assoc expected after '$'";
307 if (text_is(t.txt, "LEFT"))
309 else if (text_is(t.txt, "RIGHT"))
311 else if (text_is(t.txt, "NON"))
314 g->current_type = t.txt;
315 if (text_is(t.txt, "void"))
316 g->current_type.txt = NULL;
318 if (t.num != TK_newline) {
319 err = "Extra tokens after type name";
325 // This is a precedence line, need some symbols.
329 while (t.num != TK_newline) {
330 enum symtype type = Terminal;
332 if (t.num == TK_virtual) {
335 if (t.num != TK_ident) {
336 err = "$$ must be followed by a word";
339 } else if (t.num != TK_ident &&
341 err = "Illegal token in precedence line";
344 s = sym_find(g, t.txt);
345 if (s->type != Unknown) {
346 err = "Symbols in precedence line must not already be known.";
350 s->precedence = g->prec_levels;
355 err = "No symbols given on precedence line";
359 while (t.num != TK_newline && t.num != TK_eof)
366 A production either starts with an identifier which is the head
367 non-terminal, or a vertical bar (`|`) in which case this production
368 uses the same head as the previous one. The identifier must be
369 followed by a `->` mark. All productions for a given non-terminal must
370 be grouped together with the `nonterminal ->` given only once.
372 After this a (possibly empty) sequence of identifiers and marks form
373 the body of the production. A virtual symbol may be given after the
374 body by preceding it with `$$`. If a virtual symbol is given, the
375 precedence of the production is that for the virtual symbol. If none
376 is given, the precedence is inherited from the last symbol in the
377 production which has a precedence specified.
379 After the optional precedence may come the `${` mark. This indicates
380 the start of a code fragment. If present, this must be on the same
381 line as the start of the production.
383 All of the text from the `${` through to the matching `}$` is
384 collected and forms the code-fragment for the production. It must all
385 be in one `code_node` of the literate code. The `}$` must be
386 at the end of a line.
388 Text in the code fragment will undergo substitutions where `$N` for
389 some numeric `N` will be replaced with a variable holding the parse
390 information for the particular symbol in the production. `$0` is the
391 head of the production, `$1` is the first symbol of the body, etc.
392 The type of `$N` for a terminal symbol is `struct token`. For
393 a non-terminal, it is whatever has been declared for that symbol.
395 While building productions we will need to add to an array which needs to
399 static void array_add(void *varray, int *cnt, void *new)
401 void ***array = varray;
404 current = ((*cnt-1) | (step-1))+1;
405 if (*cnt == current) {
408 *array = realloc(*array, current * sizeof(void*));
410 (*array)[*cnt] = new;
414 Collecting the code fragment simply involves reading tokens until we
415 hit the end token `}$`, and noting the character position of the start and
419 static struct text collect_code(struct token_state *state,
424 code.txt = start.txt.txt + start.txt.len;
426 t = token_next(state);
427 while (t.node == start.node &&
428 t.num != TK_close && t.num != TK_error &&
430 if (t.num == TK_close && t.node == start.node)
431 code.len = t.txt.txt - code.txt;
437 Now we have all the bits we need to parse a full production.
442 ###### grammar fields
443 struct production **productions;
444 int production_count;
446 ###### production fields
448 struct symbol **body;
453 int first_production;
456 static char *parse_production(struct grammar *g,
458 struct token_state *state)
460 /* Head has already been parsed. */
463 struct production p, *pp;
465 memset(&p, 0, sizeof(p));
467 tk = token_next(state);
468 while (tk.num == TK_ident || tk.num == TK_mark) {
469 struct symbol *bs = sym_find(g, tk.txt);
470 if (bs->type == Unknown)
472 if (bs->type == Virtual) {
473 err = "Virtual symbol not permitted in production";
476 if (bs->precedence) {
477 p.precedence = bs->precedence;
480 array_add(&p.body, &p.body_size, bs);
481 tk = token_next(state);
483 if (tk.num == TK_virtual) {
485 tk = token_next(state);
486 if (tk.num != TK_ident) {
487 err = "word required after $$";
490 vs = sym_find(g, tk.txt);
491 if (vs->type != Virtual) {
492 err = "symbol after $$ must be virtual";
495 p.precedence = vs->precedence;
497 tk = token_next(state);
499 if (tk.num == TK_open) {
500 p.code = collect_code(state, tk);
501 if (p.code.txt == NULL) {
502 err = "code fragment not closed properly";
505 tk = token_next(state);
507 if (tk.num != TK_newline && tk.num != TK_eof) {
508 err = "stray tokens at end of line";
511 pp = malloc(sizeof(*pp));
513 array_add(&g->productions, &g->production_count, pp);
516 while (tk.num != TK_newline && tk.num != TK_eof)
517 tk = token_next(state);
521 With the ability to parse production and dollar-lines, we have nearly all
522 that we need to parse a grammar from a `code_node`.
524 The head of the first production will effectively be the `start` symbol of
525 the grammar. However it won't _actually_ be so. Processing the grammar is
526 greatly simplified if the real start symbol only has a single production,
527 and expects `$eof` as the final terminal. So when we find the first
528 explicit production we insert an extra production as production zero which
531 ###### Example: production 0
534 where `START` is the first non-terminal given.
536 ###### create production zero
537 struct production *p = calloc(1,sizeof(*p));
538 struct text start = {"$start",6};
539 struct text eof = {"$eof",4};
540 p->head = sym_find(g, start);
541 p->head->type = Nonterminal;
542 array_add(&p->body, &p->body_size, head);
543 array_add(&p->body, &p->body_size, sym_find(g, eof));
544 g->start = p->head->num;
545 p->head->first_production = g->production_count;
546 array_add(&g->productions, &g->production_count, p);
548 Now we are ready to read in the grammar.
550 ###### grammar fields
551 int start; // the 'start' symbol.
554 static struct grammar *grammar_read(struct code_node *code)
556 struct token_config conf = {
559 .words_marks = known,
560 .known_count = sizeof(known)/sizeof(known[0]),
562 .ignored = (1 << TK_line_comment)
563 | (1 << TK_block_comment)
566 | (1 << TK_multi_string)
571 struct token_state *state = token_open(code, &conf);
573 struct symbol *head = NULL;
577 g = calloc(1, sizeof(*g));
580 for (tk = token_next(state); tk.num != TK_eof;
581 tk = token_next(state)) {
582 if (tk.num == TK_newline)
584 if (tk.num == TK_ident) {
586 head = sym_find(g, tk.txt);
587 if (head->type == Nonterminal)
588 err = "This non-terminal has already be used.";
589 else if (head->type == Virtual)
590 err = "Virtual symbol not permitted in head of production";
592 head->type = Nonterminal;
593 head->struct_name = g->current_type;
595 ## create production zero
597 head->first_production = g->production_count;
598 tk = token_next(state);
599 if (tk.num == TK_mark &&
600 text_is(tk.txt, "->"))
601 err = parse_production(g, head, state);
603 err = "'->' missing in production";
605 } else if (tk.num == TK_mark
606 && text_is(tk.txt, "|")) {
607 // another production for same non-term
609 err = parse_production(g, head, state);
611 err = "First production must have a head";
612 } else if (tk.num == TK_mark
613 && text_is(tk.txt, "$")) {
614 err = dollar_line(state, g);
616 err = "Unrecognised token at start of line.";
624 fprintf(stderr, "Error at line %d: %s\n",
631 ## Analysing the grammar
633 The central task in analysing the grammar is to determine a set of
634 states to drive the parsing process. This will require finding
635 various sets of symbols and of "items". Some of these sets will need
636 to attach information to each element in the set, so they are more
639 Each "item" may have a 'look-ahead' or `LA` set associated with
640 it. Many of these will be the same as each other. To avoid memory
641 wastage, and to simplify some comparisons of sets, these sets will be
642 stored in a list of unique sets, each assigned a number.
644 Once we have the data structures in hand to manage these sets and
645 lists, we can start setting the 'nullable' flag, build the 'FIRST'
646 sets, and then create the item sets which define the various states.
650 Though we don't only store symbols in these sets, they are the main
651 things we store, so they are called symbol sets or "symsets".
653 A symset has a size, an array of shorts, and an optional array of data
654 values, which are also shorts. If the array of data is not present,
655 we store an impossible pointer, as `NULL` is used to indicate that no
656 memory has been allocated yet.
661 unsigned short *syms, *data;
663 #define NO_DATA ((unsigned short *)1)
664 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
665 const struct symset INIT_DATASET = { 0, NULL, NULL };
667 The arrays of shorts are allocated in blocks of 8 and are kept sorted
668 by using an insertion sort. We don't explicitly record the amount of
669 allocated space as it can be derived directly from the current `cnt` using
670 `((cnt - 1) | 7) + 1`.
673 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
676 int current = ((s->cnt-1) | 7) + 1;
677 if (current == s->cnt) {
679 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
680 if (s->data != NO_DATA)
681 s->data = realloc(s->data, sizeof(*s->data) * current);
684 while (i > 0 && s->syms[i-1] > key) {
685 s->syms[i] = s->syms[i-1];
686 if (s->data != NO_DATA)
687 s->data[i] = s->data[i-1];
691 if (s->data != NO_DATA)
696 Finding a symbol (or item) in a `symset` uses a simple binary search.
697 We return the index where the value was found (so data can be accessed),
698 or `-1` to indicate failure.
700 static int symset_find(struct symset *ss, unsigned short key)
707 while (lo + 1 < hi) {
708 int mid = (lo + hi) / 2;
709 if (ss->syms[mid] <= key)
714 if (ss->syms[lo] == key)
719 We will often want to form the union of two symsets. When we do, we
720 will often want to know if anything changed (as they might mean there
721 is more work to do). So `symset_union` reports whether anything was
722 added to the first set. We use a slow quadratic approach as these
723 sets don't really get very big. If profiling shows this to be a problem it
724 can be optimised later.
726 static int symset_union(struct symset *a, struct symset *b)
730 for (i = 0; i < b->cnt; i++)
731 if (symset_find(a, b->syms[i]) < 0) {
732 unsigned short data = 0;
733 if (b->data != NO_DATA)
735 symset_add(a, b->syms[i], data);
741 And of course we must be able to free a symset.
743 static void symset_free(struct symset ss)
746 if (ss.data != NO_DATA)
752 Some symsets are simply stored somewhere appropriate in the `grammar`
753 data structure, others need a bit of indirection. This applies
754 particularly to "LA" sets. These will be paired with "items" in an
755 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
756 stored in the `data` field, so we need a mapping from a small (short)
757 number to an LA `symset`.
759 As mentioned earlier this involves creating a list of unique symsets.
761 For now, we just use a linear list sorted by insertion. A skiplist
762 would be more efficient and may be added later.
769 struct setlist *next;
772 ###### grammar fields
773 struct setlist *sets;
778 static int ss_cmp(struct symset a, struct symset b)
781 int diff = a.cnt - b.cnt;
784 for (i = 0; i < a.cnt; i++) {
785 diff = (int)a.syms[i] - (int)b.syms[i];
792 static int save_set(struct grammar *g, struct symset ss)
794 struct setlist **sl = &g->sets;
798 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
805 s = malloc(sizeof(*s));
814 Finding a set by number is currently performed by a simple linear search.
815 If this turns out to hurt performance, we can store the sets in a dynamic
816 array like the productions.
818 static struct symset set_find(struct grammar *g, int num)
820 struct setlist *sl = g->sets;
821 while (sl && sl->num != num)
827 ### Setting `nullable`
829 We set `nullable` on the head symbol for any production for which all
830 the body symbols (if any) are nullable. As this is a recursive
831 definition, any change in the `nullable` setting means that we need to
832 re-evaluate where it needs to be set.
834 We simply loop around performing the same calculations until no more changes happen.
841 static void set_nullable(struct grammar *g)
844 while (check_again) {
847 for (p = 0; p < g->production_count; p++) {
848 struct production *pr = g->productions[p];
851 if (pr->head->nullable)
853 for (s = 0; s < pr->body_size; s++)
854 if (! pr->body[s]->nullable)
856 if (s == pr->body_size) {
857 pr->head->nullable = 1;
864 ### Setting `can_eol`
866 In order to be able to ignore newline tokens when not relevant, but
867 still include them in the parse when needed, we will need to know
868 which states can start a "line-like" section of code. We ignore
869 newlines when there is an indent since the most recent start of a
872 To know what is line-like, we first need to know which symbols can end
873 a line-like section, which is precisely those which can end with a
874 newline token. These symbols don't necessarily always end with a
875 newline, but they can. Hence they are not described as "lines" but
878 Clearly the `TK_newline` token can end with a newline. Any symbol
879 which is the head of a production that contains a line-ending symbol
880 followed only by nullable symbols is also a line-ending symbol. We
881 use a new field `can_eol` to record this attribute of symbols, and
882 compute it in a repetitive manner similar to `set_nullable`.
888 static void set_can_eol(struct grammar *g)
891 g->symtab[TK_newline]->can_eol = 1;
892 while (check_again) {
895 for (p = 0; p < g->production_count; p++) {
896 struct production *pr = g->productions[p];
899 if (pr->head->can_eol)
902 for (s = pr->body_size - 1; s >= 0; s--) {
903 if (pr->body[s]->can_eol) {
904 pr->head->can_eol = 1;
908 if (!pr->body[s]->nullable)
915 ### Building the `first` sets
917 When calculating what can follow a particular non-terminal, we will need to
918 know what the "first" terminal in any subsequent non-terminal might be. So
919 we calculate the `first` set for every non-terminal and store them in an
920 array. We don't bother recording the "first" set for terminals as they are
923 As the "first" for one symbol might depend on the "first" of another,
924 we repeat the calculations until no changes happen, just like with
925 `set_nullable`. This makes use of the fact that `symset_union`
926 reports if any change happens.
928 The core of this which finds the "first" of part of a production body
929 will be reused for computing the "follow" sets, so we split it out
930 into a separate function.
932 ###### grammar fields
933 struct symset *first;
937 static int add_first(struct production *pr, int start,
938 struct symset *target, struct grammar *g,
943 for (s = start; s < pr->body_size; s++) {
944 struct symbol *bs = pr->body[s];
945 if (bs->type == Terminal) {
946 if (symset_find(target, bs->num) < 0) {
947 symset_add(target, bs->num, 0);
951 } else if (symset_union(target, &g->first[bs->num]))
957 *to_end = (s == pr->body_size);
961 static void build_first(struct grammar *g)
965 g->first = calloc(g->num_syms, sizeof(g->first[0]));
966 for (s = 0; s < g->num_syms; s++)
967 g->first[s] = INIT_SYMSET;
969 while (check_again) {
972 for (p = 0; p < g->production_count; p++) {
973 struct production *pr = g->productions[p];
974 struct symset *head = &g->first[pr->head->num];
976 if (add_first(pr, 0, head, g, NULL))
982 ### Building the `follow` sets.
984 There are two different situations when we will want to generate "follow"
985 sets. If we are doing an SLR analysis, we want to generate a single
986 "follow" set for each non-terminal in the grammar. That is what is
987 happening here. If we are doing an LALR or LR analysis we will want
988 to generate a separate "LA" set for each item. We do that later
991 There are two parts to generating a "follow" set. Firstly we look at
992 every place that any non-terminal appears in the body of any
993 production, and we find the set of possible "first" symbols after
994 there. This is done using `add_first` above and only needs to be done
995 once as the "first" sets are now stable and will not change.
999 for (p = 0; p < g->production_count; p++) {
1000 struct production *pr = g->productions[p];
1003 for (b = 0; b < pr->body_size - 1; b++) {
1004 struct symbol *bs = pr->body[b];
1005 if (bs->type == Terminal)
1007 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1011 The second part is to add the "follow" set of the head of a production
1012 to the "follow" sets of the final symbol in the production, and any
1013 other symbol which is followed only by `nullable` symbols. As this
1014 depends on "follow" itself we need to repeatedly perform the process
1015 until no further changes happen.
1019 for (again = 0, p = 0;
1020 p < g->production_count;
1021 p < g->production_count-1
1022 ? p++ : again ? (again = 0, p = 0)
1024 struct production *pr = g->productions[p];
1027 for (b = pr->body_size - 1; b >= 0; b--) {
1028 struct symbol *bs = pr->body[b];
1029 if (bs->type == Terminal)
1031 if (symset_union(&g->follow[bs->num],
1032 &g->follow[pr->head->num]))
1039 We now just need to create and initialise the `follow` list to get a
1042 ###### grammar fields
1043 struct symset *follow;
1046 static void build_follow(struct grammar *g)
1049 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1050 for (s = 0; s < g->num_syms; s++)
1051 g->follow[s] = INIT_SYMSET;
1055 ### Building itemsets and states
1057 There are three different levels of detail that can be chosen for
1058 building the itemsets and states for the LR grammar. They are:
1060 1. LR(0) or SLR(1), where no look-ahead is considered.
1061 2. LALR(1) where we build look-ahead sets with each item and merge
1062 the LA sets when we find two paths to the same "kernel" set of items.
1063 3. LR(1) where different look-ahead for any item in the set means
1064 a different state must be created.
1066 ###### forward declarations
1067 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1069 We need to be able to look through existing states to see if a newly
1070 generated state already exists. For now we use a simple sorted linked
1073 An item is a pair of numbers: the production number and the position of
1074 "DOT", which is an index into the body of the production.
1075 As the numbers are not enormous we can combine them into a single "short"
1076 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1077 production number (so 4000 productions with maximum of 15 symbols in the
1080 Comparing the itemsets will be a little different to comparing symsets
1081 as we want to do the lookup after generating the "kernel" of an
1082 itemset, so we need to ignore the offset=zero items which are added during
1085 To facilitate this, we modify the "DOT" number so that "0" sorts to
1086 the end of the list in the symset, and then only compare items before
1090 static inline unsigned short item_num(int production, int index)
1092 return production | ((31-index) << 11);
1094 static inline int item_prod(unsigned short item)
1096 return item & 0x7ff;
1098 static inline int item_index(unsigned short item)
1100 return (31-(item >> 11)) & 0x1f;
1103 For LR(1) analysis we need to compare not just the itemset in a state
1104 but also the LA sets. As we assign each unique LA set a number, we
1105 can just compare the symset and the data values together.
1108 static int itemset_cmp(struct symset a, struct symset b,
1109 enum grammar_type type)
1115 i < a.cnt && i < b.cnt &&
1116 item_index(a.syms[i]) > 0 &&
1117 item_index(b.syms[i]) > 0;
1119 int diff = a.syms[i] - b.syms[i];
1123 diff = a.data[i] - b.data[i];
1128 if (i == a.cnt || item_index(a.syms[i]) == 0)
1132 if (i == b.cnt || item_index(b.syms[i]) == 0)
1138 if (type < LR1 || av == -1)
1141 a.data[i] - b.data[i];
1144 And now we can build the list of itemsets. The lookup routine returns
1145 both a success flag and a pointer to where in the list an insert
1146 should happen, so we don't need to search a second time.
1150 struct itemset *next;
1152 struct symset items;
1153 struct symset go_to;
1158 ###### grammar fields
1159 struct itemset *items;
1163 static int itemset_find(struct grammar *g, struct itemset ***where,
1164 struct symset kernel, enum grammar_type type)
1166 struct itemset **ip;
1168 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1169 struct itemset *i = *ip;
1171 diff = itemset_cmp(i->items, kernel, type);
1184 Adding an itemset may require merging the LA sets if LALR analysis is
1185 happening. If any new LA set adds symbols that weren't in the old LA set, we
1186 clear the `completed` flag so that the dependants of this itemset will be
1187 recalculated and their LA sets updated.
1189 `add_itemset` must consume the symsets it is passed, either by adding
1190 them to a data structure, or freeing them.
1192 static int add_itemset(struct grammar *g, struct symset ss,
1193 enum grammar_type type, int starts_line)
1195 struct itemset **where, *is;
1197 int found = itemset_find(g, &where, ss, type);
1199 is = calloc(1, sizeof(*is));
1200 is->state = g->states;
1204 is->go_to = INIT_DATASET;
1205 is->starts_line = starts_line;
1214 for (i = 0; i < ss.cnt; i++) {
1215 struct symset temp = INIT_SYMSET, s;
1216 if (ss.data[i] == is->items.data[i])
1218 s = set_find(g, is->items.data[i]);
1219 symset_union(&temp, &s);
1220 s = set_find(g, ss.data[i]);
1221 if (symset_union(&temp, &s)) {
1222 is->items.data[i] = save_set(g, temp);
1233 To build all the itemsets, we first insert the initial itemset made from the
1234 start symbol, complete each itemset, and then generate new itemsets from old
1235 until no new ones can be made.
1237 Completing an itemset means finding all the items where "DOT" is followed by
1238 a nonterminal and adding "DOT=0" items for every production from that
1239 non-terminal - providing each item hasn't already been added.
1241 If LA sets are needed, the LA set for each new item is found using
1242 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1243 be supplemented by the LA set for the item which produces the new item.
1245 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1246 is used in the next stage.
1248 NOTE: precedence handling should happen here - I haven't written this yet
1251 ###### complete itemset
1252 for (i = 0; i < is->items.cnt; i++) {
1253 int p = item_prod(is->items.syms[i]);
1254 int bs = item_index(is->items.syms[i]);
1255 struct production *pr = g->productions[p];
1258 struct symset LA = INIT_SYMSET;
1259 unsigned short sn = 0;
1261 if (bs == pr->body_size)
1264 if (symset_find(&done, s->num) < 0)
1265 symset_add(&done, s->num, 0);
1266 if (s->type != Nonterminal)
1272 add_first(pr, bs+1, &LA, g, &to_end);
1274 struct symset ss = set_find(g, is->items.data[i]);
1275 symset_union(&LA, &ss);
1277 sn = save_set(g, LA);
1278 LA = set_find(g, sn);
1281 /* Add productions for this symbol */
1282 for (p2 = s->first_production;
1283 p2 < g->production_count &&
1284 g->productions[p2]->head == s;
1286 int itm = item_num(p2, 0);
1287 int pos = symset_find(&is->items, itm);
1289 symset_add(&is->items, itm, sn);
1290 /* Will have re-ordered, so start
1291 * from beginning again */
1293 } else if (type >= LALR) {
1294 struct symset ss = set_find(g, is->items.data[pos]);
1295 struct symset tmp = INIT_SYMSET;
1297 symset_union(&tmp, &ss);
1298 if (symset_union(&tmp, &LA)) {
1299 is->items.data[pos] = save_set(g, tmp);
1307 For each symbol we found (and placed in `done`) we collect all the items for
1308 which this symbol is next, and create a new itemset with "DOT" advanced over
1309 the symbol. This is then added to the collection of itemsets (or merged
1310 with a pre-existing itemset).
1312 ###### derive itemsets
1313 // Now we have a completed itemset, so we need to
1314 // compute all the 'next' itemsets and create them
1315 // if they don't exist.
1316 for (i = 0; i < done.cnt; i++) {
1318 unsigned short state;
1319 int starts_line = 0;
1320 struct symbol *sym = g->symtab[done.syms[i]];
1321 struct symset newitemset = INIT_SYMSET;
1323 newitemset = INIT_DATASET;
1326 (sym->nullable && is->starts_line))
1328 for (j = 0; j < is->items.cnt; j++) {
1329 int itm = is->items.syms[j];
1330 int p = item_prod(itm);
1331 int bp = item_index(itm);
1332 struct production *pr = g->productions[p];
1333 unsigned short la = 0;
1336 if (bp == pr->body_size)
1338 if (pr->body[bp] != sym)
1341 la = is->items.data[j];
1342 pos = symset_find(&newitemset, pr->head->num);
1344 symset_add(&newitemset, item_num(p, bp+1), la);
1345 else if (type >= LALR) {
1346 // Need to merge la set.
1347 int la2 = newitemset.data[pos];
1349 struct symset ss = set_find(g, la2);
1350 struct symset LA = INIT_SYMSET;
1351 symset_union(&LA, &ss);
1352 ss = set_find(g, la);
1353 if (symset_union(&LA, &ss))
1354 newitemset.data[pos] = save_set(g, LA);
1360 state = add_itemset(g, newitemset, type, starts_line);
1361 if (symset_find(&is->go_to, done.syms[i]) < 0)
1362 symset_add(&is->go_to, done.syms[i], state);
1365 All that is left is to create the initial itemset from production zero, and
1366 with `TK_eof` as the LA set.
1369 static void build_itemsets(struct grammar *g, enum grammar_type type)
1371 struct symset first = INIT_SYMSET;
1374 unsigned short la = 0;
1376 // LA set just has eof
1377 struct symset eof = INIT_SYMSET;
1378 symset_add(&eof, TK_eof, 0);
1379 la = save_set(g, eof);
1380 first = INIT_DATASET;
1382 // production 0, offset 0 (with no data)
1383 symset_add(&first, item_num(0, 0), la);
1384 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1385 for (again = 0, is = g->items;
1387 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1389 struct symset done = INIT_SYMSET;
1400 ### Completing the analysis.
1402 The exact process of analysis depends on which level was requested. For
1403 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1404 `LR1` we need first, but not follow. For `SLR` we need both.
1406 We don't build the "action" tables that you might expect as the parser
1407 doesn't use them. They will be calculated to some extent if needed for
1410 Once we have built everything we allocate arrays for the two lists:
1411 symbols and itemsets. This allows more efficient access during reporting.
1412 The symbols are grouped as terminals and non-terminals and we record the
1413 changeover point in `first_nonterm`.
1415 ###### grammar fields
1416 struct symbol **symtab;
1417 struct itemset **statetab;
1420 ###### grammar_analyse
1422 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1426 int snum = TK_reserved;
1427 for (s = g->syms; s; s = s->next)
1428 if (s->num < 0 && s->type == Terminal) {
1432 g->first_nonterm = snum;
1433 for (s = g->syms; s; s = s->next)
1439 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1440 for (s = g->syms; s; s = s->next)
1441 g->symtab[s->num] = s;
1451 build_itemsets(g, type);
1453 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1454 for (is = g->items; is ; is = is->next)
1455 g->statetab[is->state] = is;
1458 ## Reporting on the Grammar
1460 The purpose of the report is to give the grammar developer insight into
1461 how the grammar parser will work. It is basically a structured dump of
1462 all the tables that have been generated, plus a description of any conflicts.
1464 ###### grammar_report
1465 static int grammar_report(struct grammar *g, enum grammar_type type)
1471 return report_conflicts(g, type);
1474 Firstly we have the complete list of symbols, together with the "FIRST"
1475 set if that was generated.
1479 static void report_symbols(struct grammar *g)
1483 printf("SYMBOLS + FIRST:\n");
1485 printf("SYMBOLS:\n");
1487 for (n = 0; n < g->num_syms; n++) {
1488 struct symbol *s = g->symtab[n];
1492 printf(" %c%c%3d%c: ",
1493 s->nullable ? '.':' ',
1494 s->can_eol ? '>':' ',
1495 s->num, symtypes[s->type]);
1498 printf(" (%d%s)", s->precedence,
1499 assoc_names[s->assoc]);
1501 if (g->first && s->type == Nonterminal) {
1504 for (j = 0; j < g->first[n].cnt; j++) {
1507 prtxt(g->symtab[g->first[n].syms[j]]->name);
1514 Then we have the follow sets, if they were computed.
1516 static void report_follow(struct grammar *g)
1519 printf("FOLLOW:\n");
1520 for (n = 0; n < g->num_syms; n++)
1521 if (g->follow[n].cnt) {
1525 prtxt(g->symtab[n]->name);
1526 for (j = 0; j < g->follow[n].cnt; j++) {
1529 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1535 And finally the item sets. These include the GO TO tables and, for
1536 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1537 it up a bit. First the items, with production number and associativity.
1539 static void report_item(struct grammar *g, int itm)
1541 int p = item_prod(itm);
1542 int dot = item_index(itm);
1543 struct production *pr = g->productions[p];
1547 prtxt(pr->head->name);
1549 for (i = 0; i < pr->body_size; i++) {
1550 printf(" %s", (dot == i ? ". ": ""));
1551 prtxt(pr->body[i]->name);
1553 if (dot == pr->body_size)
1557 printf(" (%d%s)", pr->precedence,
1558 assoc_names[pr->assoc]);
1562 The LA sets which are (possibly) reported with each item:
1564 static void report_la(struct grammar *g, int lanum)
1566 struct symset la = set_find(g, lanum);
1570 printf(" LOOK AHEAD(%d)", lanum);
1571 for (i = 0; i < la.cnt; i++) {
1574 prtxt(g->symtab[la.syms[i]]->name);
1579 Then the go to sets:
1582 static void report_goto(struct grammar *g, struct symset gt)
1587 for (i = 0; i < gt.cnt; i++) {
1589 prtxt(g->symtab[gt.syms[i]]->name);
1590 printf(" -> %d\n", gt.data[i]);
1594 Now we can report all the item sets complete with items, LA sets, and GO TO.
1596 static void report_itemsets(struct grammar *g)
1599 printf("ITEM SETS(%d)\n", g->states);
1600 for (s = 0; s < g->states; s++) {
1602 struct itemset *is = g->statetab[s];
1603 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1604 for (j = 0; j < is->items.cnt; j++) {
1605 report_item(g, is->items.syms[j]);
1606 if (is->items.data != NO_DATA)
1607 report_la(g, is->items.data[j]);
1609 report_goto(g, is->go_to);
1613 ### Reporting conflicts
1615 Conflict detection varies a lot among different analysis levels. However
1616 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1617 LR1 are also similar as they have FOLLOW or LA sets.
1621 ## conflict functions
1623 static int report_conflicts(struct grammar *g, enum grammar_type type)
1626 printf("Conflicts:\n");
1628 cnt = conflicts_lr0(g, type);
1630 cnt = conflicts_slr(g, type);
1632 printf(" - no conflicts\n");
1636 LR0 conflicts are any state which has both a reducible item and
1639 LR05 conflicts only occur if two possible reductions exist,
1640 as shifts always over-ride reductions.
1642 ###### conflict functions
1643 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1647 for (i = 0; i < g->states; i++) {
1648 struct itemset *is = g->statetab[i];
1649 int last_reduce = -1;
1650 int prev_reduce = -1;
1651 int last_shift = -1;
1655 for (j = 0; j < is->items.cnt; j++) {
1656 int itm = is->items.syms[j];
1657 int p = item_prod(itm);
1658 int bp = item_index(itm);
1659 struct production *pr = g->productions[p];
1661 if (bp == pr->body_size) {
1662 prev_reduce = last_reduce;
1666 if (pr->body[bp]->type == Terminal)
1669 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1670 printf(" State %d has both SHIFT and REDUCE:\n", i);
1671 report_item(g, is->items.syms[last_shift]);
1672 report_item(g, is->items.syms[last_reduce]);
1675 if (prev_reduce >= 0) {
1676 printf(" State %d has 2 (or more) reducible items\n", i);
1677 report_item(g, is->items.syms[prev_reduce]);
1678 report_item(g, is->items.syms[last_reduce]);
1685 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1686 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1687 in the source of the look ahead set.
1689 We build a dataset mapping terminal to item for possible SHIFTs and then
1690 another for possible REDUCE operations. We report when we get conflicts
1693 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1698 for (i = 0; i < g->states; i++) {
1699 struct itemset *is = g->statetab[i];
1700 struct symset shifts = INIT_DATASET;
1701 struct symset reduce = INIT_DATASET;
1705 /* First collect the shifts */
1706 for (j = 0; j < is->items.cnt; j++) {
1707 unsigned short itm = is->items.syms[j];
1708 int p = item_prod(itm);
1709 int bp = item_index(itm);
1710 struct production *pr = g->productions[p];
1712 if (bp < pr->body_size &&
1713 pr->body[bp]->type == Terminal) {
1715 int sym = pr->body[bp]->num;
1716 if (symset_find(&shifts, sym) < 0)
1717 symset_add(&shifts, sym, itm);
1720 /* Now look for reduction and conflicts */
1721 for (j = 0; j < is->items.cnt; j++) {
1722 unsigned short itm = is->items.syms[j];
1723 int p = item_prod(itm);
1724 int bp = item_index(itm);
1725 struct production *pr = g->productions[p];
1727 if (bp < pr->body_size)
1732 la = g->follow[pr->head->num];
1734 la = set_find(g, is->items.data[j]);
1736 for (k = 0; k < la.cnt; k++) {
1737 int pos = symset_find(&shifts, la.syms[k]);
1739 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1740 prtxt(g->symtab[la.syms[k]]->name);
1742 report_item(g, shifts.data[pos]);
1743 report_item(g, itm);
1746 pos = symset_find(&reduce, la.syms[k]);
1748 symset_add(&reduce, la.syms[k], itm);
1751 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1752 prtxt(g->symtab[la.syms[k]]->name);
1754 report_item(g, itm);
1755 report_item(g, reduce.data[pos]);
1759 symset_free(shifts);
1760 symset_free(reduce);
1766 ## Generating the parser
1768 The exported part of the parser is the `parse_XX` function, where the name
1769 `XX` is based on the name of the parser files.
1771 This takes a `code_node`, a partially initialized `token_config`, and an
1772 optional `FILE` to send tracing to. The `token_config` gets the list of
1773 known words added and then is used with the `code_node` to initialize the
1776 `parse_XX` then call the library function `parser_run` to actually complete
1777 the parse. This needs the `states` table and function to call the various
1778 pieces of code provided in the grammar file, so they are generated first.
1780 ###### parser_generate
1782 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1788 gen_reduce(f, g, file);
1791 fprintf(f, "#line 0 \"gen_parser\"\n");
1792 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1795 fprintf(f, "\tstruct token_state *tokens;\n");
1796 fprintf(f, "\tconfig->words_marks = known;\n");
1797 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1798 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1799 fprintf(f, "\ttokens = token_open(code, config);\n");
1800 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1801 fprintf(f, "\ttoken_close(tokens);\n");
1802 fprintf(f, "\treturn rv;\n");
1803 fprintf(f, "}\n\n");
1806 ### Table words table
1808 The known words table is simply an array of terminal symbols.
1809 The table of nonterminals used for tracing is a similar array.
1813 static void gen_known(FILE *f, struct grammar *g)
1816 fprintf(f, "#line 0 \"gen_known\"\n");
1817 fprintf(f, "static const char *known[] = {\n");
1818 for (i = TK_reserved;
1819 i < g->num_syms && g->symtab[i]->type == Terminal;
1821 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1822 g->symtab[i]->name.txt);
1823 fprintf(f, "};\n\n");
1826 static void gen_non_term(FILE *f, struct grammar *g)
1829 fprintf(f, "#line 0 \"gen_non_term\"\n");
1830 fprintf(f, "static const char *non_term[] = {\n");
1831 for (i = TK_reserved;
1834 if (g->symtab[i]->type == Nonterminal)
1835 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1836 g->symtab[i]->name.txt);
1837 fprintf(f, "};\n\n");
1840 ### States and the goto tables.
1842 For each state we record the goto table, the reducible production if
1843 there is one, or a symbol to shift for error recovery.
1844 Some of the details of the reducible production are stored in the
1845 `do_reduce` function to come later. Here we store the production number,
1846 the body size (useful for stack management) and the resulting symbol (useful
1847 for knowing how to free data later).
1849 The go to table is stored in a simple array of `sym` and corresponding
1852 ###### exported types
1860 const struct lookup * go_to;
1871 static void gen_goto(FILE *f, struct grammar *g)
1874 fprintf(f, "#line 0 \"gen_goto\"\n");
1875 for (i = 0; i < g->states; i++) {
1877 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1879 struct symset gt = g->statetab[i]->go_to;
1880 for (j = 0; j < gt.cnt; j++)
1881 fprintf(f, "\t{ %d, %d },\n",
1882 gt.syms[j], gt.data[j]);
1889 static void gen_states(FILE *f, struct grammar *g)
1892 fprintf(f, "#line 0 \"gen_states\"\n");
1893 fprintf(f, "static const struct state states[] = {\n");
1894 for (i = 0; i < g->states; i++) {
1895 struct itemset *is = g->statetab[i];
1896 int j, prod = -1, prod_len;
1898 int shift_len = 0, shift_remain = 0;
1899 for (j = 0; j < is->items.cnt; j++) {
1900 int itm = is->items.syms[j];
1901 int p = item_prod(itm);
1902 int bp = item_index(itm);
1903 struct production *pr = g->productions[p];
1905 if (bp < pr->body_size) {
1906 if (shift_sym < 0 ||
1907 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1908 shift_sym = pr->body[bp]->num;
1910 shift_remain = pr->body_size - bp;
1914 /* This is what we reduce */
1915 if (prod < 0 || prod_len < pr->body_size) {
1917 prod_len = pr->body_size;
1922 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1923 i, is->go_to.cnt, i, prod,
1924 g->productions[prod]->body_size,
1925 g->productions[prod]->head->num,
1928 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1929 i, is->go_to.cnt, i, shift_sym,
1932 fprintf(f, "};\n\n");
1935 ### The `do_reduce` function and the code
1937 When the parser engine decides to reduce a production, it calls `do_reduce`.
1938 This has two functions.
1940 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1941 production being reduced. Secondly it runs the code that was included with
1942 the production if any.
1944 This code needs to be able to store data somewhere. Rather than requiring
1945 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1946 `do_reduce` return the size to be saved.
1948 The code fragment requires translation when written out. Any `$N` needs to
1949 be converted to a reference either to that buffer (if `$0`) or to the
1950 structure returned by a previous reduction. These pointers need to be cast
1951 to the appropriate type for each access. All this is handled in
1957 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1960 fprintf(f, "\t\t\t");
1961 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1970 if (*c < '0' || *c > '9') {
1975 while (c[1] >= '0' && c[1] <= '9') {
1977 n = n * 10 + *c - '0';
1980 fprintf(f, "(*(struct %.*s*)ret)",
1981 p->head->struct_name.len,
1982 p->head->struct_name.txt);
1983 else if (n > p->body_size)
1984 fprintf(f, "$%d", n);
1985 else if (p->body[n-1]->type == Terminal)
1986 fprintf(f, "(*(struct token *)body[%d])",
1988 else if (p->body[n-1]->struct_name.txt == NULL)
1989 fprintf(f, "$%d", n);
1991 fprintf(f, "(*(struct %.*s*)body[%d])",
1992 p->body[n-1]->struct_name.len,
1993 p->body[n-1]->struct_name.txt, n-1);
2000 static void gen_reduce(FILE *f, struct grammar *g, char *file)
2003 fprintf(f, "#line 0 \"gen_reduce\"\n");
2004 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2006 fprintf(f, "\tint ret_size = 0;\n");
2008 fprintf(f, "\tswitch(prod) {\n");
2009 for (i = 0; i < g->production_count; i++) {
2010 struct production *p = g->productions[i];
2011 fprintf(f, "\tcase %d:\n", i);
2016 if (p->head->struct_name.txt)
2017 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
2018 p->head->struct_name.len,
2019 p->head->struct_name.txt);
2021 fprintf(f, "\t\tbreak;\n");
2023 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2028 As each non-terminal can potentially cause a different type of data
2029 structure to be allocated and filled in, we need to be able to free it when
2032 It is particularly important to have fine control over freeing during error
2033 recovery where individual stack frames might need to be freed.
2035 For this, the grammar author is required to define a `free_XX` function for
2036 each structure that is used by a non-terminal. `do_free` will call whichever
2037 is appropriate given a symbol number, and will call `free` (as is
2038 appropriate for tokens) on any terminal symbol.
2042 static void gen_free(FILE *f, struct grammar *g)
2046 fprintf(f, "#line 0 \"gen_free\"\n");
2047 fprintf(f, "static void do_free(short sym, void *asn)\n");
2049 fprintf(f, "\tif (!asn) return;\n");
2050 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2051 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2052 fprintf(f, "\tswitch(sym) {\n");
2054 for (i = 0; i < g->num_syms; i++) {
2055 struct symbol *s = g->symtab[i];
2057 s->type != Nonterminal ||
2058 s->struct_name.txt == NULL)
2061 fprintf(f, "\tcase %d:\n", s->num);
2062 fprintf(f, "\t\tfree_%.*s(asn);\n",
2064 s->struct_name.txt);
2065 fprintf(f, "\t\tbreak;\n");
2067 fprintf(f, "\t}\n}\n\n");
2070 ## The main routine.
2072 There are three key parts to the "main" function of parsergen: processing
2073 the arguments, loading the grammar file, and dealing with the grammar.
2075 ### Argument processing.
2077 Command line options allow the selection of analysis mode, name of output
2078 file, and whether or not a report should be generated. By default we create
2079 a report only if no code output was requested.
2081 The `parse_XX` function name uses the basename of the output file which
2082 should not have a suffix (`.c`). `.c` is added to the given name for the
2083 code, and `.h` is added for the header (if header text is specified in the
2090 static const struct option long_options[] = {
2091 { "LR0", 0, NULL, '0' },
2092 { "LR05", 0, NULL, '5' },
2093 { "SLR", 0, NULL, 'S' },
2094 { "LALR", 0, NULL, 'L' },
2095 { "LR1", 0, NULL, '1' },
2096 { "tag", 1, NULL, 't' },
2097 { "report", 0, NULL, 'R' },
2098 { "output", 1, NULL, 'o' },
2099 { NULL, 0, NULL, 0 }
2101 const char *options = "05SL1t:Ro:";
2103 ###### process arguments
2105 char *outfile = NULL;
2110 enum grammar_type type = LR05;
2111 while ((opt = getopt_long(argc, argv, options,
2112 long_options, NULL)) != -1) {
2127 outfile = optarg; break;
2129 tag = optarg; break;
2131 fprintf(stderr, "Usage: parsergen ...\n");
2136 infile = argv[optind++];
2138 fprintf(stderr, "No input file given\n");
2141 if (outfile && report == 1)
2144 if (name && strchr(name, '/'))
2145 name = strrchr(name, '/')+1;
2147 if (optind < argc) {
2148 fprintf(stderr, "Excess command line arguments\n");
2152 ### Loading the grammar file
2154 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2157 Once we have extracted the code (with `mdcode`) we expect to find three
2158 sections: header, code, and grammar. Anything else is an error.
2160 "header" and "code" are optional, though it is hard to build a working
2161 parser with neither. "grammar" must be provided.
2165 #include <sys/mman.h>
2170 static void pr_err(char *msg)
2173 fprintf(stderr, "%s\n", msg);
2177 struct section *table;
2181 fd = open(infile, O_RDONLY);
2183 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2184 infile, strerror(errno));
2187 len = lseek(fd, 0, 2);
2188 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2189 table = code_extract(file, file+len, pr_err);
2191 struct code_node *hdr = NULL;
2192 struct code_node *code = NULL;
2193 struct code_node *gram = NULL;
2194 for (s = table; s; s = s->next) {
2195 struct text sec = s->section;
2196 if (tag && !strip_tag(&sec, tag))
2198 if (text_is(sec, "header"))
2200 else if (text_is(sec, "code"))
2202 else if (text_is(sec, "grammar"))
2205 fprintf(stderr, "Unknown content section: %.*s\n",
2206 s->section.len, s->section.txt);
2211 ### Processing the input
2213 As we need to append an extension to a filename and then open it for
2214 writing, and we need to do this twice, it helps to have a separate function.
2218 static FILE *open_ext(char *base, char *ext)
2220 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2222 strcat(strcpy(fn, base), ext);
2228 If we can read the grammar, then we analyse and optionally report on it. If
2229 the report finds conflicts we will exit with an error status.
2231 ###### process input
2232 struct grammar *g = NULL;
2234 fprintf(stderr, "No grammar section provided\n");
2237 g = grammar_read(gram);
2239 fprintf(stderr, "Failure to parse grammar\n");
2244 grammar_analyse(g, type);
2246 if (grammar_report(g, type))
2250 If a headers section is defined, we write it out and include a declaration
2251 for the `parse_XX` function so it can be used from separate code.
2253 if (rv == 0 && hdr && outfile) {
2254 FILE *f = open_ext(outfile, ".h");
2256 code_node_print(f, hdr, infile);
2257 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2261 fprintf(stderr, "Cannot create %s.h\n",
2267 And if all goes well and an output file was provided, we create the `.c`
2268 file with the code section (if any) and the parser tables and function.
2270 if (rv == 0 && outfile) {
2271 FILE *f = open_ext(outfile, ".c");
2274 code_node_print(f, code, infile);
2275 gen_parser(f, g, infile, name);
2278 fprintf(stderr, "Cannot create %s.c\n",
2284 And that about wraps it up. We need to set the locale so that UTF-8 is
2285 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2287 ###### File: parsergen.mk
2288 parsergen : parsergen.o libscanner.o libmdcode.o
2289 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2296 int main(int argc, char *argv[])
2301 setlocale(LC_ALL,"");
2303 ## process arguments
2310 ## The SHIFT/REDUCE parser
2312 Having analysed the grammar and generated all the table, we only need the
2313 shift/reduce engine to bring it all together.
2315 ### Goto table lookup
2317 The parser generator has nicely provided us with goto tables sorted by
2318 symbol number. We need a binary search function to find a symbol in the
2321 ###### parser functions
2323 static int search(const struct state *l, int sym)
2326 int hi = l->go_to_cnt;
2330 while (lo + 1 < hi) {
2331 int mid = (lo + hi) / 2;
2332 if (l->go_to[mid].sym <= sym)
2337 if (l->go_to[lo].sym == sym)
2338 return l->go_to[lo].state;
2343 ### The state stack.
2345 The core data structure for the parser is the stack. This tracks all the
2346 symbols that have been recognised or partially recognised.
2348 The stack usually won't grow very large - maybe a few tens of entries. So
2349 we dynamically resize the array as required but never bother to shrink it
2352 We keep the stack as two separate allocations. One, `asn_stack` stores the
2353 "abstract syntax nodes" which are created by each reduction. When we call
2354 `do_reduce` we need to pass an array of the `asn`s of the body of the
2355 production, and by keeping a separate `asn` stack, we can just pass a
2356 pointer into this stack.
2358 The other allocation stores all other stack fields of which there are four.
2359 The `state` is the most important one and guides the parsing process. The
2360 `sym` is nearly unnecessary. However when we want to free entries from the
2361 `asn_stack`, it helps to know what type they are so we can call the right
2362 freeing function. The symbol leads us to the right free function through
2365 The `indents` count and the `starts_indented` flag track the line
2366 indents in the symbol. These are used to allow indent information to
2367 guide parsing and error recovery.
2369 As well as the stack of frames we have a `next` frame which is
2370 assembled from the incoming token and other information prior to
2371 pushing it onto the stack.
2373 ###### parser functions
2379 short starts_indented;
2381 short newline_permitted;
2390 Two operations are needed on the stack - shift (which is like push) and pop.
2392 Shift applies not only to terminals but also to non-terminals. When we
2393 reduce a production we will pop off entries corresponding to the body
2394 symbols, then push on an item for the head of the production. This last is
2395 exactly the same process as shifting in a terminal so we use the same
2398 To simplify other code we arrange for `shift` to fail if there is no `goto`
2399 state for the symbol. This is useful in basic parsing due to our design
2400 that we shift when we can, and reduce when we cannot. So the `shift`
2401 function reports if it could.
2403 So `shift` finds the next state. If that succeed it extends the allocations
2404 if needed and pushes all the information onto the stacks.
2406 ###### parser functions
2408 static int shift(struct parser *p,
2410 const struct state states[])
2412 // Push an entry onto the stack
2413 int newstate = search(&states[p->next.state], p->next.sym);
2416 if (p->tos >= p->stack_size) {
2417 p->stack_size += 10;
2418 p->stack = realloc(p->stack, p->stack_size
2419 * sizeof(p->stack[0]));
2420 p->asn_stack = realloc(p->asn_stack, p->stack_size
2421 * sizeof(p->asn_stack[0]));
2423 p->stack[p->tos] = p->next;
2424 p->asn_stack[p->tos] = asn;
2426 p->next.state = newstate;
2427 p->next.indents = 0;
2428 p->next.starts_indented = 0;
2429 // if new state doesn't start a line, we inherit newline_permitted status
2430 if (states[newstate].starts_line)
2431 p->next.newline_permitted = 1;
2435 `pop` simply moves the top of stack (`tos`) back down the required amount
2436 and frees any `asn` entries that need to be freed. It is called _after_ we
2437 reduce a production, just before we `shift` the nonterminal in.
2439 ###### parser functions
2441 static void pop(struct parser *p, int num,
2442 void(*do_free)(short sym, void *asn))
2446 for (i = 0; i < num; i++) {
2447 p->next.indents += p->stack[p->tos+i].indents;
2448 do_free(p->stack[p->tos+i].sym,
2449 p->asn_stack[p->tos+i]);
2453 p->next.state = p->stack[p->tos].state;
2454 p->next.starts_indented = p->stack[p->tos].starts_indented;
2455 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2456 if (p->next.indents > p->next.starts_indented)
2457 p->next.newline_permitted = 0;
2461 ### Memory allocation
2463 The `scanner` returns tokens in a local variable - we want them in allocated
2464 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2465 a reduce is in a large buffer. Both of these require some allocation and
2466 copying, hence `memdup` and `tokcopy`.
2468 ###### parser includes
2471 ###### parser functions
2473 void *memdup(void *m, int len)
2479 memcpy(ret, m, len);
2483 static struct token *tok_copy(struct token tk)
2485 struct token *new = malloc(sizeof(*new));
2490 ### The heart of the parser.
2492 Now we have the parser. If we can shift, we do. If not and we can reduce
2493 we do. If the production we reduced was production zero, then we have
2494 accepted the input and can finish.
2496 We return whatever `asn` was returned by reducing production zero.
2498 If we can neither shift nor reduce we have an error to handle. We pop
2499 single entries off the stack until we can shift the `TK_error` symbol, then
2500 drop input tokens until we find one we can shift into the new error state.
2502 When we find `TK_in` and `TK_out` tokens which report indents we need
2503 to handle them directly as the grammar cannot express what we want to
2506 `TK_in` tokens are easy: we simply update the `next` stack frame to
2507 record how many indents there are and that the next token started with
2510 `TK_out` tokens must either be counted off against any pending indent,
2511 or must force reductions until there is a pending indent which isn't
2512 at the start of a production.
2514 ###### parser includes
2517 void *parser_run(struct token_state *tokens,
2518 const struct state states[],
2519 int (*do_reduce)(int, void**, void*),
2520 void (*do_free)(short, void*),
2521 FILE *trace, const char *non_term[], int knowns)
2523 struct parser p = { 0 };
2524 struct token *tk = NULL;
2528 p.next.newline_permitted = states[0].starts_line;
2530 struct token *err_tk;
2532 tk = tok_copy(token_next(tokens));
2533 p.next.sym = tk->num;
2535 parser_trace(trace, &p, tk, states, non_term, knowns);
2537 if (p.next.sym == TK_in) {
2538 p.next.starts_indented = 1;
2544 if (p.next.sym == TK_out) {
2545 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2546 (p.stack[p.tos-1].indents == 1 &&
2547 states[p.next.state].reduce_size > 1)) {
2548 p.stack[p.tos-1].indents -= 1;
2549 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2550 // no internal indent any more, reassess 'newline_permitted'
2551 if (states[p.stack[p.tos-1].state].starts_line)
2552 p.stack[p.tos-1].newline_permitted = 1;
2554 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2560 // fall through and force a REDUCE (as 'shift'
2563 if (p.next.sym == TK_newline) {
2564 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2570 if (shift(&p, tk, states)) {
2574 if (states[p.next.state].reduce_prod >= 0) {
2576 int prod = states[p.next.state].reduce_prod;
2577 int size = states[p.next.state].reduce_size;
2579 static char buf[16*1024];
2580 p.next.sym = states[p.next.state].reduce_sym;
2582 body = p.asn_stack +
2583 (p.tos - states[p.next.state].reduce_size);
2585 bufsize = do_reduce(prod, body, buf);
2587 pop(&p, size, do_free);
2588 shift(&p, memdup(buf, bufsize), states);
2593 if (tk->num == TK_out) {
2594 // Indent problem - synthesise tokens to get us
2596 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2597 p.next.sym = states[p.next.state].shift_sym;
2598 shift(&p, tok_copy(*tk), states);
2599 // FIXME need to report this error somehow
2602 /* Error. We walk up the stack until we
2603 * find a state which will accept TK_error.
2604 * We then shift in TK_error and see what state
2605 * that takes us to.
2606 * Then we discard input tokens until
2607 * we find one that is acceptable.
2610 err_tk = tok_copy(*tk);
2611 p.next.sym = TK_error;
2612 while (shift(&p, err_tk, states) == 0
2614 // discard this state
2615 pop(&p, 1, do_free);
2618 // no state accepted TK_error
2621 while (search(&states[p.next.state], tk->num) < 0 &&
2622 tk->num != TK_eof) {
2624 tk = tok_copy(token_next(tokens));
2625 if (tk->num == TK_in)
2626 p.next.indents += 1;
2627 if (tk->num == TK_out) {
2628 if (p.next.indents == 0)
2630 p.next.indents -= 1;
2633 if (p.tos == 0 && tk->num == TK_eof)
2638 ret = p.asn_stack[0];
2640 pop(&p, p.tos, do_free);
2646 ###### exported functions
2647 void *parser_run(struct token_state *tokens,
2648 const struct state states[],
2649 int (*do_reduce)(int, void**, void*),
2650 void (*do_free)(short, void*),
2651 FILE *trace, const char *non_term[], int knowns);
2655 Being able to visualize the parser in action can be invaluable when
2656 debugging the parser code, or trying to understand how the parse of a
2657 particular grammar progresses. The stack contains all the important
2658 state, so just printing out the stack every time around the parse loop
2659 can make it possible to see exactly what is happening.
2661 This doesn't explicitly show each SHIFT and REDUCE action. However they
2662 are easily deduced from the change between consecutive lines, and the
2663 details of each state can be found by cross referencing the states list
2664 in the stack with the "report" that parsergen can generate.
2666 For terminal symbols, we just dump the token. For non-terminals we
2667 print the name of the symbol. The look ahead token is reported at the
2668 end inside square brackets.
2670 ###### parser functions
2672 static char *reserved_words[] = {
2673 [TK_error] = "ERROR",
2676 [TK_newline] = "NEWLINE",
/* Print one stack frame compactly: "(state:indents" (':' if the frame
 * starts indented, '.' otherwise), then flag characters -- 's' when
 * the state starts a line, 'n' when a newline is permitted -- and a
 * closing paren.  One such group is printed per frame by parser_trace. */
2679 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2681 fprintf(trace, "(%d", f->state);
2683 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2685 if (states[f->state].starts_line)
2686 fprintf(trace, "s");
2687 if (f->newline_permitted)
2688 fprintf(trace, "n");
2689 fprintf(trace, ") ");
/* Dump the entire parse stack to 'trace': for each frame, its state
 * (via parser_trace_state) followed by its symbol -- a reserved-word
 * name, the token text for known terminals, or the non-terminal name.
 * Ends with the pending state and the look-ahead token in brackets. */
2692 void parser_trace(FILE *trace, struct parser *p,
2693 struct token *tk, const struct state states[],
2694 const char *non_term[], int knowns)
2697 for (i = 0; i < p->tos; i++) {
2698 int sym = p->stack[i].sym;
2699 parser_trace_state(trace, &p->stack[i], states);
2700 if (sym < TK_reserved &&
2701 reserved_words[sym] != NULL)
2702 fputs(reserved_words[sym], trace);
2703 else if (sym < TK_reserved + knowns) {
// Known terminal: show up to 20 characters of the token's text.
2704 struct token *t = p->asn_stack[i];
2705 text_dump(trace, t->txt, 20);
// Otherwise a non-terminal: index past the terminals for its name.
2707 fputs(non_term[sym - TK_reserved - knowns],
// Finally the pending (next) state and the look-ahead in brackets.
2711 parser_trace_state(trace, &p->next, states);
2712 fprintf(trace, " [");
2713 if (tk->num < TK_reserved &&
2714 reserved_words[tk->num] != NULL)
2715 fputs(reserved_words[tk->num], trace);
2717 text_dump(trace, tk->txt, 20);
2718 fputs("]\n", trace);
2723 The obvious example for a parser is a calculator.
2725 As `scanner` provides a number parsing function using `libgmp`, it is not
2726 much work to perform arbitrary rational number calculations.
2728 This calculator takes one expression, or an equality test, per line. The
2729 results are printed and if any equality test fails, the program exits with
2732 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2733 better approach, but as the grammar file must have 3 components I need
2734 something like this.
2736 ###### File: parsergen.mk
# Build the calculator demo: extract the "calc"-tagged grammar from this
# document with parsergen, then link against the parser/scanner/mdcode/
# number libraries (ICU for unicode, GMP for rational arithmetic).
2737 calc.c calc.h : parsergen parsergen.mdc
2738 ./parsergen --tag calc -o calc parsergen.mdc
2739 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2740 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2745 // what do we use for a demo-grammar? A calculator of course.
2757 #include <sys/mman.h>
2762 #include "scanner.h"
2768 static void free_number(struct number *n)
/* Demo driver: map the file named in argv[1] and run the calculator
 * over its extracted code section.  Tracing goes to stderr when an
 * extra command-line argument is given.
 * NOTE(review): open/lseek/mmap/code_extract results are unchecked --
 * a missing or empty file will crash; presumably acceptable for a
 * test program, but worth confirming. */
2774 int main(int argc, char *argv[])
2776 int fd = open(argv[1], O_RDONLY);
2777 int len = lseek(fd, 0, 2); /* 2 == SEEK_END: use the offset as file length */
2778 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2779 struct section *s = code_extract(file, file+len, NULL);
2780 struct token_config config = {
2781 .ignored = (1 << TK_line_comment)
2782 | (1 << TK_block_comment)
2785 .number_chars = ".,_+-",
2789 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2795 Session -> Session Line
2798 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2799 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2800 gmp_printf(" or as a decimal: %Fg\n", fl);
2804 | Expression = Expression NEWLINE ${
2805 if (mpq_equal($1.val, $3.val))
2806 gmp_printf("Both equal %Qd\n", $1.val);
2808 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2813 | NEWLINE ${ printf("Blank line\n"); }$
2814 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2817 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2818 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2819 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2821 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2822 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2823 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2825 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2826 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$