1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. Finally `calc` is a test grammar for a simple calculator. The
21 `parsergen` program built from the C code in this file can extract
22 that grammar directly from this file and process it.
25 ###### File: parsergen.c
30 ## forward declarations
41 ###### File: libparser.c
48 ###### File: parsergen.mk
51 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
54 ## Reading the grammar
56 The grammar must be stored in a markdown literate code file as parsed
57 by "[mdcode][]". It must have three top level (i.e. unreferenced)
58 sections: `header`, `code`, and `grammar`. The first two will be
59 literally copied into the generated `.c` and `.h`. files. The last
60 contains the grammar. This is tokenised with "[scanner][]".
62 If the `--tag` option is given, then any top level header that doesn't
63 start with the tag is ignored, and the tag is stripped from the rest. So
65 means that the three needed sections must be `Foo: header`, `Foo: code`,
69 [scanner]: scanner.html
75 ###### parser includes
79 The grammar contains production sets, precedence/associativity
80 declarations, and data type declarations. These are all parsed with
81 _ad hoc_ parsing as we don't have a parser generator yet.
83 The precedence and associativity can be set for each production, but
84 can be inherited from symbols. The data type is potentially defined
85 for each non-terminal and describes what C structure is used to store
86 information about each symbol.
89 enum assoc {Left, Right, Non};
90 char *assoc_names[] = {"Left","Right","Non"};
93 struct text struct_name;
95 unsigned short precedence;
99 unsigned short precedence;
107 The strings reported by `mdcode` and `scanner` are `struct text` which have
108 length rather than being null terminated. To help with printing and
109 comparing we define `text_is` and `prtxt`, which should possibly go in
110 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
111 which might contain control characters.
113 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
114 because that is what we need to detect tags.
117 static int text_is(struct text t, char *s)
119 return (strlen(s) == t.len &&
120 strncmp(s, t.txt, t.len) == 0);
122 static void prtxt(struct text t)
124 printf("%.*s", t.len, t.txt);
127 static int strip_tag(struct text *t, char *tag)
129 int skip = strlen(tag) + 1;
130 if (skip >= t->len ||
131 strncmp(t->txt, tag, skip-1) != 0 ||
132 t->txt[skip-1] != ':')
134 while (skip < t->len && t->txt[skip] == ' ')
143 Productions are comprised primarily of symbols - terminal and
144 non-terminal. We do not make a syntactic distinction between the two,
145 though non-terminals must be identifiers. Non-terminal symbols are
146 those which appear in the head of a production, terminal symbols are
147 those which don't. There are also "virtual" symbols used for precedence
148 marking discussed later, and sometimes we won't know what type a symbol
151 ###### forward declarations
152 enum symtype { Unknown, Virtual, Terminal, Nonterminal };	// a symbol is Unknown until a defining use classifies it
153 char *symtypes = "UVTN";	// one-character tag per symtype, indexed by the enum value
157 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
158 table of known symbols and the resulting parser will report them as
159 `TK_reserved + N`. A small set of identifiers are reserved for the
160 different token types that `scanner` can report.
163 static char *reserved_words[] = {
164 [TK_error] = "ERROR",
165 [TK_number] = "NUMBER",
166 [TK_ident] = "IDENTIFIER",
168 [TK_string] = "STRING",
169 [TK_multi_string] = "MULTI_STRING",
172 [TK_newline] = "NEWLINE",
178 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
179 recognised. The former is automatically expected at the end of the text
180 being parsed. The latter are always ignored by the parser.
182 All of these symbols are stored in a simple symbol table. We use the
183 `struct text` from `mdcode` to store the name and link them together
184 into a sorted list using an insertion sort.
186 We don't have separate `find` and `insert` functions as any symbol we
187 find needs to be remembered. We simply expect `find` to always return a
188 symbol, but its type might be `Unknown`.
197 ###### grammar fields
202 static int text_cmp(struct text a, struct text b)
207 int cmp = strncmp(a.txt, b.txt, len);
211 return a.len - b.len;
214 static struct symbol *sym_find(struct grammar *g, struct text s)
216 struct symbol **l = &g->syms;
221 (cmp = text_cmp((*l)->name, s)) < 0)
225 n = calloc(1, sizeof(*n));
234 static void symbols_init(struct grammar *g)
236 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
238 for (i = 0; i < entries; i++) {
241 t.txt = reserved_words[i];
244 t.len = strlen(t.txt);
251 ### Data types and precedence.
253 Data type specification and precedence specification are both
254 introduced by a dollar sign at the start of the line. If the next
255 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
256 precedence, otherwise it specifies a data type.
258 The data type name is simply stored and applied to the head of all
259 subsequent productions. It must be the name of a structure, so `$expr`
260 maps to `struct expr`.
262 Any productions given before the first data type will have no data type
263 and can carry no information. In order to allow other non-terminals to
264 have no type, the data type `$void` can be given. This does *not* mean
265 that `struct void` will be used, but rather that no type will be
266 associated with future non-terminals.
268 The precedence line must contain a list of symbols - typically
269 terminal symbols, but not necessarily. It can only contain symbols
270 that have not been seen yet, so precedence declaration must precede
271 the productions that they affect.
273 A precedence line may also contain a "virtual" symbol which is an
274 identifier preceded by `$$`. Virtual terminals carry precedence
275 information but are not included in the grammar. A production can
276 declare that it inherits the precedence of a given virtual symbol.
278 This use for `$$` precludes it from being used as a symbol in the
279 described language. Two other symbols: `${` and `}$` are also
282 Each new precedence line introduces a new precedence level and
283 declares how it associates. This level is stored in each symbol
284 listed and may be inherited by any production which uses the symbol. A
285 production inherits from the last symbol which has a precedence.
287 ###### grammar fields
288 struct text current_type;
292 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };	// token numbers for the three marks below, in matching order
293 static const char *known[] = { "$$", "${", "}$" };	// "$$" -> TK_virtual, "${" -> TK_open, "}$" -> TK_close
296 static char *dollar_line(struct token_state *ts, struct grammar *g)
298 struct token t = token_next(ts);
303 if (t.num != TK_ident) {
304 err = "type or assoc expected after '$'";
307 if (text_is(t.txt, "LEFT"))
309 else if (text_is(t.txt, "RIGHT"))
311 else if (text_is(t.txt, "NON"))
314 g->current_type = t.txt;
315 if (text_is(t.txt, "void"))
316 g->current_type.txt = NULL;
318 if (t.num != TK_newline) {
319 err = "Extra tokens after type name";
325 // This is a precedence line, need some symbols.
329 while (t.num != TK_newline) {
330 enum symtype type = Terminal;
332 if (t.num == TK_virtual) {
335 if (t.num != TK_ident) {
336 err = "$$ must be followed by a word";
339 } else if (t.num != TK_ident &&
341 err = "Illegal token in precedence line";
344 s = sym_find(g, t.txt);
345 if (s->type != Unknown) {
346 err = "Symbols in precedence line must not already be known.";
350 s->precedence = g->prec_levels;
355 err = "No symbols given on precedence line";
359 while (t.num != TK_newline && t.num != TK_eof)
366 A production either starts with an identifier which is the head
367 non-terminal, or a vertical bar (`|`) in which case this production
368 uses the same head as the previous one. The identifier must be
369 followed by a `->` mark. All productions for a given non-terminal must
370 be grouped together with the `nonterminal ->` given only once.
372 After this a (possibly empty) sequence of identifiers and marks form
373 the body of the production. A virtual symbol may be given after the
374 body by preceding it with `$$`. If a virtual symbol is given, the
375 precedence of the production is that for the virtual symbol. If none
376 is given, the precedence is inherited from the last symbol in the
377 production which has a precedence specified.
379 After the optional precedence may come the `${` mark. This indicates
380 the start of a code fragment. If present, this must be on the same
381 line as the start of the production.
383 All of the text from the `${` through to the matching `}$` is
384 collected and forms the code-fragment for the production. It must all
385 be in one `code_node` of the literate code. The `}$` must be
386 at the end of a line.
388 Text in the code fragment will undergo substitutions where `$N` for
389 some numeric `N` will be replaced with a variable holding the parse
390 information for the particular symbol in the production. `$0` is the
391 head of the production, `$1` is the first symbol of the body, etc.
392 The type of `$N` for a terminal symbol is `struct token`. For
393 a non-terminal, it is whatever has been declared for that symbol.
395 While building productions we will need to add to an array which needs to
399 static void array_add(void *varray, int *cnt, void *new)
401 void ***array = varray;
404 current = ((*cnt-1) | (step-1))+1;
405 if (*cnt == current) {
408 *array = realloc(*array, current * sizeof(void*));
410 (*array)[*cnt] = new;
414 Collecting the code fragment simply involves reading tokens until we
415 hit the end token `}$`, and noting the character position of the start and
419 static struct text collect_code(struct token_state *state,
424 code.txt = start.txt.txt + start.txt.len;
426 t = token_next(state);
427 while (t.node == start.node &&
428 t.num != TK_close && t.num != TK_error &&
430 if (t.num == TK_close && t.node == start.node)
431 code.len = t.txt.txt - code.txt;
437 Now we have all the bits we need to parse a full production.
442 ###### grammar fields
443 struct production **productions;
444 int production_count;
446 ###### production fields
448 struct symbol **body;
453 int first_production;
456 static char *parse_production(struct grammar *g,
458 struct token_state *state)
460 /* Head has already been parsed. */
463 struct production p, *pp;
465 memset(&p, 0, sizeof(p));
467 tk = token_next(state);
468 while (tk.num == TK_ident || tk.num == TK_mark) {
469 struct symbol *bs = sym_find(g, tk.txt);
470 if (bs->type == Unknown)
472 if (bs->type == Virtual) {
473 err = "Virtual symbol not permitted in production";
476 if (bs->precedence) {
477 p.precedence = bs->precedence;
480 array_add(&p.body, &p.body_size, bs);
481 tk = token_next(state);
483 if (tk.num == TK_virtual) {
485 tk = token_next(state);
486 if (tk.num != TK_ident) {
487 err = "word required after $$";
490 vs = sym_find(g, tk.txt);
491 if (vs->type != Virtual) {
492 err = "symbol after $$ must be virtual";
495 p.precedence = vs->precedence;
497 tk = token_next(state);
499 if (tk.num == TK_open) {
500 p.code = collect_code(state, tk);
501 if (p.code.txt == NULL) {
502 err = "code fragment not closed properly";
505 tk = token_next(state);
507 if (tk.num != TK_newline && tk.num != TK_eof) {
508 err = "stray tokens at end of line";
511 pp = malloc(sizeof(*pp));
513 array_add(&g->productions, &g->production_count, pp);
516 while (tk.num != TK_newline && tk.num != TK_eof)
517 tk = token_next(state);
521 With the ability to parse production and dollar-lines, we have nearly all
522 that we need to parse a grammar from a `code_node`.
524 The head of the first production will effectively be the `start` symbol of
525 the grammar. However it won't _actually_ be so. Processing the grammar is
526 greatly simplified if the real start symbol only has a single production,
527 and expects `$eof` as the final terminal. So when we find the first
528 explicit production we insert an extra production as production zero which
531 ###### Example: production 0
534 where `START` is the first non-terminal given.
536 ###### create production zero
537 struct production *p = calloc(1,sizeof(*p));	// production zero: "$start -> START $eof"
538 struct text start = {"$start",6};	// name of the synthetic start symbol (length given explicitly)
539 struct text eof = {"$eof",4};	// name of the synthetic end-of-input terminal
540 p->head = sym_find(g, start);	// sym_find creates the symbol if it is not yet known
541 p->head->type = Nonterminal;
542 array_add(&p->body, &p->body_size, head);	// 'head' is the first explicitly-given non-terminal
543 array_add(&p->body, &p->body_size, sym_find(g, eof));	// body ends with "$eof"
544 p->head->first_production = g->production_count;	// "$start" owns production number zero
545 array_add(&g->productions, &g->production_count, p);
547 Now we are ready to read in the grammar.
550 static struct grammar *grammar_read(struct code_node *code)
552 struct token_config conf = {
555 .words_marks = known,
556 .known_count = sizeof(known)/sizeof(known[0]),
558 .ignored = (1 << TK_line_comment)
559 | (1 << TK_block_comment)
562 | (1 << TK_multi_string)
567 struct token_state *state = token_open(code, &conf);
569 struct symbol *head = NULL;
573 g = calloc(1, sizeof(*g));
576 for (tk = token_next(state); tk.num != TK_eof;
577 tk = token_next(state)) {
578 if (tk.num == TK_newline)
580 if (tk.num == TK_ident) {
582 head = sym_find(g, tk.txt);
583 if (head->type == Nonterminal)
584 err = "This non-terminal has already be used.";
585 else if (head->type == Virtual)
586 err = "Virtual symbol not permitted in head of production";
588 head->type = Nonterminal;
589 head->struct_name = g->current_type;
590 if (g->production_count == 0) {
591 ## create production zero
593 head->first_production = g->production_count;
594 tk = token_next(state);
595 if (tk.num == TK_mark &&
596 text_is(tk.txt, "->"))
597 err = parse_production(g, head, state);
599 err = "'->' missing in production";
601 } else if (tk.num == TK_mark
602 && text_is(tk.txt, "|")) {
603 // another production for same non-term
605 err = parse_production(g, head, state);
607 err = "First production must have a head";
608 } else if (tk.num == TK_mark
609 && text_is(tk.txt, "$")) {
610 err = dollar_line(state, g);
612 err = "Unrecognised token at start of line.";
620 fprintf(stderr, "Error at line %d: %s\n",
627 ## Analysing the grammar
629 The central task in analysing the grammar is to determine a set of
630 states to drive the parsing process. This will require finding
631 various sets of symbols and of "items". Some of these sets will need
632 to attach information to each element in the set, so they are more
635 Each "item" may have a 'look-ahead' or `LA` set associated with
636 it. Many of these will be the same as each other. To avoid memory
637 wastage, and to simplify some comparisons of sets, these sets will be
638 stored in a list of unique sets, each assigned a number.
640 Once we have the data structures in hand to manage these sets and
641 lists, we can start setting the 'nullable' flag, build the 'FIRST'
642 sets, and then create the item sets which define the various states.
646 Though we don't only store symbols in these sets, they are the main
647 things we store, so they are called symbol sets or "symsets".
649 A symset has a size, an array of shorts, and an optional array of data
650 values, which are also shorts. If the array of data is not present,
651 we store an impossible pointer, as `NULL` is used to indicate that no
652 memory has been allocated yet.
657 unsigned short *syms, *data;
659 #define NO_DATA ((unsigned short *)1)
660 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
661 const struct symset INIT_DATASET = { 0, NULL, NULL };
663 The arrays of shorts are allocated in blocks of 8 and are kept sorted
664 by using an insertion sort. We don't explicitly record the amount of
665 allocated space as it can be derived directly from the current `cnt` using
666 `((cnt - 1) | 7) + 1`.
669 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
672 int current = ((s->cnt-1) | 7) + 1;
673 if (current == s->cnt) {
675 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
676 if (s->data != NO_DATA)
677 s->data = realloc(s->data, sizeof(*s->data) * current);
680 while (i > 0 && s->syms[i-1] > key) {
681 s->syms[i] = s->syms[i-1];
682 if (s->data != NO_DATA)
683 s->data[i] = s->data[i-1];
687 if (s->data != NO_DATA)
692 Finding a symbol (or item) in a `symset` uses a simple binary search.
693 We return the index where the value was found (so data can be accessed),
694 or `-1` to indicate failure.
696 static int symset_find(struct symset *ss, unsigned short key)
703 while (lo + 1 < hi) {
704 int mid = (lo + hi) / 2;
705 if (ss->syms[mid] <= key)
710 if (ss->syms[lo] == key)
715 We will often want to form the union of two symsets. When we do, we
716 will often want to know if anything changed (as they might mean there
717 is more work to do). So `symset_union` reports whether anything was
718 added to the first set. We use a slow quadratic approach as these
719 sets don't really get very big. If profiling shows this to be a problem it
720 can be optimised later.
722 static int symset_union(struct symset *a, struct symset *b)
726 for (i = 0; i < b->cnt; i++)
727 if (symset_find(a, b->syms[i]) < 0) {
728 unsigned short data = 0;
729 if (b->data != NO_DATA)
731 symset_add(a, b->syms[i], data);
737 And of course we must be able to free a symset.
739 static void symset_free(struct symset ss)
742 if (ss.data != NO_DATA)
748 Some symsets are simply stored somewhere appropriate in the `grammar`
749 data structure, others need a bit of indirection. This applies
750 particularly to "LA" sets. These will be paired with "items" in an
751 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
752 stored in the `data` field, so we need a mapping from a small (short)
753 number to an LA `symset`.
755 As mentioned earlier this involves creating a list of unique symsets.
757 For now, we just use a linear list sorted by insertion. A skiplist
758 would be more efficient and may be added later.
765 struct setlist *next;
768 ###### grammar fields
769 struct setlist *sets;
774 static int ss_cmp(struct symset a, struct symset b)
777 int diff = a.cnt - b.cnt;
780 for (i = 0; i < a.cnt; i++) {
781 diff = (int)a.syms[i] - (int)b.syms[i];
788 static int save_set(struct grammar *g, struct symset ss)
790 struct setlist **sl = &g->sets;
794 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
801 s = malloc(sizeof(*s));
810 Finding a set by number is currently performed by a simple linear search.
811 If this turns out to hurt performance, we can store the sets in a dynamic
812 array like the productions.
814 static struct symset set_find(struct grammar *g, int num)
816 struct setlist *sl = g->sets;
817 while (sl && sl->num != num)
823 ### Setting `nullable`
825 We set `nullable` on the head symbol for any production for which all
826 the body symbols (if any) are nullable. As this is a recursive
827 definition, any change in the `nullable` setting means that we need to
828 re-evaluate where it needs to be set.
830 We simply loop around performing the same calculations until no more
837 static void set_nullable(struct grammar *g)
840 while (check_again) {
843 for (p = 0; p < g->production_count; p++) {
844 struct production *pr = g->productions[p];
847 if (pr->head->nullable)
849 for (s = 0; s < pr->body_size; s++)
850 if (! pr->body[s]->nullable)
852 if (s == pr->body_size) {
853 pr->head->nullable = 1;
860 ### Setting `can_eol`
862 In order to be able to ignore newline tokens when not relevant, but
863 still include them in the parse when needed, we will need to know
864 which states can start a "line-like" section of code. We ignore
865 newlines when there is an indent since the most recent start of a
868 To know what is line-like, we first need to know which symbols can end
869 a line-like section, which is precisely those which can end with a
870 newline token. These symbols don't necessarily always end with a
871 newline, but they can. Hence they are not described as "lines" but
874 Clearly the `TK_newline` token can end with a newline. Any symbol
875 which is the head of a production that contains a line-ending symbol
876 followed only by nullable symbols is also a line-ending symbol. We
877 use a new field `can_eol` to record this attribute of symbols, and
878 compute it in a repetitive manner similar to `set_nullable`.
884 static void set_can_eol(struct grammar *g)
887 g->symtab[TK_newline]->can_eol = 1;
888 while (check_again) {
891 for (p = 0; p < g->production_count; p++) {
892 struct production *pr = g->productions[p];
895 if (pr->head->can_eol)
898 for (s = pr->body_size - 1; s >= 0; s--) {
899 if (pr->body[s]->can_eol) {
900 pr->head->can_eol = 1;
904 if (!pr->body[s]->nullable)
911 ### Building the `first` sets
913 When calculating what can follow a particular non-terminal, we will need to
914 know what the "first" terminal in any subsequent non-terminal might be. So
915 we calculate the `first` set for every non-terminal and store them in an
916 array. We don't bother recording the "first" set for terminals as they are
919 As the "first" for one symbol might depend on the "first" of another,
920 we repeat the calculations until no changes happen, just like with
921 `set_nullable`. This makes use of the fact that `symset_union`
922 reports if any change happens.
924 The core of this which finds the "first" of part of a production body
925 will be reused for computing the "follow" sets, so we split it out
926 into a separate function.
928 ###### grammar fields
929 struct symset *first;
933 static int add_first(struct production *pr, int start,
934 struct symset *target, struct grammar *g,
939 for (s = start; s < pr->body_size; s++) {
940 struct symbol *bs = pr->body[s];
941 if (bs->type == Terminal) {
942 if (symset_find(target, bs->num) < 0) {
943 symset_add(target, bs->num, 0);
947 } else if (symset_union(target, &g->first[bs->num]))
953 *to_end = (s == pr->body_size);
957 static void build_first(struct grammar *g)
961 g->first = calloc(g->num_syms, sizeof(g->first[0]));
962 for (s = 0; s < g->num_syms; s++)
963 g->first[s] = INIT_SYMSET;
965 while (check_again) {
968 for (p = 0; p < g->production_count; p++) {
969 struct production *pr = g->productions[p];
970 struct symset *head = &g->first[pr->head->num];
972 if (add_first(pr, 0, head, g, NULL))
978 ### Building the `follow` sets.
980 There are two different situations when we will want to generate "follow"
981 sets. If we are doing an SLR analysis, we want to generate a single
982 "follow" set for each non-terminal in the grammar. That is what is
983 happening here. If we are doing an LALR or LR analysis we will want
984 to generate a separate "LA" set for each item. We do that later
987 There are two parts to generating a "follow" set. Firstly we look at
988 every place that any non-terminal appears in the body of any
989 production, and we find the set of possible "first" symbols after
990 there. This is done using `add_first` above and only needs to be done
991 once as the "first" sets are now stable and will not change.
995 for (p = 0; p < g->production_count; p++) {
996 struct production *pr = g->productions[p];
999 for (b = 0; b < pr->body_size - 1; b++) {
1000 struct symbol *bs = pr->body[b];
1001 if (bs->type == Terminal)
1003 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1007 The second part is to add the "follow" set of the head of a production
1008 to the "follow" sets of the final symbol in the production, and any
1009 other symbol which is followed only by `nullable` symbols. As this
1010 depends on "follow" itself we need to repeatedly perform the process
1011 until no further changes happen.
1015 for (again = 0, p = 0;
1016 p < g->production_count;
1017 p < g->production_count-1
1018 ? p++ : again ? (again = 0, p = 0)
1020 struct production *pr = g->productions[p];
1023 for (b = pr->body_size - 1; b >= 0; b--) {
1024 struct symbol *bs = pr->body[b];
1025 if (bs->type == Terminal)
1027 if (symset_union(&g->follow[bs->num],
1028 &g->follow[pr->head->num]))
1035 We now just need to create and initialise the `follow` list to get a
1038 ###### grammar fields
1039 struct symset *follow;
1042 static void build_follow(struct grammar *g)
1045 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1046 for (s = 0; s < g->num_syms; s++)
1047 g->follow[s] = INIT_SYMSET;
1051 ### Building itemsets and states
1053 There are three different levels of detail that can be chosen for
1054 building the itemsets and states for the LR grammar. They are:
1056 1. LR(0) or SLR(1), where no look-ahead is considered.
1057 2. LALR(1) where we build look-ahead sets with each item and merge
1058 the LA sets when we find two paths to the same "kernel" set of items.
1059 3. LR(1) where different look-ahead for any item in the set means
1060 a different state must be created.
1062 ###### forward declarations
1063 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1065 We need to be able to look through existing states to see if a newly
1066 generated state already exists. For now we use a simple sorted linked
1069 An item is a pair of numbers: the production number and the position of
1070 "DOT", which is an index into the body of the production.
1071 As the numbers are not enormous we can combine them into a single "short"
1072 and store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1073 production number (so 2000 productions with a maximum of 31 symbols in the
1076 Comparing the itemsets will be a little different to comparing symsets
1077 as we want to do the lookup after generating the "kernel" of an
1078 itemset, so we need to ignore the offset=zero items which are added during
1081 To facilitate this, we modify the "DOT" number so that "0" sorts to
1082 the end of the list in the symset, and then only compare items before
1086 static inline unsigned short item_num(int production, int index)
1088 return production | ((31-index) << 11);
1090 static inline int item_prod(unsigned short item)
1092 return item & 0x7ff;
1094 static inline int item_index(unsigned short item)
1096 return (31-(item >> 11)) & 0x1f;
1099 For LR(1) analysis we need to compare not just the itemset in a state
1100 but also the LA sets. As we assign each unique LA set a number, we
1101 can just compare the symset and the data values together.
1104 static int itemset_cmp(struct symset a, struct symset b,
1105 enum grammar_type type)
1111 i < a.cnt && i < b.cnt &&
1112 item_index(a.syms[i]) > 0 &&
1113 item_index(b.syms[i]) > 0;
1115 int diff = a.syms[i] - b.syms[i];
1119 diff = a.data[i] - b.data[i];
1124 if (i == a.cnt || item_index(a.syms[i]) == 0)
1128 if (i == b.cnt || item_index(b.syms[i]) == 0)
1134 if (type < LR1 || av == -1)
1137 a.data[i] - b.data[i];
1140 And now we can build the list of itemsets. The lookup routine returns
1141 both a success flag and a pointer to where in the list an insert
1142 should happen, so we don't need to search a second time.
1146 struct itemset *next;
1148 struct symset items;
1149 struct symset go_to;
1154 ###### grammar fields
1155 struct itemset *items;
1159 static int itemset_find(struct grammar *g, struct itemset ***where,
1160 struct symset kernel, enum grammar_type type)
1162 struct itemset **ip;
1164 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1165 struct itemset *i = *ip;
1167 diff = itemset_cmp(i->items, kernel, type);
1180 Adding an itemset may require merging the LA sets if LALR analysis is
1181 happening. If any new LA set adds symbols that weren't in the old LA set, we
1182 clear the `completed` flag so that the dependants of this itemset will be
1183 recalculated and their LA sets updated.
1185 `add_itemset` must consume the symsets it is passed, either by adding
1186 them to a data structure, or freeing them.
1188 static int add_itemset(struct grammar *g, struct symset ss,
1189 enum grammar_type type, int starts_line)
1191 struct itemset **where, *is;
1193 int found = itemset_find(g, &where, ss, type);
1195 is = calloc(1, sizeof(*is));
1196 is->state = g->states;
1200 is->go_to = INIT_DATASET;
1201 is->starts_line = starts_line;
1210 for (i = 0; i < ss.cnt; i++) {
1211 struct symset temp = INIT_SYMSET, s;
1212 if (ss.data[i] == is->items.data[i])
1214 s = set_find(g, is->items.data[i]);
1215 symset_union(&temp, &s);
1216 s = set_find(g, ss.data[i]);
1217 if (symset_union(&temp, &s)) {
1218 is->items.data[i] = save_set(g, temp);
1229 To build all the itemsets, we first insert the initial itemset made from the
1230 start symbol, complete each itemset, and then generate new itemsets from old
1231 until no new ones can be made.
1233 Completing an itemset means finding all the items where "DOT" is followed by
1234 a nonterminal and adding "DOT=0" items for every production from that
1235 non-terminal - providing each item hasn't already been added.
1237 If LA sets are needed, the LA set for each new item is found using
1238 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1239 be supplemented by the LA set for the item which produces the new item.
1241 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1242 is used in the next stage.
1244 NOTE: precedence handling should happen here - I haven't written this yet
1247 ###### complete itemset
1248 for (i = 0; i < is->items.cnt; i++) {
1249 int p = item_prod(is->items.syms[i]);
1250 int bs = item_index(is->items.syms[i]);
1251 struct production *pr = g->productions[p];
1254 struct symset LA = INIT_SYMSET;
1255 unsigned short sn = 0;
1257 if (bs == pr->body_size)
1260 if (symset_find(&done, s->num) < 0)
1261 symset_add(&done, s->num, 0);
1262 if (s->type != Nonterminal)
1268 add_first(pr, bs+1, &LA, g, &to_end);
1270 struct symset ss = set_find(g, is->items.data[i]);
1271 symset_union(&LA, &ss);
1273 sn = save_set(g, LA);
1274 LA = set_find(g, sn);
1277 /* Add productions for this symbol */
1278 for (p2 = s->first_production;
1279 p2 < g->production_count &&
1280 g->productions[p2]->head == s;
1282 int itm = item_num(p2, 0);
1283 int pos = symset_find(&is->items, itm);
1285 symset_add(&is->items, itm, sn);
1286 /* Will have re-ordered, so start
1287 * from beginning again */
1289 } else if (type >= LALR) {
1290 struct symset ss = set_find(g, is->items.data[pos]);
1291 struct symset tmp = INIT_SYMSET;
1293 symset_union(&tmp, &ss);
1294 if (symset_union(&tmp, &LA)) {
1295 is->items.data[pos] = save_set(g, tmp);
1303 For each symbol we found (and placed in `done`) we collect all the items for
1304 which this symbol is next, and create a new itemset with "DOT" advanced over
1305 the symbol. This is then added to the collection of itemsets (or merged
1306 with a pre-existing itemset).
1308 ###### derive itemsets
1309 // Now we have a completed itemset, so we need to
1310 // compute all the 'next' itemsets and create them
1311 // if they don't exist.
1312 for (i = 0; i < done.cnt; i++) {
1314 unsigned short state;
1315 int starts_line = 0;
1316 struct symbol *sym = g->symtab[done.syms[i]];
1317 struct symset newitemset = INIT_SYMSET;
1319 newitemset = INIT_DATASET;
1322 (sym->nullable && is->starts_line))
1324 for (j = 0; j < is->items.cnt; j++) {
1325 int itm = is->items.syms[j];
1326 int p = item_prod(itm);
1327 int bp = item_index(itm);
1328 struct production *pr = g->productions[p];
1329 unsigned short la = 0;
1332 if (bp == pr->body_size)
1334 if (pr->body[bp] != sym)
1337 la = is->items.data[j];
1338 pos = symset_find(&newitemset, pr->head->num);
1340 symset_add(&newitemset, item_num(p, bp+1), la);
1341 else if (type >= LALR) {
1342 // Need to merge la set.
1343 int la2 = newitemset.data[pos];
1345 struct symset ss = set_find(g, la2);
1346 struct symset LA = INIT_SYMSET;
1347 symset_union(&LA, &ss);
1348 ss = set_find(g, la);
1349 if (symset_union(&LA, &ss))
1350 newitemset.data[pos] = save_set(g, LA);
1356 state = add_itemset(g, newitemset, type, starts_line);
1357 if (symset_find(&is->go_to, done.syms[i]) < 0)
1358 symset_add(&is->go_to, done.syms[i], state);
1361 All that is left is to create the initial itemset from production zero, and
1362 with `TK_eof` as the LA set.
1365 static void build_itemsets(struct grammar *g, enum grammar_type type)
1367 struct symset first = INIT_SYMSET;
1370 unsigned short la = 0;
1372 // LA set just has eof
1373 struct symset eof = INIT_SYMSET;
1374 symset_add(&eof, TK_eof, 0);
1375 la = save_set(g, eof);
1376 first = INIT_DATASET;
1378 // production 0, offset 0 (with no data)
1379 symset_add(&first, item_num(0, 0), la);
1380 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1381 for (again = 0, is = g->items;
1383 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1385 struct symset done = INIT_SYMSET;
1396 ### Completing the analysis.
1398 The exact process of analysis depends on which level was requested. For
1399 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1400 `LR1` we need first, but not follow. For `SLR` we need both.
1402 We don't build the "action" tables that you might expect as the parser
1403 doesn't use them. They will be calculated to some extent if needed for
1406 Once we have built everything we allocate arrays for the two lists:
1407 symbols and itemsets. This allows more efficient access during reporting.
1408 The symbols are grouped as terminals and non-terminals and we record the
1409 changeover point in `first_nonterm`.
1411 ###### grammar fields
1412 struct symbol **symtab;
1413 struct itemset **statetab;
1416 ###### grammar_analyse
1418 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1422 int snum = TK_reserved;
1423 for (s = g->syms; s; s = s->next)
1424 if (s->num < 0 && s->type == Terminal) {
1428 g->first_nonterm = snum;
1429 for (s = g->syms; s; s = s->next)
1435 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1436 for (s = g->syms; s; s = s->next)
1437 g->symtab[s->num] = s;
1447 build_itemsets(g, type);
1449 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1450 for (is = g->items; is ; is = is->next)
1451 g->statetab[is->state] = is;
1454 ## Reporting on the Grammar
1456 The purpose of the report is to give the grammar developer insight into
1457 how the grammar parser will work. It is basically a structured dump of
1458 all the tables that have been generated, plus a description of any conflicts.
1460 ###### grammar_report
1461 static int grammar_report(struct grammar *g, enum grammar_type type)
1467 return report_conflicts(g, type);
1470 Firstly we have the complete list of symbols, together with the "FIRST"
1471 set if that was generated.
1475 static void report_symbols(struct grammar *g)
1479 printf("SYMBOLS + FIRST:\n");
1481 printf("SYMBOLS:\n");
1483 for (n = 0; n < g->num_syms; n++) {
1484 struct symbol *s = g->symtab[n];
1488 printf(" %c%c%3d%c: ",
1489 s->nullable ? '.':' ',
1490 s->can_eol ? '>':' ',
1491 s->num, symtypes[s->type]);
1494 printf(" (%d%s)", s->precedence,
1495 assoc_names[s->assoc]);
1497 if (g->first && s->type == Nonterminal) {
1500 for (j = 0; j < g->first[n].cnt; j++) {
1503 prtxt(g->symtab[g->first[n].syms[j]]->name);
1510 Then we have the follow sets if they were computed.
1512 static void report_follow(struct grammar *g)
1515 printf("FOLLOW:\n");
1516 for (n = 0; n < g->num_syms; n++)
1517 if (g->follow[n].cnt) {
1521 prtxt(g->symtab[n]->name);
1522 for (j = 0; j < g->follow[n].cnt; j++) {
1525 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1531 And finally the item sets. These include the GO TO tables and, for
1532 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1533 it up a bit. First the items, with production number and associativity.
1535 static void report_item(struct grammar *g, int itm)
1537 int p = item_prod(itm);
1538 int dot = item_index(itm);
1539 struct production *pr = g->productions[p];
1543 prtxt(pr->head->name);
1545 for (i = 0; i < pr->body_size; i++) {
1546 printf(" %s", (dot == i ? ". ": ""));
1547 prtxt(pr->body[i]->name);
1549 if (dot == pr->body_size)
1553 printf(" (%d%s)", pr->precedence,
1554 assoc_names[pr->assoc]);
1558 The LA sets which are (possibly) reported with each item:
1560 static void report_la(struct grammar *g, int lanum)
1562 struct symset la = set_find(g, lanum);
1566 printf(" LOOK AHEAD(%d)", lanum);
1567 for (i = 0; i < la.cnt; i++) {
1570 prtxt(g->symtab[la.syms[i]]->name);
1575 Then the go to sets:
1578 static void report_goto(struct grammar *g, struct symset gt)
1583 for (i = 0; i < gt.cnt; i++) {
1585 prtxt(g->symtab[gt.syms[i]]->name);
1586 printf(" -> %d\n", gt.data[i]);
1590 Now we can report all the item sets complete with items, LA sets, and GO TO.
1592 static void report_itemsets(struct grammar *g)
1595 printf("ITEM SETS(%d)\n", g->states);
1596 for (s = 0; s < g->states; s++) {
1598 struct itemset *is = g->statetab[s];
1599 printf(" Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1600 for (j = 0; j < is->items.cnt; j++) {
1601 report_item(g, is->items.syms[j]);
1602 if (is->items.data != NO_DATA)
1603 report_la(g, is->items.data[j]);
1605 report_goto(g, is->go_to);
1609 ### Reporting conflicts
1611 Conflict detection varies a lot among different analysis levels. However
1612 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1613 LR1 are also similar as they have FOLLOW or LA sets.
1617 ## conflict functions
1619 static int report_conflicts(struct grammar *g, enum grammar_type type)
1622 printf("Conflicts:\n");
1624 cnt = conflicts_lr0(g, type);
1626 cnt = conflicts_slr(g, type);
1628 printf(" - no conflicts\n");
1632 LR0 conflicts are any state which has both a reducible item and
1635 LR05 conflicts only occur if two possible reductions exist,
1636 as shifts always over-ride reductions.
1638 ###### conflict functions
1639 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1643 for (i = 0; i < g->states; i++) {
1644 struct itemset *is = g->statetab[i];
1645 int last_reduce = -1;
1646 int prev_reduce = -1;
1647 int last_shift = -1;
1651 for (j = 0; j < is->items.cnt; j++) {
1652 int itm = is->items.syms[j];
1653 int p = item_prod(itm);
1654 int bp = item_index(itm);
1655 struct production *pr = g->productions[p];
1657 if (bp == pr->body_size) {
1658 prev_reduce = last_reduce;
1662 if (pr->body[bp]->type == Terminal)
1665 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1666 printf(" State %d has both SHIFT and REDUCE:\n", i);
1667 report_item(g, is->items.syms[last_shift]);
1668 report_item(g, is->items.syms[last_reduce]);
1671 if (prev_reduce >= 0) {
1672 printf(" State %d has 2 (or more) reducible items\n", i);
1673 report_item(g, is->items.syms[prev_reduce]);
1674 report_item(g, is->items.syms[last_reduce]);
1681 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1682 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1683 in the source of the look ahead set.
1685 We build a dataset mapping terminal to item for possible SHIFTs and then
1686 another for possible REDUCE operations. We report when we get conflicts
1689 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1694 for (i = 0; i < g->states; i++) {
1695 struct itemset *is = g->statetab[i];
1696 struct symset shifts = INIT_DATASET;
1697 struct symset reduce = INIT_DATASET;
1701 /* First collect the shifts */
1702 for (j = 0; j < is->items.cnt; j++) {
1703 unsigned short itm = is->items.syms[j];
1704 int p = item_prod(itm);
1705 int bp = item_index(itm);
1706 struct production *pr = g->productions[p];
1708 if (bp < pr->body_size &&
1709 pr->body[bp]->type == Terminal) {
1711 int sym = pr->body[bp]->num;
1712 if (symset_find(&shifts, sym) < 0)
1713 symset_add(&shifts, sym, itm);
1716 /* Now look for reduction and conflicts */
1717 for (j = 0; j < is->items.cnt; j++) {
1718 unsigned short itm = is->items.syms[j];
1719 int p = item_prod(itm);
1720 int bp = item_index(itm);
1721 struct production *pr = g->productions[p];
1723 if (bp < pr->body_size)
1728 la = g->follow[pr->head->num];
1730 la = set_find(g, is->items.data[j]);
1732 for (k = 0; k < la.cnt; k++) {
1733 int pos = symset_find(&shifts, la.syms[k]);
1735 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1736 prtxt(g->symtab[la.syms[k]]->name);
1738 report_item(g, shifts.data[pos]);
1739 report_item(g, itm);
1742 pos = symset_find(&reduce, la.syms[k]);
1744 symset_add(&reduce, la.syms[k], itm);
1747 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1748 prtxt(g->symtab[la.syms[k]]->name);
1750 report_item(g, itm);
1751 report_item(g, reduce.data[pos]);
1755 symset_free(shifts);
1756 symset_free(reduce);
1762 ## Generating the parser
1764 The exported part of the parser is the `parse_XX` function, where the name
1765 `XX` is based on the name of the parser files.
1767 This takes a `code_node`, a partially initialized `token_config`, and an
1768 optional `FILE` to send tracing to. The `token_config` gets the list of
1769 known words added and then is used with the `code_node` to initialize the
1772 `parse_XX` then calls the library function `parser_run` to actually complete
1773 the parse. This needs the `states` table and function to call the various
1774 pieces of code provided in the grammar file, so they are generated first.
1776 ###### parser_generate
1778 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1784 gen_reduce(f, g, file);
1787 fprintf(f, "#line 0 \"gen_parser\"\n");
1788 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1791 fprintf(f, "\tstruct token_state *tokens;\n");
1792 fprintf(f, "\tconfig->words_marks = known;\n");
1793 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1794 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1795 fprintf(f, "\ttokens = token_open(code, config);\n");
1796 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1797 fprintf(f, "\ttoken_close(tokens);\n");
1798 fprintf(f, "\treturn rv;\n");
1799 fprintf(f, "}\n\n");
1802 ### Table words table
1804 The known words table is simply an array of terminal symbols.
1805 The table of nonterminals used for tracing is a similar array.
// Emit the `known` table into the generated parser: one quoted name per
// terminal symbol, in symbol-number order starting at TK_reserved.
// grammar_analyse numbers terminals before non-terminals, so the loop can
// stop at the first non-terminal it sees.
// NOTE(review): this extract is missing interior lines (e.g. the
// declaration of `i` and the loop increment); comments cover only what is
// visible here.
1809 static void gen_known(FILE *f, struct grammar *g)
1812 fprintf(f, "#line 0 \"gen_known\"\n");
1813 fprintf(f, "static const char *known[] = {\n");
1814 for (i = TK_reserved;
1815 i < g->num_syms && g->symtab[i]->type == Terminal;
1817 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1818 g->symtab[i]->name.txt);
1819 fprintf(f, "};\n\n");
// Emit the `non_term` table: the name of every non-terminal symbol, used
// by the generated parser for tracing. Unlike gen_known, the loop scans
// all symbols from TK_reserved up and filters on type, since
// non-terminals follow the terminals in symtab.
// NOTE(review): interior lines are missing from this extract (the loop
// bound and increment); comments cover only what is visible.
1822 static void gen_non_term(FILE *f, struct grammar *g)
1825 fprintf(f, "#line 0 \"gen_non_term\"\n");
1826 fprintf(f, "static const char *non_term[] = {\n");
1827 for (i = TK_reserved;
1830 if (g->symtab[i]->type == Nonterminal)
1831 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1832 g->symtab[i]->name.txt);
1833 fprintf(f, "};\n\n");
1836 ### States and the goto tables.
1838 For each state we record the goto table, the reducible production if
1839 there is one, or a symbol to shift for error recovery.
1840 Some of the details of the reducible production are stored in the
1841 `do_reduce` function to come later. Here we store the production number,
1842 the body size (useful for stack management) and the resulting symbol (useful
1843 for knowing how to free data later).
1845 The go to table is stored in a simple array of `sym` and corresponding
1848 ###### exported types
1856 const struct lookup * go_to;
1867 static void gen_goto(FILE *f, struct grammar *g)
1870 fprintf(f, "#line 0 \"gen_goto\"\n");
1871 for (i = 0; i < g->states; i++) {
1873 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1875 struct symset gt = g->statetab[i]->go_to;
1876 for (j = 0; j < gt.cnt; j++)
1877 fprintf(f, "\t{ %d, %d },\n",
1878 gt.syms[j], gt.data[j]);
1885 static void gen_states(FILE *f, struct grammar *g)
1888 fprintf(f, "#line 0 \"gen_states\"\n");
1889 fprintf(f, "static const struct state states[] = {\n");
1890 for (i = 0; i < g->states; i++) {
1891 struct itemset *is = g->statetab[i];
1892 int j, prod = -1, prod_len;
1894 int shift_len = 0, shift_remain = 0;
1895 for (j = 0; j < is->items.cnt; j++) {
1896 int itm = is->items.syms[j];
1897 int p = item_prod(itm);
1898 int bp = item_index(itm);
1899 struct production *pr = g->productions[p];
1901 if (bp < pr->body_size) {
1902 if (shift_sym < 0 ||
1903 (shift_len == bp && shift_remain > pr->body_size - bp)) {
1904 shift_sym = pr->body[bp]->num;
1906 shift_remain = pr->body_size - bp;
1910 /* This is what we reduce */
1911 if (prod < 0 || prod_len < pr->body_size) {
1913 prod_len = pr->body_size;
1918 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1919 i, is->go_to.cnt, i, prod,
1920 g->productions[prod]->body_size,
1921 g->productions[prod]->head->num,
1924 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1925 i, is->go_to.cnt, i, shift_sym,
1928 fprintf(f, "};\n\n");
1931 ### The `do_reduce` function and the code
1933 When the parser engine decides to reduce a production, it calls `do_reduce`.
1934 This has two functions.
1936 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1937 production being reduced. Secondly it runs the code that was included with
1938 the production if any.
1940 This code needs to be able to store data somewhere. Rather than requiring
1941 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1942 `do_reduce` return the size to be saved.
1944 The code fragment requires translation when written out. Any `$N` needs to
1945 be converted to a reference either to that buffer (if `$0`) or to the
1946 structure returned by a previous reduction. These pointers need to be cast
1947 to the appropriate type for each access. All this is handled in
1953 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1956 fprintf(f, "\t\t\t");
1957 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1966 if (*c < '0' || *c > '9') {
1971 while (c[1] >= '0' && c[1] <= '9') {
1973 n = n * 10 + *c - '0';
1976 fprintf(f, "(*(struct %.*s*)ret)",
1977 p->head->struct_name.len,
1978 p->head->struct_name.txt);
1979 else if (n > p->body_size)
1980 fprintf(f, "$%d", n);
1981 else if (p->body[n-1]->type == Terminal)
1982 fprintf(f, "(*(struct token *)body[%d])",
1984 else if (p->body[n-1]->struct_name.txt == NULL)
1985 fprintf(f, "$%d", n);
1987 fprintf(f, "(*(struct %.*s*)body[%d])",
1988 p->body[n-1]->struct_name.len,
1989 p->body[n-1]->struct_name.txt, n-1);
1996 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1999 fprintf(f, "#line 0 \"gen_reduce\"\n");
2000 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2002 fprintf(f, "\tint ret_size = 0;\n");
2004 fprintf(f, "\tswitch(prod) {\n");
2005 for (i = 0; i < g->production_count; i++) {
2006 struct production *p = g->productions[i];
2007 fprintf(f, "\tcase %d:\n", i);
2012 if (p->head->struct_name.txt)
2013 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
2014 p->head->struct_name.len,
2015 p->head->struct_name.txt);
2017 fprintf(f, "\t\tbreak;\n");
2019 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2024 As each non-terminal can potentially cause a different type of data
2025 structure to be allocated and filled in, we need to be able to free it when
2028 It is particularly important to have fine control over freeing during error
2029 recovery where individual stack frames might need to be freed.
2031 For this, the grammar author is required to define a `free_XX` function for
2032 each structure that is used by a non-terminal. `do_free` will call whichever
2033 is appropriate given a symbol number, and will call `free` (as is
2034 appropriate for tokens) on any terminal symbol.
2038 static void gen_free(FILE *f, struct grammar *g)
2042 fprintf(f, "#line 0 \"gen_free\"\n");
2043 fprintf(f, "static void do_free(short sym, void *asn)\n");
2045 fprintf(f, "\tif (!asn) return;\n");
2046 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2047 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2048 fprintf(f, "\tswitch(sym) {\n");
2050 for (i = 0; i < g->num_syms; i++) {
2051 struct symbol *s = g->symtab[i];
2053 s->type != Nonterminal ||
2054 s->struct_name.txt == NULL)
2057 fprintf(f, "\tcase %d:\n", s->num);
2058 fprintf(f, "\t\tfree_%.*s(asn);\n",
2060 s->struct_name.txt);
2061 fprintf(f, "\t\tbreak;\n");
2063 fprintf(f, "\t}\n}\n\n");
2066 ## The main routine.
2068 There are three key parts to the "main" function of parsergen: processing
2069 the arguments, loading the grammar file, and dealing with the grammar.
2071 ### Argument processing.
2073 Command line options allow the selection of analysis mode, name of output
2074 file, and whether or not a report should be generated. By default we create
2075 a report only if no code output was requested.
2077 The `parse_XX` function name uses the basename of the output file which
2078 should not have a suffix (`.c`). `.c` is added to the given name for the
2079 code, and `.h` is added for the header (if header text is specified in the
// Option table for getopt_long(). Each long option maps onto the single
// character also listed in the short-option string `options` below; the
// analysis modes (--LR0/--LR05/--SLR/--LALR/--LR1) take no argument,
// while --tag and --output require one (the trailing ':' in `options`).
// NOTE(review): a line is missing from this extract between the table
// and the `options` string (presumably the closing "};").
2086 static const struct option long_options[] = {
2087 { "LR0", 0, NULL, '0' },
2088 { "LR05", 0, NULL, '5' },
2089 { "SLR", 0, NULL, 'S' },
2090 { "LALR", 0, NULL, 'L' },
2091 { "LR1", 0, NULL, '1' },
2092 { "tag", 1, NULL, 't' },
2093 { "report", 0, NULL, 'R' },
2094 { "output", 1, NULL, 'o' },
2095 { NULL, 0, NULL, 0 }
2097 const char *options = "05SL1t:Ro:";
2099 ###### process arguments
2101 char *outfile = NULL;
2106 enum grammar_type type = LR05;
2107 while ((opt = getopt_long(argc, argv, options,
2108 long_options, NULL)) != -1) {
2123 outfile = optarg; break;
2125 tag = optarg; break;
2127 fprintf(stderr, "Usage: parsergen ...\n");
2132 infile = argv[optind++];
2134 fprintf(stderr, "No input file given\n");
2137 if (outfile && report == 1)
2140 if (name && strchr(name, '/'))
2141 name = strrchr(name, '/')+1;
2143 if (optind < argc) {
2144 fprintf(stderr, "Excess command line arguments\n");
2148 ### Loading the grammar file
2150 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2153 Once we have extracted the code (with `mdcode`) we expect to find three
2154 sections: header, code, and grammar. Anything else is an error.
2156 "header" and "code" are optional, though it is hard to build a working
2157 parser with neither. "grammar" must be provided.
2161 #include <sys/mman.h>
// Error callback handed to code_extract(): report an extraction problem
// on stderr, one message per line.
2166 static void pr_err(char *msg)
2169 fprintf(stderr, "%s\n", msg);
2173 struct section *table;
2177 fd = open(infile, O_RDONLY);
2179 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2180 infile, strerror(errno));
2183 len = lseek(fd, 0, 2);
2184 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2185 table = code_extract(file, file+len, pr_err);
2187 struct code_node *hdr = NULL;
2188 struct code_node *code = NULL;
2189 struct code_node *gram = NULL;
2190 for (s = table; s; s = s->next) {
2191 struct text sec = s->section;
2192 if (tag && !strip_tag(&sec, tag))
2194 if (text_is(sec, "header"))
2196 else if (text_is(sec, "code"))
2198 else if (text_is(sec, "grammar"))
2201 fprintf(stderr, "Unknown content section: %.*s\n",
2202 s->section.len, s->section.txt);
2207 ### Processing the input
2209 As we need to append an extension to a filename and then open it for
2210 writing, and we need to do this twice, it helps to have a separate function.
// Build the filename "base" + "ext" (the +1 in the malloc is for the
// terminating NUL) and open it for writing.
// NOTE(review): the fopen call and the free/return of `fn` are on lines
// missing from this extract — confirm against the full source.
2214 static FILE *open_ext(char *base, char *ext)
2216 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2218 strcat(strcpy(fn, base), ext);
2224 If we can read the grammar, then we analyse and optionally report on it. If
2225 the report finds conflicts we will exit with an error status.
2227 ###### process input
2228 struct grammar *g = NULL;
2230 fprintf(stderr, "No grammar section provided\n");
2233 g = grammar_read(gram);
2235 fprintf(stderr, "Failure to parse grammar\n");
2240 grammar_analyse(g, type);
2242 if (grammar_report(g, type))
2246 If a headers section is defined, we write it out and include a declaration
2247 for the `parse_XX` function so it can be used from separate code.
2249 if (rv == 0 && hdr && outfile) {
2250 FILE *f = open_ext(outfile, ".h");
2252 code_node_print(f, hdr, infile);
2253 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2257 fprintf(stderr, "Cannot create %s.h\n",
2263 And if all goes well and an output file was provided, we create the `.c`
2264 file with the code section (if any) and the parser tables and function.
2266 if (rv == 0 && outfile) {
2267 FILE *f = open_ext(outfile, ".c");
2270 code_node_print(f, code, infile);
2271 gen_parser(f, g, infile, name);
2274 fprintf(stderr, "Cannot create %s.c\n",
2280 And that about wraps it up. We need to set the locale so that UTF-8 is
2281 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2283 ###### File: parsergen.mk
2284 parsergen : parsergen.o libscanner.o libmdcode.o
2285 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2292 int main(int argc, char *argv[])
2297 setlocale(LC_ALL,"");
2299 ## process arguments
2306 ## The SHIFT/REDUCE parser
2308 Having analysed the grammar and generated all the tables, we only need the
2309 shift/reduce engine to bring it all together.
2311 ### Goto table lookup
2313 The parser generator has nicely provided us with goto tables sorted by
2314 symbol number. We need a binary search function to find a symbol in the
2317 ###### parser functions
// Binary search the state's goto table (sorted by symbol number) for
// `sym`. Returns the target state on success; callers (e.g. the error
// recovery loop in parser_run) test for a negative result on failure.
// NOTE(review): interior lines are missing from this extract (the
// declaration/initialisation of `lo`, the hi/lo updates inside the loop,
// and the failure return); comments cover only what is visible.
2319 static int search(const struct state *l, int sym)
2322 int hi = l->go_to_cnt;
2326 while (lo + 1 < hi) {
2327 int mid = (lo + hi) / 2;
2328 if (l->go_to[mid].sym <= sym)
2333 if (l->go_to[lo].sym == sym)
2334 return l->go_to[lo].state;
2339 ### The state stack.
2341 The core data structure for the parser is the stack. This tracks all the
2342 symbols that have been recognised or partially recognised.
2344 The stack usually won't grow very large - maybe a few tens of entries. So
2345 we dynamically resize the array as required but never bother to shrink it
2348 We keep the stack as two separate allocations. One, `asn_stack` stores the
2349 "abstract syntax nodes" which are created by each reduction. When we call
2350 `do_reduce` we need to pass an array of the `asn`s of the body of the
2351 production, and by keeping a separate `asn` stack, we can just pass a
2352 pointer into this stack.
2354 The other allocation stores all other stack fields of which there are four.
2355 The `state` is the most important one and guides the parsing process. The
2356 `sym` is nearly unnecessary. However when we want to free entries from the
2357 `asn_stack`, it helps to know what type they are so we can call the right
2358 freeing function. The symbol leads us to the right free function through
2361 The `indents` count and the `starts_indented` flag track the line
2362 indents in the symbol. These are used to allow indent information to
2363 guide parsing and error recovery.
2365 As well as the stack of frames we have a `next` frame which is
2366 assembled from the incoming token and other information prior to
2367 pushing it onto the stack.
2369 ###### parser functions
2375 short starts_indented;
2377 short newline_permitted;
2386 Two operations are needed on the stack - shift (which is like push) and pop.
2388 Shift applies not only to terminals but also to non-terminals. When we
2389 reduce a production we will pop off entries corresponding to the body
2390 symbols, then push on an item for the head of the production. This last is
2391 exactly the same process as shifting in a terminal so we use the same
2394 To simplify other code we arrange for `shift` to fail if there is no `goto`
2395 state for the symbol. This is useful in basic parsing due to our design
2396 that we shift when we can, and reduce when we cannot. So the `shift`
2397 function reports if it could.
2399 So `shift` finds the next state. If that succeeds it extends the allocations
2400 if needed and pushes all the information onto the stacks.
2402 ###### parser functions
2404 static int shift(struct parser *p,
2406 const struct state states[])
2408 // Push an entry onto the stack
2409 int newstate = search(&states[p->next.state], p->next.sym);
2412 if (p->tos >= p->stack_size) {
2413 p->stack_size += 10;
2414 p->stack = realloc(p->stack, p->stack_size
2415 * sizeof(p->stack[0]));
2416 p->asn_stack = realloc(p->asn_stack, p->stack_size
2417 * sizeof(p->asn_stack[0]));
2419 p->stack[p->tos] = p->next;
2420 p->asn_stack[p->tos] = asn;
2422 p->next.state = newstate;
2423 p->next.indents = 0;
2424 p->next.starts_indented = 0;
2425 // if new state doesn't start a line, we inherit newline_permitted status
2426 if (states[newstate].starts_line)
2427 p->next.newline_permitted = 1;
2431 `pop` simply moves the top of stack (`tos`) back down the required amount
2432 and frees any `asn` entries that need to be freed. It is called _after_ we
2433 reduce a production, just before we `shift` the nonterminal in.
2435 ###### parser functions
// Pop `num` frames off the parser stack, freeing each frame's asn via
// the caller-supplied do_free() and accumulating the popped indents into
// the `next` frame. Afterwards `next` re-inherits state, indent origin
// and newline permission from the new top-of-stack frame; if indents
// remain beyond those the frame started with, newlines are disallowed.
// NOTE(review): the adjustment of p->tos (and the declaration of `i`)
// are on lines missing from this extract — the loop below relies on
// p->tos already pointing below the popped frames.
2437 static void pop(struct parser *p, int num,
2438 void(*do_free)(short sym, void *asn))
2442 for (i = 0; i < num; i++) {
2443 p->next.indents += p->stack[p->tos+i].indents;
2444 do_free(p->stack[p->tos+i].sym,
2445 p->asn_stack[p->tos+i]);
2449 p->next.state = p->stack[p->tos].state;
2450 p->next.starts_indented = p->stack[p->tos].starts_indented;
2451 p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2452 if (p->next.indents > p->next.starts_indented)
2453 p->next.newline_permitted = 0;
2457 ### Memory allocation
2459 The `scanner` returns tokens in a local variable - we want them in allocated
2460 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2461 a reduce is in a large buffer. Both of these require some allocation and
2462 copying, hence `memdup` and `tokcopy`.
2464 ###### parser includes
2467 ###### parser functions
2469 void *memdup(void *m, int len)
2475 memcpy(ret, m, len);
2479 static struct token *tok_copy(struct token tk)
2481 struct token *new = malloc(sizeof(*new));
2486 ### The heart of the parser.
2488 Now we have the parser. If we can shift, we do. If not and we can reduce
2489 we do. If the production we reduced was production zero, then we have
2490 accepted the input and can finish.
2492 We return whatever `asn` was returned by reducing production zero.
2494 If we can neither shift nor reduce we have an error to handle. We pop
2495 single entries off the stack until we can shift the `TK_error` symbol, then
2496 drop input tokens until we find one we can shift into the new error state.
2498 When we find `TK_in` and `TK_out` tokens which report indents we need
2499 to handle them directly as the grammar cannot express what we want to
2502 `TK_in` tokens are easy: we simply update the `next` stack frame to
2503 record how many indents there are and that the next token started with
2506 `TK_out` tokens must either be counted off against any pending indent,
2507 or must force reductions until there is a pending indent which isn't
2508 at the start of a production.
2510 ###### parser includes
2513 void *parser_run(struct token_state *tokens,
2514 const struct state states[],
2515 int (*do_reduce)(int, void**, void*),
2516 void (*do_free)(short, void*),
2517 FILE *trace, const char *non_term[], int knowns)
2519 struct parser p = { 0 };
2520 struct token *tk = NULL;
2524 p.next.newline_permitted = states[0].starts_line;
2526 struct token *err_tk;
2528 tk = tok_copy(token_next(tokens));
2529 p.next.sym = tk->num;
2531 parser_trace(trace, &p, tk, states, non_term, knowns);
2533 if (p.next.sym == TK_in) {
2534 p.next.starts_indented = 1;
2540 if (p.next.sym == TK_out) {
2541 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2542 (p.stack[p.tos-1].indents == 1 &&
2543 states[p.next.state].reduce_size > 1)) {
2544 p.stack[p.tos-1].indents -= 1;
2545 if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2546 // no internal indent any more, reassess 'newline_permitted'
2547 if (states[p.stack[p.tos-1].state].starts_line)
2548 p.stack[p.tos-1].newline_permitted = 1;
2550 p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2556 // fall through and force a REDUCE (as 'shift'
2559 if (p.next.sym == TK_newline) {
2560 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2566 if (shift(&p, tk, states)) {
2570 if (states[p.next.state].reduce_prod >= 0) {
2572 int prod = states[p.next.state].reduce_prod;
2573 int size = states[p.next.state].reduce_size;
2575 static char buf[16*1024];
2576 p.next.sym = states[p.next.state].reduce_sym;
2578 body = p.asn_stack +
2579 (p.tos - states[p.next.state].reduce_size);
2581 bufsize = do_reduce(prod, body, buf);
2583 pop(&p, size, do_free);
2584 shift(&p, memdup(buf, bufsize), states);
2589 if (tk->num == TK_out) {
2590 // Indent problem - synthesise tokens to get us
2592 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2593 p.next.sym = states[p.next.state].shift_sym;
2594 shift(&p, tok_copy(*tk), states);
2595 // FIXME need to report this error somehow
2598 /* Error. We walk up the stack until we
2599 * find a state which will accept TK_error.
2600 * We then shift in TK_error and see what state
2601 * that takes us too.
2602 * Then we discard input tokens until
2603 * we find one that is acceptable.
2606 err_tk = tok_copy(*tk);
2607 p.next.sym = TK_error;
2608 while (shift(&p, err_tk, states) == 0
2610 // discard this state
2611 pop(&p, 1, do_free);
2614 // no state accepted TK_error
2617 while (search(&states[p.next.state], tk->num) < 0 &&
2618 tk->num != TK_eof) {
2620 tk = tok_copy(token_next(tokens));
2621 if (tk->num == TK_in)
2622 p.next.indents += 1;
2623 if (tk->num == TK_out) {
2624 if (p.next.indents == 0)
2626 p.next.indents -= 1;
2629 if (p.tos == 0 && tk->num == TK_eof)
2634 ret = p.asn_stack[0];
2636 pop(&p, p.tos, do_free);
2642 ###### exported functions
// Public entry point of the parser library: drive the shift/reduce
// engine over `tokens` using the generated `states` table, calling
// do_reduce()/do_free() for grammar actions and cleanup. `trace`, if
// non-NULL, receives a stack dump each iteration; `non_term` and
// `knowns` let the tracer name symbols. Returns the asn produced by
// reducing production zero (the accepted parse), or NULL-equivalent on
// failure (see parser_run's definition).
2643 void *parser_run(struct token_state *tokens,
2644 const struct state states[],
2645 int (*do_reduce)(int, void**, void*),
2646 void (*do_free)(short, void*),
2647 FILE *trace, const char *non_term[], int knowns);
2651 Being able to visualize the parser in action can be invaluable when
2652 debugging the parser code, or trying to understand how the parse of a
2653 particular grammar progresses. The stack contains all the important
2654 state, so just printing out the stack every time around the parse loop
2655 can make it possible to see exactly what is happening.
2657 This doesn't explicitly show each SHIFT and REDUCE action. However they
2658 are easily deduced from the change between consecutive lines, and the
2659 details of each state can be found by cross referencing the states list
2660 in the stack with the "report" that parsergen can generate.
2662 For terminal symbols, we just dump the token. For non-terminals we
2663 print the name of the symbol. The look ahead token is reported at the
2664 end inside square brackets.
2666 ###### parser functions
2668 static char *reserved_words[] = {
2669 [TK_error] = "ERROR",
2672 [TK_newline] = "NEWLINE",
2675 static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2677 fprintf(trace, "(%d", f->state);
2679 fprintf(trace, "%c%d", f->starts_indented?':':'.',
2681 if (states[f->state].starts_line)
2682 fprintf(trace, "s");
2683 if (f->newline_permitted)
2684 fprintf(trace, "n");
2685 fprintf(trace, ") ");
2688 void parser_trace(FILE *trace, struct parser *p,
2689 struct token *tk, const struct state states[],
2690 const char *non_term[], int knowns)
2693 for (i = 0; i < p->tos; i++) {
2694 int sym = p->stack[i].sym;
2695 parser_trace_state(trace, &p->stack[i], states);
2696 if (sym < TK_reserved &&
2697 reserved_words[sym] != NULL)
2698 fputs(reserved_words[sym], trace);
2699 else if (sym < TK_reserved + knowns) {
2700 struct token *t = p->asn_stack[i];
2701 text_dump(trace, t->txt, 20);
2703 fputs(non_term[sym - TK_reserved - knowns],
2707 parser_trace_state(trace, &p->next, states);
2708 fprintf(trace, " [");
2709 if (tk->num < TK_reserved &&
2710 reserved_words[tk->num] != NULL)
2711 fputs(reserved_words[tk->num], trace);
2713 text_dump(trace, tk->txt, 20);
2714 fputs("]\n", trace);
2719 The obvious example for a parser is a calculator.
2721 As `scanner` provides a number parsing function using `libgmp`, it is not
2722 much work to perform arbitrary rational number calculations.
2724 This calculator takes one expression, or an equality test, per line. The
2725 results are printed and, if any equality test fails, the program exits with
2728 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2729 better approach, but as the grammar file must have 3 components I need
2730 something like this.
2732 ###### File: parsergen.mk
2733 calc.c calc.h : parsergen parsergen.mdc
2734 ./parsergen --tag calc -o calc parsergen.mdc
2735 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2736 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2741 // what do we use for a demo-grammar? A calculator of course.
2753 #include <sys/mman.h>
2758 #include "scanner.h"
2764 static void free_number(struct number *n)
2770 int main(int argc, char *argv[])
2772 int fd = open(argv[1], O_RDONLY);
2773 int len = lseek(fd, 0, 2);
2774 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2775 struct section *s = code_extract(file, file+len, NULL);
2776 struct token_config config = {
2777 .ignored = (1 << TK_line_comment)
2778 | (1 << TK_block_comment)
2781 .number_chars = ".,_+-",
2785 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2791 Session -> Session Line
2794 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2795 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2796 gmp_printf(" or as a decimal: %Fg\n", fl);
2800 | Expression = Expression NEWLINE ${
2801 if (mpq_equal($1.val, $3.val))
2802 gmp_printf("Both equal %Qd\n", $1.val);
2804 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2809 | NEWLINE ${ printf("Blank line\n"); }$
2810 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2813 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2814 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2815 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2817 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2818 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2819 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2821 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2822 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$