ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8
   9 There are several distinct sections.
  10
  11 1. `grammar_read` will read a grammar from a literate-code file and
  12    store the productions, symbols, and code fragments.
  13 2. `grammar_analyse` will take that grammar and build LR parsing
  14    states and look-ahead tables.
  15 3. `grammar_report` will present the details of the analysis
  16    in a readable format and will report any conflicts.
  17 4. `parser_generate` will write out C code files with various
  18    tables and with the code fragments arranged in useful places.
  19 5. `parser_run` is a library function intended to be linked together
  20    with the generated parser tables to complete the implementation of
  21    a parser.
  22 6. Finally `calc` is a test grammar for a simple calculator.  The
  23    `parsergen` program built from the C code in this file can extract
  24    that grammar directly from this file and process it.
  25
  26 ###### File: parsergen.c
  27         #include <unistd.h>
  28         #include <stdlib.h>
  29         #include <stdio.h>
  30         ## includes
  31         ## forward declarations
  32         ## declarations
  33         ## functions
  34         ## grammar_read
  35         ## grammar_analyse
  36         ## grammar_report
  37         ## parser_generate
  38         ## main
  39 ###### File: parser.h
  40         ## exported types
  41         ## exported functions
  42 ###### File: libparser.c
  43         #include <unistd.h>
  44         #include <stdlib.h>
  45         #include <stdio.h>
  46         ## parser includes
  47         ## parser functions
  48         ## parser_run
  49 ###### File: parsergen.mk
  50         CFLAGS += -Wall -g
  51         all :: parsergen calc
  52         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  53                 ./md2c parsergen.mdc
  54
  55 ## Reading the grammar
  56
  57 The grammar must be stored in a markdown literate code file as parsed by
  58 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  59 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  60 present, will be literally copied into the generated `.c` and `.h`
  61 files.  The third is required and contains the grammar which is
  62 tokenised with "[scanner][]".  `reduce` can provide variable
  63 declarations and common intialization code for all the reduction code
  64 fragments in the grammar.
  65
  66 If the `--tag` option is given, then any top level header that doesn't
  67 start with the tag is ignored, and the tag is striped from the rest.  So
  68 `--tag Foo` means that the four sections expected are `Foo: header`,
  69 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  70 to extract the sample calculator from this file.
  71
  72 [mdcode]: mdcode.html
  73 [scanner]: scanner.html
  74
  75 ###### includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 ###### parser includes
  80         #include "mdcode.h"
  81         #include "scanner.h"
  82
  83 The grammar contains production sets, precedence/associativity
  84 declarations, general terminal declarations, and data type declarations.
  85 These are all parsed with _ad hoc_ parsing as we don't have a parser
  86 generator yet.
  87
  88 The precedence and associativity can be set for each production, and
  89 can be inherited from symbols.  The data type (either structure or a
  90 reference to a structure) is potentially defined for each non-terminal
  91 and describes what C structure is used to store information about each
  92 symbol.
  93
  94 ###### declarations
  95         enum assoc {Left, Right, Non};
  96         char *assoc_names[] = {"Left","Right","Non"};
  97
  98         struct symbol {
  99                 struct text struct_name;
 100                 int isref;
 101                 enum assoc assoc;
 102                 unsigned short precedence;
 103                 ## symbol fields
 104         };
 105         struct production {
 106                 unsigned short precedence;
 107                 enum assoc assoc;
 108                 ## production fields
 109         };
 110         struct grammar {
 111                 ## grammar fields
 112         };
 113
 114 The strings reported by `mdcode` and `scanner` are `struct text` which
 115 have length rather than being nul terminated.  To help with printing and
 116 comparing we define `text_is` and `prtxt`, which should possibly go in
 117 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 118 strings which might contain control characters.
 119
 120 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 121 because that is what we need to detect tags.
 122
 123 ###### functions
 124         static int text_is(struct text t, char *s)
 125         {
 126                 return (strlen(s) == t.len &&
 127                         strncmp(s, t.txt, t.len) == 0);
 128         }
 129         static void prtxt(struct text t)
 130         {
 131                 printf("%.*s", t.len, t.txt);
 132         }
 133
 134         static int strip_tag(struct text *t, char *tag)
 135         {
 136                 int skip = strlen(tag) + 1;
 137                 if (skip >= t->len ||
 138                     strncmp(t->txt, tag, skip-1) != 0 ||
 139                     t->txt[skip-1] != ':')
 140                         return 0;
 141                 while (skip < t->len && t->txt[skip] == ' ')
 142                         skip++;
 143                 t->len -= skip;
 144                 t->txt += skip;
 145                 return 1;
 146         }
 147
 148 ### Symbols
 149
 150 Productions are comprised primarily of symbols - terminal and
 151 non-terminal.  We do not make a syntactic distinction between the two,
 152 though non-terminals must be identifiers.  Non-terminal symbols are
 153 those which appear in the head of a production, terminal symbols are
 154 those which don't.  There are also "virtual" symbols used for precedence
 155 marking discussed later, and sometimes we won't know what type a symbol
 156 is yet.
 157
 158 To help with code safety it is possible to declare the terminal symbols.
 159 If this is done, then any symbol used in a production that does not
 160 appear in the head of any production and is not declared as a terminal
 161 is treated as an error.
 162
 163 ###### forward declarations
 164         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 165         char *symtypes = "UVTN";
 166 ###### symbol fields
 167         enum symtype type;
 168 ###### grammar fields
 169         int terminals_declared;
 170
 171 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 172 table of known symbols and the resulting parser will report them as
 173 `TK_reserved + N`.  A small set of identifiers are reserved for the
 174 different token types that `scanner` can report, and an even smaller set
 175 are reserved for a special token that the parser can generate (`EOL`) as
 176 will be described later.  This latter set cannot use predefined numbers,
 177 so they are marked as `isspecial` for now and will get assigned a number
 178 with the non-terminals later.
 179
 180 ###### declarations
 181
 182         static struct { int num; char *name; } reserved_words[] = {
 183                 { TK_error,        "ERROR" },
 184                 { TK_number,       "NUMBER" },
 185                 { TK_ident,        "IDENTIFIER" },
 186                 { TK_mark,         "MARK" },
 187                 { TK_string,       "STRING" },
 188                 { TK_multi_string, "MULTI_STRING" },
 189                 { TK_in,           "IN" },
 190                 { TK_out,          "OUT" },
 191                 { TK_newline,      "NEWLINE" },
 192                 { TK_eof,          "$eof" },
 193                 { -1,              "EOL" },
 194         };
 195
 196 ###### symbol fields
 197         short num;
 198         unsigned int isspecial:1;
 199
 200 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 201 recognised.  The former is automatically expected at the end of the text
 202 being parsed. The latter are always ignored by the parser.
 203
 204 All of these symbols are stored in a simple symbol table.  We use the
 205 `struct text` from `mdcode` to store the name and link them together
 206 into a sorted list using an insertion sort.
 207
 208 We don't have separate `find` and `insert` functions as any symbol we
 209 find needs to be remembered.  We simply expect `find` to always return a
 210 symbol, but its type might be `Unknown`.
 211
 212 ###### includes
 213         #include <string.h>
 214
 215 ###### symbol fields
 216         struct text name;
 217         struct symbol *next;
 218
 219 ###### grammar fields
 220         struct symbol *syms;
 221         int num_syms;
 222
 223 ###### functions
 224         static struct symbol *sym_find(struct grammar *g, struct text s)
 225         {
 226                 struct symbol **l = &g->syms;
 227                 struct symbol *n;
 228                 int cmp = 1;
 229
 230                 while (*l &&
 231                         (cmp = text_cmp((*l)->name, s)) < 0)
 232                                 l = & (*l)->next;
 233                 if (cmp == 0)
 234                         return *l;
 235                 n = calloc(1, sizeof(*n));
 236                 n->name = s;
 237                 n->type = Unknown;
 238                 n->next = *l;
 239                 n->num = -1;
 240                 *l = n;
 241                 return n;
 242         }
 243
 244         static void symbols_init(struct grammar *g)
 245         {
 246                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 247                 int i;
 248                 for (i = 0; i < entries; i++) {
 249                         struct text t;
 250                         struct symbol *s;
 251                         t.txt = reserved_words[i].name;
 252                         t.len = strlen(t.txt);
 253                         s = sym_find(g, t);
 254                         s->type = Terminal;
 255                         s->num = reserved_words[i].num;
 256                         s->isspecial = 1;
 257                 }
 258         }
 259
 260 ### Data types and precedence.
 261
 262 Data type specification, precedence specification, and declaration of
 263 terminals are all introduced by a dollar sign at the start of the line.
 264 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 265 precedence, if it is `TERM` then the line declares terminals without
 266 precedence, otherwise it specifies a data type.
 267
 268 The data type name is simply stored and applied to the head of all
 269 subsequent productions.  It must be the name of a structure optionally
 270 preceded by an asterisk which means a reference or "pointer".  So
 271 `$expression` maps to `struct expression` and `$*statement` maps to
 272 `struct statement *`.
 273
 274 Any productions given before the first data type declaration will have
 275 no data type associated with them and can carry no information.  In
 276 order to allow other non-terminals to have no type, the data type
 277 `$void` can be given.  This does *not* mean that `struct void` will be
 278 used, but rather than no type will be associated with subsequent
 279 non-terminals.
 280
 281 The precedence line must contain a list of symbols, either terminal
 282 symbols or virtual symbols.  It can only contain symbols that have not
 283 been seen yet, so precedence declaration must precede the productions
 284 that they affect.
 285
 286 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 287 carry precedence information but are not included in the grammar.  A
 288 production can declare that it inherits the precedence of a given
 289 virtual symbol.
 290
 291 This use for `$$` precludes it from being used as a symbol in the
 292 described language.  Two other symbols: `${` and `}$` are also
 293 unavailable.
 294
 295 Each new precedence line introduces a new precedence level and
 296 declares how it associates.  This level is stored in each symbol
 297 listed and may be inherited by any production which uses the symbol.  A
 298 production inherits from the last symbol which has a precedence.
 299
 300 The symbols on the first precedence line have the lowest precedence.
 301 Subsequent lines introduce symbols with higher precedence and so bind
 302 more tightly.
 303
 304 Note that a declaration line (or "dollar line") can start with either of
 305 two different marks: "$" or "$*".  The `dollar_line()` function defined
 306 here is told which was found in the `isref` argument.
 307
 308 ###### grammar fields
 309         struct text current_type;
 310         int type_isref;
 311         int prec_levels;
 312
 313 ###### declarations
 314         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 315         static const char *known[] = { "$$", "${", "}$" };
 316
 317 ###### functions
 318         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 319         {
 320                 struct token t = token_next(ts);
 321                 char *err;
 322                 enum assoc assoc;
 323                 int term = 0;
 324                 int found;
 325
 326                 if (t.num != TK_ident) {
 327                         err = "type or assoc expected after '$'";
 328                         goto abort;
 329                 }
 330                 if (text_is(t.txt, "LEFT"))
 331                         assoc = Left;
 332                 else if (text_is(t.txt, "RIGHT"))
 333                         assoc = Right;
 334                 else if (text_is(t.txt, "NON"))
 335                         assoc = Non;
 336                 else if (text_is(t.txt, "TERM")) {
 337                         term = 1;
 338                         g->terminals_declared = 1;
 339                 } else {
 340                         g->current_type = t.txt;
 341                         g->type_isref = isref;
 342                         if (text_is(t.txt, "void"))
 343                                 g->current_type.txt = NULL;
 344                         t = token_next(ts);
 345                         if (t.num != TK_newline) {
 346                                 err = "Extra tokens after type name";
 347                                 goto abort;
 348                         }
 349                         return NULL;
 350                 }
 351
 352                 if (isref) {
 353                         err = "$* cannot be followed by a precedence";
 354                         goto abort;
 355                 }
 356
 357                 // This is a precedence or TERM line, need some symbols.
 358                 found = 0;
 359                 g->prec_levels += 1;
 360                 t = token_next(ts);
 361                 while (t.num != TK_newline) {
 362                         enum symtype type = Terminal;
 363                         struct symbol *s;
 364                         if (t.num == TK_virtual) {
 365                                 type = Virtual;
 366                                 t = token_next(ts);
 367                                 if (t.num != TK_ident) {
 368                                         err = "$$ must be followed by a word";
 369                                         goto abort;
 370                                 }
 371                                 if (term) {
 372                                         err = "Virtual symbols not permitted on $TERM line";
 373                                         goto abort;
 374                                 }
 375                         } else if (t.num != TK_ident &&
 376                                    t.num != TK_mark) {
 377                                 err = "Illegal token in precedence line";
 378                                 goto abort;
 379                         }
 380                         s = sym_find(g, t.txt);
 381                         if (s->type != Unknown) {
 382                                 err = "Symbols in precedence/TERM line must not already be known.";
 383                                 goto abort;
 384                         }
 385                         s->type = type;
 386                         if (!term) {
 387                                 s->precedence = g->prec_levels;
 388                                 s->assoc = assoc;
 389                         }
 390                         found += 1;
 391                         t = token_next(ts);
 392                 }
 393                 if (found == 0)
 394                         err = "No symbols given on precedence/TERM line";
 395                         goto abort;
 396                 return NULL;
 397         abort:
 398                 while (t.num != TK_newline && t.num != TK_eof)
 399                         t = token_next(ts);
 400                 return err;
 401         }
 402
 403 ### Productions
 404
 405 A production either starts with an identifier which is the head
 406 non-terminal, or a vertical bar (`|`) in which case this production
 407 uses the same head as the previous one.  The identifier must be
 408 followed by a `->` mark.  All productions for a given non-terminal must
 409 be grouped together with the `nonterminal ->` given only once.
 410
 411 After this a (possibly empty) sequence of identifiers and marks form
 412 the body of the production.  A virtual symbol may be given after the
 413 body by preceding it with `$$`.  If a virtual symbol is given, the
 414 precedence of the production is that for the virtual symbol.  If none
 415 is given, the precedence is inherited from the last symbol in the
 416 production which has a precedence specified.
 417
 418 After the optional precedence may come the `${` mark.  This indicates
 419 the start of a code fragment.  If present, this must be on the same
 420 line as the start of the production.
 421
 422 All of the text from the `${` through to the matching `}$` is
 423 collected and forms the code-fragment for the production.  It must all
 424 be in one `code_node` of the literate code.  The `}$` must be
 425 at the end of a line.
 426
 427 Text in the code fragment will undergo substitutions where `$N` or
 428 `$<N`,for some numeric `N` (or non-numeric indicator as described
 429 later), will be replaced with a variable holding the parse information
 430 for the particular symbol in the production.  `$0` is the head of the
 431 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 432 for a terminal symbol is `struct token`.  For a non-terminal, it is
 433 whatever has been declared for that symbol.  The `<` may be included and
 434 means that the value (usually a reference) is being moved out, so it
 435 will not automatically be freed.  The effect of using '<' is that the
 436 variable is cleared to all-zeros.
 437
 438 While building productions we will need to add to an array which needs to
 439 grow dynamically.
 440
 441 ###### functions
 442         static void array_add(void *varray, int *cnt, void *new)
 443         {
 444                 void ***array = varray;
 445                 int current = 0;
 446                 const int step = 8;
 447                 current = ((*cnt-1) | (step-1))+1;
 448                 if (*cnt == current) {
 449                         /* must grow */
 450                         current += step;
 451                         *array = realloc(*array, current * sizeof(void*));
 452                 }
 453                 (*array)[*cnt] = new;
 454                 (*cnt) += 1;
 455         }
 456
 457 Collecting the code fragment simply involves reading tokens until we
 458 hit the end token `}$`, and noting the character position of the start and
 459 the end.
 460
 461 ###### functions
 462         static struct text collect_code(struct token_state *state,
 463                                         struct token start)
 464         {
 465                 struct text code;
 466                 struct token t;
 467                 code.txt = start.txt.txt + start.txt.len;
 468                 do
 469                         t = token_next(state);
 470                 while (t.node == start.node &&
 471                        t.num != TK_close && t.num != TK_error &&
 472                        t.num != TK_eof);
 473                 if (t.num == TK_close && t.node == start.node)
 474                         code.len = t.txt.txt - code.txt;
 475                 else
 476                         code.txt = NULL;
 477                 return code;
 478         }
 479
 480 Now we have all the bits we need to parse a full production.
 481
 482 ###### includes
 483         #include <memory.h>
 484
 485 ###### grammar fields
 486         struct production **productions;
 487         int production_count;
 488
 489 ###### production fields
 490         struct symbol  *head;
 491         struct symbol **body;
 492         int             body_size;
 493         struct text     code;
 494         int             code_line;
 495
 496 ###### symbol fields
 497         int first_production;
 498
 499 ###### functions
 500         static char *parse_production(struct grammar *g,
 501                                       struct symbol *head,
 502                                       struct token_state *state)
 503         {
 504                 /* Head has already been parsed. */
 505                 struct token tk;
 506                 char *err;
 507                 struct production p, *pp;
 508
 509                 memset(&p, 0, sizeof(p));
 510                 p.head = head;
 511                 tk = token_next(state);
 512                 while (tk.num == TK_ident || tk.num == TK_mark) {
 513                         struct symbol *bs = sym_find(g, tk.txt);
 514                         if (bs->type == Unknown) {
 515                                 if (!g->terminals_declared)
 516                                         bs->type = Terminal;
 517                         }
 518                         if (bs->type == Virtual) {
 519                                 err = "Virtual symbol not permitted in production";
 520                                 goto abort;
 521                         }
 522                         if (bs->precedence) {
 523                                 p.precedence = bs->precedence;
 524                                 p.assoc = bs->assoc;
 525                         }
 526                         array_add(&p.body, &p.body_size, bs);
 527                         tk = token_next(state);
 528                 }
 529                 if (tk.num == TK_virtual) {
 530                         struct symbol *vs;
 531                         tk = token_next(state);
 532                         if (tk.num != TK_ident) {
 533                                 err = "word required after $$";
 534                                 goto abort;
 535                         }
 536                         vs = sym_find(g, tk.txt);
 537                         if (vs->precedence == 0) {
 538                                 err = "symbol after $$ must have precedence";
 539                                 goto abort;
 540                         } else {
 541                                 p.precedence = vs->precedence;
 542                                 p.assoc = vs->assoc;
 543                         }
 544                         tk = token_next(state);
 545                 }
 546                 if (tk.num == TK_open) {
 547                         p.code_line = tk.line;
 548                         p.code = collect_code(state, tk);
 549                         if (p.code.txt == NULL) {
 550                                 err = "code fragment not closed properly";
 551                                 goto abort;
 552                         }
 553                         tk = token_next(state);
 554                 }
 555
 556                 if (tk.num != TK_newline && tk.num != TK_eof) {
 557                         err = "stray tokens at end of line";
 558                         goto abort;
 559                 }
 560                 pp = malloc(sizeof(*pp));
 561                 *pp = p;
 562                 array_add(&g->productions, &g->production_count, pp);
 563                 return NULL;
 564         abort:
 565                 while (tk.num != TK_newline && tk.num != TK_eof)
 566                         tk = token_next(state);
 567                 free(p.body);
 568                 return err;
 569         }
 570
 571 With the ability to parse productions and dollar-lines, we have nearly all
 572 that we need to parse a grammar from a `code_node`.
 573
 574 The head of the first production will effectively be the `start` symbol
 575 of the grammar.  However it won't _actually_ be so.  Processing the
 576 grammar is greatly simplified if the real start symbol only has a single
 577 production, and expects `$eof` as the final terminal.  So when we find
 578 the first explicit production we insert an extra implicit production as
 579 production zero which looks like
 580
 581 ###### Example: production 0
 582         $start -> START $eof
 583
 584 where `START` is the first non-terminal given.
 585
 586 ###### create production zero
 587         struct production *p = calloc(1,sizeof(*p));
 588         struct text start = {"$start",6};
 589         struct text eof = {"$eof",4};
 590         struct text code = {"$0 = $<1;", 9};
 591         p->head = sym_find(g, start);
 592         p->head->type = Nonterminal;
 593         p->head->struct_name = g->current_type;
 594         p->head->isref = g->type_isref;
 595         if (g->current_type.txt)
 596                 p->code = code;
 597         array_add(&p->body, &p->body_size, head);
 598         array_add(&p->body, &p->body_size, sym_find(g, eof));
 599         p->head->first_production = g->production_count;
 600         array_add(&g->productions, &g->production_count, p);
 601
 602 Now we are ready to read in the grammar.  We ignore comments
 603 and strings so that the marks which introduce them can be
 604 used as terminals in the grammar.  We don't ignore numbers
 605 even though we don't allow them as that causes the scanner
 606 to produce errors that the parser is better positioned to handle.
 607 We also ignore indentation, but we expect and use newlines.
 608
 609 To allow comments in the grammar, we explicitly check for "//" as the
 610 first token on the line and those lines are skipped.  "//" can still be
 611 used as a terminal anywhere that a terminal is expected.
 612
 613 ###### grammar_read
 614         static struct grammar *grammar_read(struct code_node *code)
 615         {
 616                 struct token_config conf = {
 617                         .word_start = "",
 618                         .word_cont = "",
 619                         .words_marks = known,
 620                         .known_count = sizeof(known)/sizeof(known[0]),
 621                         .number_chars = "",
 622                         .ignored = (1 << TK_line_comment)
 623                                  | (1 << TK_block_comment)
 624                                  | (0 << TK_number)
 625                                  | (1 << TK_string)
 626                                  | (1 << TK_multi_string)
 627                                  | (1 << TK_in)
 628                                  | (1 << TK_out),
 629                 };
 630
 631                 struct token_state *state = token_open(code, &conf);
 632                 struct token tk;
 633                 struct symbol *head = NULL;
 634                 struct grammar *g;
 635                 char *err = NULL;
 636
 637                 g = calloc(1, sizeof(*g));
 638                 symbols_init(g);
 639
 640                 for (tk = token_next(state); tk.num != TK_eof;
 641                      tk = token_next(state)) {
 642                         if (tk.num == TK_newline)
 643                                 continue;
 644                         if (tk.num == TK_ident) {
 645                                 // new non-terminal
 646                                 head = sym_find(g, tk.txt);
 647                                 if (head->type == Nonterminal)
 648                                         err = "This non-terminal has already be used.";
 649                                 else if (head->type == Virtual)
 650                                         err = "Virtual symbol not permitted in head of production";
 651                                 else {
 652                                         head->type = Nonterminal;
 653                                         head->struct_name = g->current_type;
 654                                         head->isref = g->type_isref;
 655                                         if (g->production_count == 0) {
 656                                                 ## create production zero
 657                                         }
 658                                         head->first_production = g->production_count;
 659                                         tk = token_next(state);
 660                                         if (tk.num == TK_mark &&
 661                                             text_is(tk.txt, "->"))
 662                                                 err = parse_production(g, head, state);
 663                                         else
 664                                                 err = "'->' missing in production";
 665                                 }
 666                         } else if (tk.num == TK_mark
 667                                    && text_is(tk.txt, "|")) {
 668                                 // another production for same non-term
 669                                 if (head)
 670                                         err = parse_production(g, head, state);
 671                                 else
 672                                         err = "First production must have a head";
 673                         } else if (tk.num == TK_mark
 674                                    && text_is(tk.txt, "$")) {
 675                                 err = dollar_line(state, g, 0);
 676                         } else if (tk.num == TK_mark
 677                                    && text_is(tk.txt, "$*")) {
 678                                 err = dollar_line(state, g, 1);
 679                         } else if (tk.num == TK_mark
 680                                    && text_is(tk.txt, "//")) {
 681                                 while (tk.num != TK_newline &&
 682                                        tk.num != TK_eof)
 683                                         tk = token_next(state);
 684                         } else {
 685                                 err = "Unrecognised token at start of line.";
 686                         }
 687                         if (err)
 688                                 goto abort;
 689                 }
 690                 token_close(state);
 691                 if (g->terminals_declared) {
 692                         struct symbol *s;
 693                         int errs = 0;
 694                         for (s = g->syms; s; s = s->next) {
 695                                 if (s->type != Unknown)
 696                                         continue;
 697                                 errs += 1;
 698                                 fprintf(stderr, "Token %.*s not declared\n",
 699                                         s->name.len, s->name.txt);
 700                         }
 701                         if (errs) {
 702                                 free(g); // FIXME free content
 703                                 g = NULL;
 704                         }
 705                 }
 706                 return g;
 707         abort:
 708                 fprintf(stderr, "Error at line %d: %s\n",
 709                         tk.line, err);
 710                 token_close(state);
 711                 free(g); // FIXME free content
 712                 return NULL;
 713         }
 714
 715 ## Analysing the grammar
 716
 717 The central task in analysing the grammar is to determine a set of
 718 states to drive the parsing process.  This will require finding various
 719 sets of symbols and of "items".  Some of these sets will need to attach
 720 information to each element in the set, so they are more like maps.
 721
 722 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 723 Many of these will be the same as each other.  To avoid memory wastage,
 724 and to simplify some comparisons of sets, these sets will be stored in a
 725 list of unique sets, each assigned a number.
 726
 727 Once we have the data structures in hand to manage these sets and lists,
 728 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
 729 then create the item sets which define the various states.
 730
 731 ### Symbol sets.
 732
 733 Though we don't only store symbols in these sets, they are the main
 734 things we store, so they are called symbol sets or "symsets".
 735
 736 A symset has a size, an array of shorts, and an optional array of data
 737 values, which are also shorts.  If the array of data is not present, we
 738 store an impossible pointer, as `NULL` is used to indicate that no
 739 memory has been allocated yet;
 740
 741 ###### declarations
 742         struct symset {
 743                 short cnt;
 744                 unsigned short *syms, *data;
 745         };
 746         #define NO_DATA ((unsigned short *)1)
 747         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 748         const struct symset INIT_DATASET = { 0, NULL, NULL };
 749
 750 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 751 using an insertion sort.  We don't explicitly record the amount of
 752 allocated space as it can be derived directly from the current `cnt`
 753 using `((cnt - 1) | 7) + 1`.
 754
 755 ###### functions
 756         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 757         {
 758                 int i;
 759                 int current = ((s->cnt-1) | 7) + 1;
 760                 if (current == s->cnt) {
 761                         current += 8;
 762                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 763                         if (s->data != NO_DATA)
 764                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 765                 }
 766                 i = s->cnt;
 767                 while (i > 0 && s->syms[i-1] > key) {
 768                         s->syms[i] = s->syms[i-1];
 769                         if (s->data != NO_DATA)
 770                                 s->data[i] = s->data[i-1];
 771                         i--;
 772                 }
 773                 s->syms[i] = key;
 774                 if (s->data != NO_DATA)
 775                         s->data[i] = val;
 776                 s->cnt += 1;
 777         }
 778
 779 Finding a symbol (or item) in a `symset` uses a simple binary search.
 780 We return the index where the value was found (so data can be accessed),
 781 or `-1` to indicate failure.
 782
 783         static int symset_find(struct symset *ss, unsigned short key)
 784         {
 785                 int lo = 0;
 786                 int hi = ss->cnt;
 787
 788                 if (hi == 0)
 789                         return -1;
 790                 while (lo + 1 < hi) {
 791                         int mid = (lo + hi) / 2;
 792                         if (ss->syms[mid] <= key)
 793                                 lo = mid;
 794                         else
 795                                 hi = mid;
 796                 }
 797                 if (ss->syms[lo] == key)
 798                         return lo;
 799                 return -1;
 800         }
 801
 802 We will often want to form the union of two symsets.  When we do, we
 803 will often want to know if anything changed (as that might mean there is
 804 more work to do).  So `symset_union` reports whether anything was added
 805 to the first set.  We use a slow quadratic approach as these sets are
 806 rarely large.  If profiling shows this to be a problem it can be
 807 optimised later.
 808
 809         static int symset_union(struct symset *a, struct symset *b)
 810         {
 811                 int i;
 812                 int added = 0;
 813                 for (i = 0; i < b->cnt; i++)
 814                         if (symset_find(a, b->syms[i]) < 0) {
 815                                 unsigned short data = 0;
 816                                 if (b->data != NO_DATA)
 817                                         data = b->data[i];
 818                                 symset_add(a, b->syms[i], data);
 819                                 added++;
 820                         }
 821                 return added;
 822         }
 823
 824 And of course we must be able to free a symset.
 825
 826         static void symset_free(struct symset ss)
 827         {
 828                 free(ss.syms);
 829                 if (ss.data != NO_DATA)
 830                         free(ss.data);
 831         }
 832
 833 ### Symset Storage
 834
 835 Some symsets are simply stored somewhere appropriate in the `grammar`
 836 data structure, others needs a bit of indirection.  This applies
 837 particularly to "LA" sets.  These will be paired with "items" in an
 838 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 839 to be stored in the `data` field, so we need a mapping from a small
 840 (short) number to an LA `symset`.
 841
 842 As mentioned earlier this involves creating a list of unique symsets.
 843
 844 For now, we just use a linear list sorted by insertion.  A skiplist
 845 would be more efficient and may be added later.
 846
 847 ###### declarations
 848
 849         struct setlist {
 850                 struct symset ss;
 851                 int num;
 852                 struct setlist *next;
 853         };
 854
 855 ###### grammar fields
 856         struct setlist *sets;
 857         int nextset;
 858
 859 ###### functions
 860
 861         static int ss_cmp(struct symset a, struct symset b)
 862         {
 863                 int i;
 864                 int diff = a.cnt - b.cnt;
 865                 if (diff)
 866                         return diff;
 867                 for (i = 0; i < a.cnt; i++) {
 868                         diff = (int)a.syms[i] - (int)b.syms[i];
 869                         if (diff)
 870                                 return diff;
 871                 }
 872                 return 0;
 873         }
 874
 875         static int save_set(struct grammar *g, struct symset ss)
 876         {
 877                 struct setlist **sl = &g->sets;
 878                 int cmp = 1;
 879                 struct setlist *s;
 880
 881                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 882                         sl = & (*sl)->next;
 883                 if (cmp == 0) {
 884                         symset_free(ss);
 885                         return (*sl)->num;
 886                 }
 887
 888                 s = malloc(sizeof(*s));
 889                 s->ss = ss;
 890                 s->num = g->nextset;
 891                 g->nextset += 1;
 892                 s->next = *sl;
 893                 *sl = s;
 894                 return s->num;
 895         }
 896
 897 Finding a set by number is currently performed by a simple linear
 898 search.  If this turns out to hurt performance, we can store the sets in
 899 a dynamic array like the productions.
 900
 901         static struct symset set_find(struct grammar *g, int num)
 902         {
 903                 struct setlist *sl = g->sets;
 904                 while (sl && sl->num != num)
 905                         sl = sl->next;
 906                 return sl->ss;
 907         }
 908
 909 ### Setting `nullable`
 910
 911 We set `nullable` on the head symbol for any production for which all
 912 the body symbols (if any) are nullable.  As this is a recursive
 913 definition, any change in the `nullable` setting means that we need to
 914 re-evaluate where it needs to be set.
 915
 916 We simply loop around performing the same calculations until no more
 917 changes happen.
 918
 919 ###### symbol fields
 920         int nullable;
 921
 922 ###### functions
 923         static void set_nullable(struct grammar *g)
 924         {
 925                 int check_again = 1;
 926                 while (check_again) {
 927                         int p;
 928                         check_again = 0;
 929                         for (p = 0; p < g->production_count; p++) {
 930                                 struct production *pr = g->productions[p];
 931                                 int s;
 932
 933                                 if (pr->head->nullable)
 934                                         continue;
 935                                 for (s = 0; s < pr->body_size; s++)
 936                                         if (! pr->body[s]->nullable)
 937                                                 break;
 938                                 if (s == pr->body_size) {
 939                                         pr->head->nullable = 1;
 940                                         check_again = 1;
 941                                 }
 942                         }
 943                 }
 944         }
 945
 946 ### Building the `first` sets
 947
 948 When calculating what can follow a particular non-terminal, we will need
 949 to know what the "first" terminal in any subsequent non-terminal might
 950 be.  So we calculate the `first` set for every non-terminal and store
 951 them in an array.  We don't bother recording the "first" set for
 952 terminals as they are trivial.
 953
 954 As the "first" for one symbol might depend on the "first" of another, we
 955 repeat the calculations until no changes happen, just like with
 956 `set_nullable`.  This makes use of the fact that `symset_union` reports
 957 if any change happens.
 958
 959 The core of this, which finds the "first" of part of a production body,
 960 will be reused for computing the "follow" sets, so we split that out
 961 into a separate function, `add_first`, which also reports if it got all
 962 the way to the end of the production without finding a non-nullable
 963 symbol.
 964
 965 ###### grammar fields
 966         struct symset *first;
 967
 968 ###### functions
 969
 970         static int add_first(struct production *pr, int start,
 971                              struct symset *target, struct grammar *g,
 972                              int *to_end)
 973         {
 974                 int s;
 975                 int changed = 0;
 976                 for (s = start; s < pr->body_size; s++) {
 977                         struct symbol *bs = pr->body[s];
 978                         if (bs->type == Terminal) {
 979                                 if (symset_find(target, bs->num) < 0) {
 980                                         symset_add(target, bs->num, 0);
 981                                         changed = 1;
 982                                 }
 983                                 break;
 984                         } else if (symset_union(target, &g->first[bs->num]))
 985                                 changed = 1;
 986                         if (!bs->nullable)
 987                                 break;
 988                 }
 989                 if (to_end)
 990                         *to_end = (s == pr->body_size);
 991                 return changed;
 992         }
 993
 994         static void build_first(struct grammar *g)
 995         {
 996                 int check_again = 1;
 997                 int s;
 998                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 999                 for (s = 0; s < g->num_syms; s++)
1000                         g->first[s] = INIT_SYMSET;
1001
1002                 while (check_again) {
1003                         int p;
1004                         check_again = 0;
1005                         for (p = 0; p < g->production_count; p++) {
1006                                 struct production *pr = g->productions[p];
1007                                 struct symset *head = &g->first[pr->head->num];
1008
1009                                 if (add_first(pr, 0, head, g, NULL))
1010                                         check_again = 1;
1011                         }
1012                 }
1013         }
1014
1015 ### Building the `follow` sets.
1016
1017 There are two different situations when we will want to generate
1018 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1019 single "follow" set for each non-terminal in the grammar.  That is what
1020 is happening here.  If we are doing an LALR or LR analysis we will want
1021 to generate a separate "LA" set for each item.  We do that later in
1022 state generation.
1023
1024 There are two parts to generating a "follow" set.  Firstly we look at
1025 every place that any non-terminal appears in the body of any production,
1026 and we find the set of possible "first" symbols after there.  This is
1027 done using `add_first` above and only needs to be done once as the
1028 "first" sets are now stable and will not change.
1029
1030 ###### grammar fields
1031         struct symset *follow;
1032
1033 ###### follow code
1034
1035         for (p = 0; p < g->production_count; p++) {
1036                 struct production *pr = g->productions[p];
1037                 int b;
1038
1039                 for (b = 0; b < pr->body_size - 1; b++) {
1040                         struct symbol *bs = pr->body[b];
1041                         if (bs->type == Terminal)
1042                                 continue;
1043                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1044                 }
1045         }
1046
1047 The second part is to add the "follow" set of the head of a production
1048 to the "follow" sets of the final symbol in the production, and any
1049 other symbol which is followed only by `nullable` symbols.  As this
1050 depends on "follow" itself we need to repeatedly perform the process
1051 until no further changes happen.
1052
1053 Rather than 2 nested loops, one that repeats the whole process until
1054 there is no change, and one that iterates of the set of productions, we
1055 combine these two functions into a single loop.
1056
1057 ###### follow code
1058
1059         for (check_again = 0, p = 0;
1060              p < g->production_count;
1061              p < g->production_count-1
1062                 ? p++ : check_again ? (check_again = 0, p = 0)
1063                               : p++) {
1064                 struct production *pr = g->productions[p];
1065                 int b;
1066
1067                 for (b = pr->body_size - 1; b >= 0; b--) {
1068                         struct symbol *bs = pr->body[b];
1069                         if (bs->type == Terminal)
1070                                 break;
1071                         if (symset_union(&g->follow[bs->num],
1072                                          &g->follow[pr->head->num]))
1073                                 check_again = 1;
1074                         if (!bs->nullable)
1075                                 break;
1076                 }
1077         }
1078
1079 We now just need to create and initialise the `follow` list to get a
1080 complete function.
1081
1082 ###### functions
1083         static void build_follow(struct grammar *g)
1084         {
1085                 int s, check_again, p;
1086                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1087                 for (s = 0; s < g->num_syms; s++)
1088                         g->follow[s] = INIT_SYMSET;
1089                 ## follow code
1090         }
1091
1092 ### Building itemsets and states
1093
1094 There are three different levels of detail that can be chosen for
1095 building the itemsets and states for the LR grammar.  They are:
1096
1097 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1098    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1099    already computed,
1100 2. LALR(1) where we build look-ahead sets with each item and merge
1101    the LA sets when we find two paths to the same "kernel" set of items.
1102 3. LR(1) where different look-ahead for any item in the set means
1103    a different item set must be created.
1104
1105 ###### forward declarations
1106         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1107
1108 We need to be able to look through existing states to see if a newly
1109 generated state already exists.  For now we use a simple sorted linked
1110 list.
1111
1112 An item is a pair of numbers: the production number and the position of
1113 "DOT", which is an index into the body of the production.  As the
1114 numbers are not enormous we can combine them into a single "short" and
1115 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1116 production number (so 2000 productions with maximum of 31 symbols in the
1117 body).
1118
1119 Comparing the itemsets will be a little different to comparing symsets
1120 as we want to do the lookup after generating the "kernel" of an
1121 itemset, so we need to ignore the offset=zero items which are added during
1122 completion.
1123
1124 To facilitate this, we modify the "DOT" number so that "0" sorts to
1125 the end of the list in the symset, and then only compare items before
1126 the first "0".
1127
1128 ###### declarations
1129         static inline unsigned short item_num(int production, int dot)
1130         {
1131                 return production | ((31-dot) << 11);
1132         }
1133         static inline int item_prod(unsigned short item)
1134         {
1135                 return item & 0x7ff;
1136         }
1137         static inline int item_dot(unsigned short item)
1138         {
1139                 return (31-(item >> 11)) & 0x1f;
1140         }
1141
1142 For LR(1) analysis we need to compare not just the itemset in a state
1143 but also the LA sets.  As we assign each unique LA set a number, we
1144 can just compare the symset and the data values together.
1145
1146 ###### functions
1147         static int itemset_cmp(struct symset a, struct symset b,
1148                                enum grammar_type type)
1149         {
1150                 int i;
1151                 int av, bv;
1152
1153                 for (i = 0;
1154                      i < a.cnt && i < b.cnt &&
1155                      item_dot(a.syms[i]) > 0 &&
1156                      item_dot(b.syms[i]) > 0;
1157                      i++) {
1158                         int diff = a.syms[i] - b.syms[i];
1159                         if (diff)
1160                                 return diff;
1161                         if (type == LR1) {
1162                                 diff = a.data[i] - b.data[i];
1163                                 if (diff)
1164                                         return diff;
1165                         }
1166                 }
1167                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1168                         av = -1;
1169                 else
1170                         av = a.syms[i];
1171                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1172                         bv = -1;
1173                 else
1174                         bv = b.syms[i];
1175                 if (av - bv)
1176                         return av - bv;
1177                 if (type < LR1 || av == -1)
1178                         return 0;
1179                 return
1180                         a.data[i] - b.data[i];
1181         }
1182
1183 It will be helpful to know if an itemset has been "completed" or not,
1184 particularly for LALR where itemsets get merged, at which point they
1185 need to be consider for completion again.  So a `completed` flag is
1186 needed.
1187
1188 And now we can build the list of itemsets.  The lookup routine returns
1189 both a success flag and a pointer to where in the list an insert should
1190 happen, so we don't need to search a second time.
1191
1192 ###### declarations
1193         struct itemset {
1194                 struct itemset *next;
1195                 short state;
1196                 struct symset items;
1197                 struct symset go_to;
1198                 enum assoc assoc;
1199                 unsigned short precedence;
1200                 char completed;
1201         };
1202
1203 ###### grammar fields
1204         struct itemset *items;
1205         int states;
1206
1207 ###### functions
1208         static int itemset_find(struct grammar *g, struct itemset ***where,
1209                                 struct symset kernel, enum grammar_type type)
1210         {
1211                 struct itemset **ip;
1212
1213                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1214                         struct itemset *i = *ip;
1215                         int diff;
1216                         diff = itemset_cmp(i->items, kernel, type);
1217                         if (diff < 0)
1218                                 continue;
1219                         if (diff > 0)
1220                                 break;
1221                         /* found */
1222                         *where = ip;
1223                         return 1;
1224                 }
1225                 *where = ip;
1226                 return 0;
1227         }
1228
1229 Adding an itemset may require merging the LA sets if LALR analysis is
1230 happening.  If any new LA set adds any symbols that weren't in the old
1231 LA set, we clear the `completed` flag so that the dependants of this
1232 itemset will be recalculated and their LA sets updated.
1233
1234 `add_itemset` must consume the symsets it is passed, either by adding
1235 them to a data structure, of freeing them.
1236
1237         static int add_itemset(struct grammar *g, struct symset ss,
1238                                enum assoc assoc, unsigned short precedence,
1239                                enum grammar_type type)
1240         {
1241                 struct itemset **where, *is;
1242                 int i;
1243                 int found = itemset_find(g, &where, ss, type);
1244                 if (!found) {
1245                         is = calloc(1, sizeof(*is));
1246                         is->state = g->states;
1247                         g->states += 1;
1248                         is->items = ss;
1249                         is->assoc = assoc;
1250                         is->precedence = precedence;
1251                         is->next = *where;
1252                         is->go_to = INIT_DATASET;
1253                         *where = is;
1254                         return is->state;
1255                 }
1256                 is = *where;
1257                 if (type != LALR) {
1258                         symset_free(ss);
1259                         return is->state;
1260                 }
1261                 for (i = 0; i < ss.cnt; i++) {
1262                         struct symset temp = INIT_SYMSET, s;
1263                         if (ss.data[i] == is->items.data[i])
1264                                 continue;
1265                         s = set_find(g, is->items.data[i]);
1266                         symset_union(&temp, &s);
1267                         s = set_find(g, ss.data[i]);
1268                         if (symset_union(&temp, &s)) {
1269                                 is->items.data[i] = save_set(g, temp);
1270                                 is->completed = 0;
1271                         } else
1272                                 symset_free(temp);
1273                 }
1274                 symset_free(ss);
1275                 return is->state;
1276         }
1277
1278 #### The build
1279
1280 To build all the itemsets, we first insert the initial itemset made from
1281 production zero, complete each itemset, and then generate new itemsets
1282 from old until no new ones can be made.
1283
1284 Completing an itemset means finding all the items where "DOT" is
1285 followed by a nonterminal and adding "DOT=0" items for every production
1286 from that non-terminal - providing each item hasn't already been added.
1287 When we add such an item it might get inserted before the current
1288 one, so to make sure we process it we reset the loop counter to the
1289 start.
1290
1291 If LA sets are needed, the LA set for each new item is found using
1292 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1293 may be supplemented by the LA set for the item which produced the new
1294 item.
1295
1296 We also collect a set of all symbols which follow "DOT" (in `done`) as
1297 this is used in the next stage.
1298
1299 When itemsets are created we assign a precedence to the itemset from the
1300 complete item, if there is one.  We ignore the possibility of there
1301 being two and don't (currently) handle precedence in such grammars.
1302 When completing a grammar we ignore any item where DOT is followed by a
1303 terminal with a precedence lower than that for the itemset.  Unless the
1304 terminal has right associativity, we also ignore items where the
1305 terminal has the same precedence.  The result is that unwanted items are
1306 still in the itemset, but the terminal doesn't get into the go to set,
1307 so the item is ineffective.
1308
1309 ###### complete itemset
1310         for (i = 0; i < is->items.cnt; i++) {
1311                 int p = item_prod(is->items.syms[i]);
1312                 int bs = item_dot(is->items.syms[i]);
1313                 struct production *pr = g->productions[p];
1314                 int p2;
1315                 struct symbol *s;
1316                 struct symset LA = INIT_SYMSET;
1317                 unsigned short sn = 0;
1318
1319                 if (bs == pr->body_size)
1320                         continue;
1321                 s = pr->body[bs];
1322                 if (s->precedence && is->precedence &&
1323                     is->precedence > s->precedence)
1324                         /* This terminal has a low precedence and
1325                          * shouldn't be shifted
1326                          */
1327                         continue;
1328                 if (s->precedence && is->precedence &&
1329                     is->precedence == s->precedence && s->assoc != Right)
1330                         /* This terminal has a matching precedence and is
1331                          * not Right-associative, so we mustn't shift it.
1332                          */
1333                         continue;
1334                 if (symset_find(&done, s->num) < 0)
1335                         symset_add(&done, s->num, 0);
1336
1337                 if (s->type != Nonterminal)
1338                         continue;
1339                 check_again = 1;
1340                 if (type >= LALR) {
1341                         // Need the LA set.
1342                         int to_end;
1343                         add_first(pr, bs+1, &LA, g, &to_end);
1344                         if (to_end) {
1345                                 struct symset ss = set_find(g, is->items.data[i]);
1346                                 symset_union(&LA, &ss);
1347                         }
1348                         sn = save_set(g, LA);
1349                         LA = set_find(g, sn);
1350                 }
1351
1352                 /* Add productions for this symbol */
1353                 for (p2 = s->first_production;
1354                      p2 < g->production_count &&
1355                       g->productions[p2]->head == s;
1356                      p2++) {
1357                         int itm = item_num(p2, 0);
1358                         int pos = symset_find(&is->items, itm);
1359                         if (pos < 0) {
1360                                 symset_add(&is->items, itm, sn);
1361                                 /* Will have re-ordered, so start
1362                                  * from beginning again */
1363                                 i = -1;
1364                         } else if (type >= LALR) {
1365                                 struct symset ss = set_find(g, is->items.data[pos]);
1366                                 struct symset tmp = INIT_SYMSET;
1367                                 struct symset *la = &LA;
1368
1369                                 symset_union(&tmp, &ss);
1370                                 if (symset_union(&tmp, la)) {
1371                                         is->items.data[pos] = save_set(g, tmp);
1372                                         i = -1;
1373                                 } else
1374                                         symset_free(tmp);
1375                         }
1376                 }
1377         }
1378
1379 For each symbol we found (and placed in `done`) we collect all the items
1380 for which this symbol is next, and create a new itemset with "DOT"
1381 advanced over the symbol.  This is then added to the collection of
1382 itemsets (or merged with a pre-existing itemset).
1383
1384 ###### derive itemsets
1385         // Now we have a completed itemset, so we need to
1386         // compute all the 'next' itemsets and create them
1387         // if they don't exist.
1388         for (i = 0; i < done.cnt; i++) {
1389                 int j;
1390                 unsigned short state;
1391                 struct symbol *sym = g->symtab[done.syms[i]];
1392                 enum assoc assoc = Non;
1393                 unsigned short precedence = 0;
1394                 struct symset newitemset = INIT_SYMSET;
1395                 if (type >= LALR)
1396                         newitemset = INIT_DATASET;
1397
1398                 for (j = 0; j < is->items.cnt; j++) {
1399                         int itm = is->items.syms[j];
1400                         int p = item_prod(itm);
1401                         int bp = item_dot(itm);
1402                         struct production *pr = g->productions[p];
1403                         unsigned short la = 0;
1404                         int pos;
1405
1406                         if (bp == pr->body_size)
1407                                 continue;
1408                         if (pr->body[bp] != sym)
1409                                 continue;
1410
1411                         bp += 1;
1412                         if (type >= LALR)
1413                                 la = is->items.data[j];
1414                         if (bp == pr->body_size &&
1415                             pr->precedence > 0 &&
1416                             pr->precedence > precedence) {
1417                                 // new itemset is reducible and has a precedence.
1418                                 precedence = pr->precedence;
1419                                 assoc = pr->assoc;
1420                         }
1421                         pos = symset_find(&newitemset, item_num(p, bp));
1422
1423                         if (pos < 0)
1424                                 symset_add(&newitemset, item_num(p, bp), la);
1425                         else if (type >= LALR) {
1426                                 // Need to merge la set.
1427                                 int la2 = newitemset.data[pos];
1428                                 if (la != la2) {
1429                                         struct symset ss = set_find(g, la2);
1430                                         struct symset LA = INIT_SYMSET;
1431                                         symset_union(&LA, &ss);
1432                                         ss = set_find(g, la);
1433                                         if (symset_union(&LA, &ss))
1434                                                 newitemset.data[pos] = save_set(g, LA);
1435                                         else
1436                                                 symset_free(LA);
1437                                 }
1438                         }
1439                 }
1440                 state = add_itemset(g, newitemset, assoc, precedence, type);
1441                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1442                         symset_add(&is->go_to, done.syms[i], state);
1443         }
1444
1445 All that is left is to create the initial itemset from production zero, and
1446 with `TK_eof` as the LA set.
1447
1448 ###### functions
1449         static void build_itemsets(struct grammar *g, enum grammar_type type)
1450         {
1451                 struct symset first = INIT_SYMSET;
1452                 struct itemset *is;
1453                 int check_again;
1454                 unsigned short la = 0;
1455                 if (type >= LALR) {
1456                         // LA set just has eof
1457                         struct symset eof = INIT_SYMSET;
1458                         symset_add(&eof, TK_eof, 0);
1459                         la = save_set(g, eof);
1460                         first = INIT_DATASET;
1461                 }
1462                 // production 0, offset 0 (with no data)
1463                 symset_add(&first, item_num(0, 0), la);
1464                 add_itemset(g, first, Non, 0, type);
1465                 for (check_again = 0, is = g->items;
1466                      is;
1467                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1468                         int i;
1469                         struct symset done = INIT_SYMSET;
1470                         if (is->completed)
1471                                 continue;
1472                         is->completed = 1;
1473                         check_again = 1;
1474                         ## complete itemset
1475                         ## derive itemsets
1476                         symset_free(done);
1477                 }
1478         }
1479
1480 ### Completing the analysis.
1481
1482 The exact process of analysis depends on which level was requested.  For
1483 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1484 `LR1` we need first, but not follow.  For `SLR` we need both.
1485
1486 We don't build the "action" tables that you might expect as the parser
1487 doesn't use them.  They will be calculated to some extent if needed for
1488 a report.
1489
1490 Once we have built everything we allocate arrays for the two lists:
1491 symbols and itemsets.  This allows more efficient access during
1492 reporting.  The symbols are grouped as terminals, then non-terminals,
1493 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1494 Special terminals -- meaning just EOL -- are included with the
1495 non-terminals so that they are not expected by the scanner.
1496
1497 ###### grammar fields
1498         struct symbol **symtab;
1499         struct itemset **statetab;
1500         int first_nonterm;
1501
1502 ###### grammar_analyse
1503
1504         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1505         {
1506                 struct symbol *s;
1507                 struct itemset *is;
1508                 int snum = TK_reserved;
1509                 for (s = g->syms; s; s = s->next)
1510                         if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1511                                 s->num = snum;
1512                                 snum++;
1513                         }
1514                 g->first_nonterm = snum;
1515                 for (s = g->syms; s; s = s->next)
1516                         if (s->num < 0 && s->type != Virtual) {
1517                                 s->num = snum;
1518                                 snum++;
1519                         }
1520                 for (s = g->syms; s; s = s->next)
1521                         if (s->num < 0) {
1522                                 s->num = snum;
1523                                 snum++;
1524                         }
1525                 g->num_syms = snum;
1526                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1527                 for (s = g->syms; s; s = s->next)
1528                         g->symtab[s->num] = s;
1529
1530                 set_nullable(g);
1531                 if (type >= SLR)
1532                         build_first(g);
1533
1534                 if (type == SLR)
1535                         build_follow(g);
1536
1537                 build_itemsets(g, type);
1538
1539                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1540                 for (is = g->items; is ; is = is->next)
1541                         g->statetab[is->state] = is;
1542         }
1543
1544 ## Reporting on the Grammar
1545
1546 The purpose of the report is to give the grammar developer insight into
1547 how the grammar parser will work.  It is basically a structured dump of
1548 all the tables that have been generated, plus a description of any conflicts.
1549
1550 ###### grammar_report
1551         static int grammar_report(struct grammar *g, enum grammar_type type)
1552         {
1553                 report_symbols(g);
1554                 if (g->follow)
1555                         report_follow(g);
1556                 report_itemsets(g);
1557                 return report_conflicts(g, type);
1558         }
1559
1560 Firstly we have the complete list of symbols, together with the
1561 "FIRST" set if that was generated.  We add a mark to each symbol to
1562 show if it is nullable (`.`).
1563
1564 ###### functions
1565
1566         static void report_symbols(struct grammar *g)
1567         {
1568                 int n;
1569                 if (g->first)
1570                         printf("SYMBOLS + FIRST:\n");
1571                 else
1572                         printf("SYMBOLS:\n");
1573
1574                 for (n = 0; n < g->num_syms; n++) {
1575                         struct symbol *s = g->symtab[n];
1576                         if (!s)
1577                                 continue;
1578
1579                         printf(" %c%3d%c: ",
1580                                s->nullable ? '.':' ',
1581                                s->num, symtypes[s->type]);
1582                         prtxt(s->name);
1583                         if (s->precedence)
1584                                 printf(" (%d%s)", s->precedence,
1585                                        assoc_names[s->assoc]);
1586
1587                         if (g->first && s->type == Nonterminal) {
1588                                 int j;
1589                                 char c = ':';
1590                                 for (j = 0; j < g->first[n].cnt; j++) {
1591                                         printf("%c ", c);
1592                                         c = ',';
1593                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1594                                 }
1595                         }
1596                         printf("\n");
1597                 }
1598         }
1599
1600 Then we have the follow sets if they were computed.
1601
1602         static void report_follow(struct grammar *g)
1603         {
1604                 int n;
1605                 printf("FOLLOW:\n");
1606                 for (n = 0; n < g->num_syms; n++)
1607                         if (g->follow[n].cnt) {
1608                                 int j;
1609                                 char c = ':';
1610                                 printf("  ");
1611                                 prtxt(g->symtab[n]->name);
1612                                 for (j = 0; j < g->follow[n].cnt; j++) {
1613                                         printf("%c ", c);
1614                                         c = ',';
1615                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1616                                 }
1617                                 printf("\n");
1618                         }
1619         }
1620
1621 And finally the item sets.  These include the GO TO tables and, for
1622 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1623 it up a bit.  First the items, with production number and associativity.
1624
1625         static void report_item(struct grammar *g, int itm)
1626         {
1627                 int p = item_prod(itm);
1628                 int dot = item_dot(itm);
1629                 struct production *pr = g->productions[p];
1630                 int i;
1631
1632                 printf("    ");
1633                 prtxt(pr->head->name);
1634                 printf(" ->");
1635                 for (i = 0; i < pr->body_size; i++) {
1636                         printf(" %s", (dot == i ? ". ": ""));
1637                         prtxt(pr->body[i]->name);
1638                 }
1639                 if (dot == pr->body_size)
1640                         printf(" .");
1641                 printf(" [%d]", p);
1642                 if (pr->precedence && dot == pr->body_size)
1643                         printf(" (%d%s)", pr->precedence,
1644                                assoc_names[pr->assoc]);
1645                 if (dot < pr->body_size &&
1646                     pr->body[dot]->precedence) {
1647                         struct symbol *s = pr->body[dot];
1648                         printf(" [%d%s]", s->precedence,
1649                                assoc_names[s->assoc]);
1650                 }
1651                 printf("\n");
1652         }
1653
1654 The LA sets which are (possibly) reported with each item:
1655
1656         static void report_la(struct grammar *g, int lanum)
1657         {
1658                 struct symset la = set_find(g, lanum);
1659                 int i;
1660                 char c = ':';
1661
1662                 printf("        LOOK AHEAD(%d)", lanum);
1663                 for (i = 0; i < la.cnt; i++) {
1664                         printf("%c ", c);
1665                         c = ',';
1666                         prtxt(g->symtab[la.syms[i]]->name);
1667                 }
1668                 printf("\n");
1669         }
1670
1671 Then the go to sets:
1672
1673         static void report_goto(struct grammar *g, struct symset gt)
1674         {
1675                 int i;
1676                 printf("    GOTO:\n");
1677
1678                 for (i = 0; i < gt.cnt; i++) {
1679                         printf("      ");
1680                         prtxt(g->symtab[gt.syms[i]]->name);
1681                         printf(" -> %d\n", gt.data[i]);
1682                 }
1683         }
1684
1685 Now we can report all the item sets complete with items, LA sets, and GO TO.
1686
1687         static void report_itemsets(struct grammar *g)
1688         {
1689                 int s;
1690                 printf("ITEM SETS(%d)\n", g->states);
1691                 for (s = 0; s < g->states; s++) {
1692                         int j;
1693                         struct itemset *is = g->statetab[s];
1694                         printf("  Itemset %d:", s);
1695                         if (is->precedence)
1696                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1697                         printf("\n");
1698                         for (j = 0; j < is->items.cnt; j++) {
1699                                 report_item(g, is->items.syms[j]);
1700                                 if (is->items.data != NO_DATA)
1701                                         report_la(g, is->items.data[j]);
1702                         }
1703                         report_goto(g, is->go_to);
1704                 }
1705         }
1706
1707 ### Reporting conflicts
1708
1709 Conflict detection varies a lot among different analysis levels.  However
1710 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1711 LR1 are also similar as they have FOLLOW or LA sets.
1712
1713 ###### functions
1714
1715         ## conflict functions
1716
1717         static int report_conflicts(struct grammar *g, enum grammar_type type)
1718         {
1719                 int cnt = 0;
1720                 printf("Conflicts:\n");
1721                 if (type < SLR)
1722                         cnt = conflicts_lr0(g, type);
1723                 else
1724                         cnt = conflicts_slr(g, type);
1725                 if (cnt == 0)
1726                         printf(" - no conflicts\n");
1727                 return cnt;
1728         }
1729
1730 LR0 conflicts are any state which have both a reducible item and
1731 a shiftable item, or two reducible items.
1732
1733 LR05 conflicts only occur if two possible reductions exist,
1734 as shifts always over-ride reductions.
1735
1736 ###### conflict functions
1737         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1738         {
1739                 int i;
1740                 int cnt = 0;
1741                 for (i = 0; i < g->states; i++) {
1742                         struct itemset *is = g->statetab[i];
1743                         int last_reduce = -1;
1744                         int prev_reduce = -1;
1745                         int last_shift = -1;
1746                         int j;
1747                         if (!is)
1748                                 continue;
1749                         for (j = 0; j < is->items.cnt; j++) {
1750                                 int itm = is->items.syms[j];
1751                                 int p = item_prod(itm);
1752                                 int bp = item_dot(itm);
1753                                 struct production *pr = g->productions[p];
1754
1755                                 if (bp == pr->body_size) {
1756                                         prev_reduce = last_reduce;
1757                                         last_reduce = j;
1758                                         continue;
1759                                 }
1760                                 if (pr->body[bp]->type == Terminal)
1761                                         last_shift = j;
1762                         }
1763                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1764                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1765                                 report_item(g, is->items.syms[last_shift]);
1766                                 report_item(g, is->items.syms[last_reduce]);
1767                                 cnt++;
1768                         }
1769                         if (prev_reduce >= 0) {
1770                                 printf("  State %d has 2 (or more) reducible items\n", i);
1771                                 report_item(g, is->items.syms[prev_reduce]);
1772                                 report_item(g, is->items.syms[last_reduce]);
1773                                 cnt++;
1774                         }
1775                 }
1776                 return cnt;
1777         }
1778
1779 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1780 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1781 in the source of the look ahead set.
1782
1783 We build two datasets to reflect the "action" table: one which maps
1784 terminals to items where that terminal could be shifted and another
1785 which maps terminals to items that could be reduced when the terminal
1786 is in look-ahead.  We report when we get conflicts between the two.
1787
1788         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1789         {
1790                 int i;
1791                 int cnt = 0;
1792
1793                 for (i = 0; i < g->states; i++) {
1794                         struct itemset *is = g->statetab[i];
1795                         struct symset shifts = INIT_DATASET;
1796                         struct symset reduce = INIT_DATASET;
1797                         int j;
1798                         if (!is)
1799                                 continue;
1800                         /* First collect the shifts */
1801                         for (j = 0; j < is->items.cnt; j++) {
1802                                 unsigned short itm = is->items.syms[j];
1803                                 int p = item_prod(itm);
1804                                 int bp = item_dot(itm);
1805                                 struct production *pr = g->productions[p];
1806                                 struct symbol *s;
1807
1808                                 if (bp >= pr->body_size ||
1809                                     pr->body[bp]->type != Terminal)
1810                                         /* not shiftable */
1811                                         continue;
1812
1813                                 s = pr->body[bp];
1814                                 if (s->precedence && is->precedence)
1815                                         /* Precedence resolves this, so no conflict */
1816                                         continue;
1817
1818                                 if (symset_find(&shifts, s->num) < 0)
1819                                         symset_add(&shifts, s->num, itm);
1820                         }
1821                         /* Now look for reductions and conflicts */
1822                         for (j = 0; j < is->items.cnt; j++) {
1823                                 unsigned short itm = is->items.syms[j];
1824                                 int p = item_prod(itm);
1825                                 int bp = item_dot(itm);
1826                                 struct production *pr = g->productions[p];
1827
1828                                 if (bp < pr->body_size)
1829                                         continue;
1830                                 /* reducible */
1831                                 struct symset la;
1832                                 if (type == SLR)
1833                                         la = g->follow[pr->head->num];
1834                                 else
1835                                         la = set_find(g, is->items.data[j]);
1836                                 int k;
1837                                 for (k = 0; k < la.cnt; k++) {
1838                                         int pos = symset_find(&shifts, la.syms[k]);
1839                                         if (pos >= 0) {
1840                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1841                                                 cnt++;
1842                                                         prtxt(g->symtab[la.syms[k]]->name);
1843                                                 printf(":\n");
1844                                                 report_item(g, shifts.data[pos]);
1845                                                 report_item(g, itm);
1846                                         }
1847                                         pos = symset_find(&reduce, la.syms[k]);
1848                                         if (pos < 0) {
1849                                                 symset_add(&reduce, la.syms[k], itm);
1850                                                 continue;
1851                                         }
1852                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1853                                         prtxt(g->symtab[la.syms[k]]->name);
1854                                         printf(":\n");
1855                                         report_item(g, itm);
1856                                         report_item(g, reduce.data[pos]);
1857                                         cnt++;
1858                                 }
1859                         }
1860                         symset_free(shifts);
1861                         symset_free(reduce);
1862                 }
1863                 return cnt;
1864         }
1865
1866
1867 ## Generating the parser
1868
1869 The exported part of the parser is the `parse_XX` function, where the name
1870 `XX` is based on the name of the parser files.
1871
1872 This takes a `code_node`, a partially initialized `token_config`, and an
1873 optional `FILE` to send tracing to.  The `token_config` gets the list of
1874 known words added and then is used with the `code_node` to initialize the
1875 scanner.
1876
1877 `parse_XX` then calls the library function `parser_run` to actually complete
1878 the parse.  This needs the `states` table and functions to call the various
1879 pieces of code provided in the grammar file, so they are generated first.
1880
1881 ###### parser_generate
1882
1883         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1884                                struct code_node *pre_reduce)
1885         {
1886                 gen_known(f, g);
1887                 gen_non_term(f, g);
1888                 gen_goto(f, g);
1889                 gen_states(f, g);
1890                 gen_reduce(f, g, file, pre_reduce);
1891                 gen_free(f, g);
1892
1893                 fprintf(f, "#line 0 \"gen_parser\"\n");
1894                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1895                         name);
1896                 fprintf(f, "{\n");
1897                 fprintf(f, "\tstruct token_state *tokens;\n");
1898                 fprintf(f, "\tconfig->words_marks = known;\n");
1899                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1900                 fprintf(f, "\ttokens = token_open(code, config);\n");
1901                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1902                 fprintf(f, "\ttoken_close(tokens);\n");
1903                 fprintf(f, "\treturn rv;\n");
1904                 fprintf(f, "}\n\n");
1905         }
1906
1907 ### Known words table
1908
1909 The known words table is simply an array of terminal symbols.
1910 The table of nonterminals used for tracing is a similar array.
1911
1912 ###### functions
1913
1914         static void gen_known(FILE *f, struct grammar *g)
1915         {
1916                 int i;
1917                 fprintf(f, "#line 0 \"gen_known\"\n");
1918                 fprintf(f, "static const char *known[] = {\n");
1919                 for (i = TK_reserved;
1920                      i < g->num_syms && g->symtab[i]->type == Terminal;
1921                      i++)
1922                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1923                                 g->symtab[i]->name.txt);
1924                 fprintf(f, "};\n\n");
1925         }
1926
1927         static void gen_non_term(FILE *f, struct grammar *g)
1928         {
1929                 int i;
1930                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1931                 fprintf(f, "static const char *non_term[] = {\n");
1932                 for (i = TK_reserved;
1933                      i < g->num_syms;
1934                      i++)
1935                         if (g->symtab[i]->type == Nonterminal ||
1936                             g->symtab[i]->isspecial)
1937                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1938                                         g->symtab[i]->name.txt);
1939                 fprintf(f, "};\n\n");
1940         }
1941
1942 ### States and the goto tables.
1943
1944 For each state we record the goto table and details of the reducible
1945 production if there is one.
1946 Some of the details of the reducible production are stored in the
1947 `do_reduce` function to come later.  Here we store the production
1948 number, the body size (useful for stack management), and the resulting
1949 symbol (useful for knowing how to free data later).
1950
1951 The go to table is stored in a simple array of `sym` and corresponding
1952 `state`.
1953
1954 ###### exported types
1955
1956         struct lookup {
1957                 short sym;
1958                 short state;
1959         };
1960         struct state {
1961                 short go_to_cnt;
1962                 const struct lookup * go_to;
1963                 short reduce_prod;
1964                 short reduce_size;
1965                 short reduce_sym;
1966                 short result_size;
1967         };
1968
1969 ###### functions
1970
1971         static void gen_goto(FILE *f, struct grammar *g)
1972         {
1973                 int i;
1974                 fprintf(f, "#line 0 \"gen_goto\"\n");
1975                 for (i = 0; i < g->states; i++) {
1976                         struct symset gt = g->statetab[i]->go_to;
1977                         int j;
1978
1979                         if (gt.cnt == 0)
1980                                 continue;
1981                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1982                                 i);
1983                         for (j = 0; j < gt.cnt; j++)
1984                                 fprintf(f, "\t{ %d, %d },\n",
1985                                         gt.syms[j], gt.data[j]);
1986                         fprintf(f, "};\n");
1987                 }
1988         }
1989
1990         static void gen_states(FILE *f, struct grammar *g)
1991         {
1992                 int i;
1993                 fprintf(f, "#line 0 \"gen_states\"\n");
1994                 fprintf(f, "static const struct state states[] = {\n");
1995                 for (i = 0; i < g->states; i++) {
1996                         struct itemset *is = g->statetab[i];
1997                         int j, prod = -1, prod_len;
1998
1999                         for (j = 0; j < is->items.cnt; j++) {
2000                                 int itm = is->items.syms[j];
2001                                 int p = item_prod(itm);
2002                                 int bp = item_dot(itm);
2003                                 struct production *pr = g->productions[p];
2004
2005                                 if (bp < pr->body_size)
2006                                         continue;
2007                                 /* This is what we reduce - choose longest */
2008                                 if (prod < 0 || prod_len < pr->body_size) {
2009                                         prod = p;
2010                                         prod_len = pr->body_size;
2011                                 }
2012                         }
2013                         if (is->go_to.cnt)
2014                                 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2015                                         i, is->go_to.cnt, i);
2016                         else
2017                                 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2018                         if (prod >= 0) {
2019                                 struct production *pr = g->productions[prod];
2020                                 struct symbol *hd = pr->head;
2021                                 fprintf(f, "%d, %d, %d, ",
2022                                         prod, pr->body_size, pr->head->num);
2023                                 if (hd->struct_name.txt == NULL)
2024                                         fprintf(f, "0 },\n");
2025                                 else
2026                                         fprintf(f, "sizeof(struct %.*s%s) },\n",
2027                                                 hd->struct_name.len,
2028                                                 hd->struct_name.txt,
2029                                                 hd->isref ? "*" : "");
2030                         } else
2031                                 fprintf(f, "-1, -1, -1, -1 },\n");
2032                 }
2033                 fprintf(f, "};\n\n");
2034         }
2035
2036 ### The `do_reduce` function and the code
2037
2038 When the parser engine decides to reduce a production, it calls
2039 `do_reduce` which runs the code that was included with the production,
2040 if any.
2041
2042 This code needs to be able to store data somewhere.  Rather than
2043 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2044 buffer and have `do_reduce` return the size to be saved.
2045
2046 In order for the code to access "global" context, we pass in the
2047 "config" pointer that was passed to the parser function.  If the `struct
2048 token_config` is embedded in some larger structure, the reducing code
2049 can access the larger structure using pointer manipulation.  Performing
2050 that pointer manipulation, and any other common code or variables for
2051 all reduce actions, can be provided in code node called "reduce" which
2052 is passed around in `pre_reduce` which you might have already noticed.
2053
2054 The code fragments require translation when written out.  Any `$N` needs
2055 to be converted to a reference either to that buffer (if `$0`) or to the
2056 structure returned by a previous reduction.  These pointers need to be
2057 cast to the appropriate type for each access.  All this is handled in
2058 `gen_code`.
2059
2060 `gen_code` also allows symbol references to contain a '`<`' as in
2061 '`$<2`'.  This is particularly useful for references (or pointers), but
2062 can be used with structures too.  The `<` implies that the value is
2063 being moved out, so the object will not be automatically freed.  It is
2064 equivalent to assigning `NULL` to the pointer or filling a structure
2065 with zeros.
2066
2067 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2068 and possibly a number.  A number by itself (other than zero) selects a
2069 symbol from the body of the production.  A sequence of letters selects
2070 the shortest symbol in the body which contains those letters in the
2071 given order.  If a number follows the letters, then a later occurrence
2072 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2073 attached to the second occurrence of the shortest symbol which contains
2074 an `A` followed by a `B`.  If there is no unique shortest system, or if
2075 the number given is too large, then the symbol reference is not
2076 transformed, and will cause an error when the code is compiled.
2077
2078 ###### functions
2079
2080         static int textchr(struct text t, char c, int s)
2081         {
2082                 int i;
2083                 for (i = s; i < t.len; i++)
2084                         if (t.txt[i] == c)
2085                                 return i;
2086                 return -1;
2087         }
2088
2089         static int subseq_match(char *seq, int slen, struct text name)
2090         {
2091                 int st = 0;
2092                 while (slen > 0) {
2093                         st = textchr(name, *seq, st);
2094                         if (st < 0)
2095                                 return 0;
2096                         slen -= 1;
2097                         seq += 1;
2098                         st += 1;
2099                 }
2100                 return 1;
2101         }
2102
2103         static int choose_sym(char **namep, int len, struct production *p)
2104         {
2105                 char *name = *namep;
2106                 char *nam = name;
2107                 int namlen;
2108                 int n = 0;
2109                 int i, s, slen;
2110                 char c;
2111
2112                 c = *name;
2113                 while (len > 0 &&
2114                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2115                         name += 1;
2116                         len -= 1;
2117                         c = *name;
2118                 }
2119                 namlen = name-nam;
2120                 while (len > 0 && (c >= '0' && c <= '9')) {
2121                         name += 1;
2122                         len -= 1;
2123                         n = n * 10 + (c - '0');
2124                         c = *name;
2125                 }
2126                 if (namlen == 0) {
2127                         if (name == *namep || n > p->body_size)
2128                                 return -1;
2129                         *namep = name;
2130                         return n;
2131                 }
2132                 slen = 0; s = -1;
2133                 for (i = 0; i < p->body_size; i++) {
2134                         if (!subseq_match(nam, namlen, p->body[i]->name))
2135                                 continue;
2136                         if (slen == 0 || p->body[i]->name.len < slen) {
2137                                 s = i;
2138                                 slen = p->body[i]->name.len;
2139                         }
2140                         if (s >= 0 && p->body[i] != p->body[s] &&
2141                             p->body[i]->name.len == p->body[s]->name.len)
2142                                 /* not unique, so s cannot be used */
2143                                 s = -1;
2144                 }
2145                 if (s < 0)
2146                         return -1;
2147                 if (n == 0)
2148                         n = 1;
2149                 for (i = 0; i < p->body_size; i++)
2150                         if (p->body[i] == p->body[s]) {
2151                                 n -= 1;
2152                                 if (n == 0)
2153                                         break;
2154                         }
2155                 if (n > 0 || i == p->body_size)
2156                         return -1;
2157                 *namep = name;
2158                 return i + 1;
2159         }
2160
2161         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2162         {
2163                 char *c;
2164                 char *used = calloc(1, p->body_size);
2165                 int i;
2166
2167                 fprintf(f, "\t\t\t");
2168                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2169                         int n;
2170                         int use = 0;
2171                         if (*c != '$') {
2172                                 fputc(*c, f);
2173                                 if (*c == '\n')
2174                                         fputs("\t\t\t", f);
2175                                 continue;
2176                         }
2177                         c++;
2178                         if (*c == '<') {
2179                                 use = 1;
2180                                 c++;
2181                         }
2182                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2183                         if (n < 0) {
2184                                 fputc('$', f);
2185                                 if (use)
2186                                         fputc('<', f);
2187                                 fputc(*c, f);
2188                                 continue;
2189                         }
2190                         if (n == 0)
2191                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2192                                         p->head->struct_name.len,
2193                                         p->head->struct_name.txt,
2194                                         p->head->isref ? "*":"");
2195                         else if (p->body[n-1]->type == Terminal)
2196                                 fprintf(f, "(*(struct token *)body[%d])",
2197                                         n-1);
2198                         else if (p->body[n-1]->struct_name.txt == NULL)
2199                                 fprintf(f, "$%d", n);
2200                         else {
2201                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2202                                         p->body[n-1]->struct_name.len,
2203                                         p->body[n-1]->struct_name.txt,
2204                                         p->body[n-1]->isref ? "*":"", n-1);
2205                                 used[n-1] = use;
2206                         }
2207                         c -= 1;
2208                 }
2209                 fputs("\n", f);
2210                 for (i = 0; i < p->body_size; i++) {
2211                         if (p->body[i]->struct_name.txt &&
2212                             used[i]) {
2213                                 // assume this has been copied out
2214                                 if (p->body[i]->isref)
2215                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2216                                 else
2217                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2218                         }
2219                 }
2220                 free(used);
2221         }
2222
2223 ###### functions
2224
2225         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2226                                struct code_node *pre_reduce)
2227         {
2228                 int i;
2229                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2230                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2231                 fprintf(f, "{\n");
2232                 fprintf(f, "\tint ret_size = 0;\n");
2233                 if (pre_reduce)
2234                         code_node_print(f, pre_reduce, file);
2235
2236                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2237                 fprintf(f, "\tswitch(prod) {\n");
2238                 for (i = 0; i < g->production_count; i++) {
2239                         struct production *p = g->productions[i];
2240                         fprintf(f, "\tcase %d:\n", i);
2241
2242                         if (p->code.txt) {
2243                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2244                                 gen_code(p, f, g);
2245                         }
2246
2247                         if (p->head->struct_name.txt)
2248                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2249                                         p->head->struct_name.len,
2250                                         p->head->struct_name.txt,
2251                                         p->head->isref ? "*":"");
2252
2253                         fprintf(f, "\t\tbreak;\n");
2254                 }
2255                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2256         }
2257
2258 ### `do_free`
2259
2260 As each non-terminal can potentially cause a different type of data
2261 structure to be allocated and filled in, we need to be able to free it when
2262 done.
2263
2264 It is particularly important to have fine control over freeing during error
2265 recovery where individual stack frames might need to be freed.
2266
2267 For this, the grammar author is required to defined a `free_XX` function for
2268 each structure that is used by a non-terminal.  `do_free` will call whichever
2269 is appropriate given a symbol number, and will call `free` (as is
2270 appropriate for tokens) on any terminal symbol.
2271
2272 ###### functions
2273
2274         static void gen_free(FILE *f, struct grammar *g)
2275         {
2276                 int i;
2277
2278                 fprintf(f, "#line 0 \"gen_free\"\n");
2279                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2280                 fprintf(f, "{\n");
2281                 fprintf(f, "\tif (!asn) return;\n");
2282                 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2283                 /* Need to handle special terminals too */
2284                 for (i = 0; i < g->num_syms; i++) {
2285                         struct symbol *s = g->symtab[i];
2286                         if (i >= g->first_nonterm && s->type == Terminal &&
2287                             s->isspecial)
2288                                 fprintf(f, " || sym == %d", s->num);
2289                 }
2290                 fprintf(f, ") {\n");
2291                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2292                 fprintf(f, "\tswitch(sym) {\n");
2293
2294                 for (i = 0; i < g->num_syms; i++) {
2295                         struct symbol *s = g->symtab[i];
2296                         if (!s ||
2297                             s->type != Nonterminal ||
2298                             s->struct_name.txt == NULL)
2299                                 continue;
2300
2301                         fprintf(f, "\tcase %d:\n", s->num);
2302                         if (s->isref) {
2303                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2304                                         s->struct_name.len,
2305                                         s->struct_name.txt);
2306                                 fprintf(f, "\t\tfree(asn);\n");
2307                         } else
2308                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2309                                         s->struct_name.len,
2310                                         s->struct_name.txt);
2311                         fprintf(f, "\t\tbreak;\n");
2312                 }
2313                 fprintf(f, "\t}\n}\n\n");
2314         }
2315
2316 ## The main routine.
2317
2318 There are three key parts to the "main" function of parsergen: processing
2319 the arguments, loading the grammar file, and dealing with the grammar.
2320
2321 ### Argument processing.
2322
2323 Command line options allow the selection of analysis mode, name of output
2324 file, and whether or not a report should be generated.  By default we create
2325 a report only if no code output was requested.
2326
2327 The `parse_XX` function name uses the basename of the output file which
2328 should not have a suffix (`.c`).  `.c` is added to the given name for the
2329 code, and `.h` is added for the header (if header text is specifed in the
2330 grammar file).
2331
2332 ###### includes
2333         #include <getopt.h>
2334
2335 ###### declarations
2336         static const struct option long_options[] = {
2337                 { "LR0",        0, NULL, '0' },
2338                 { "LR05",       0, NULL, '5' },
2339                 { "SLR",        0, NULL, 'S' },
2340                 { "LALR",       0, NULL, 'L' },
2341                 { "LR1",        0, NULL, '1' },
2342                 { "tag",        1, NULL, 't' },
2343                 { "report",     0, NULL, 'R' },
2344                 { "output",     1, NULL, 'o' },
2345                 { NULL,         0, NULL, 0   }
2346         };
2347         const char *options = "05SL1t:Ro:";
2348
2349 ###### process arguments
2350         int opt;
2351         char *outfile = NULL;
2352         char *infile;
2353         char *name;
2354         char *tag = NULL;
2355         int report = 1;
2356         enum grammar_type type = LR05;
2357         while ((opt = getopt_long(argc, argv, options,
2358                                   long_options, NULL)) != -1) {
2359                 switch(opt) {
2360                 case '0':
2361                         type = LR0; break;
2362                 case '5':
2363                         type = LR05; break;
2364                 case 'S':
2365                         type = SLR; break;
2366                 case 'L':
2367                         type = LALR; break;
2368                 case '1':
2369                         type = LR1; break;
2370                 case 'R':
2371                         report = 2; break;
2372                 case 'o':
2373                         outfile = optarg; break;
2374                 case 't':
2375                         tag = optarg; break;
2376                 default:
2377                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2378                         exit(1);
2379                 }
2380         }
2381         if (optind < argc)
2382                 infile = argv[optind++];
2383         else {
2384                 fprintf(stderr, "No input file given\n");
2385                 exit(1);
2386         }
2387         if (outfile && report == 1)
2388                 report = 0;
2389         name = outfile;
2390         if (name && strchr(name, '/'))
2391                 name = strrchr(name, '/')+1;
2392
2393         if (optind < argc) {
2394                 fprintf(stderr, "Excess command line arguments\n");
2395                 exit(1);
2396         }
2397
2398 ### Loading the grammar file
2399
2400 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2401 map it.
2402
2403 Once we have extracted the code (with `mdcode`) we expect to find three
2404 or four sections: header, code, grammar, reduce.
2405 Anything else that is not excluded by the `--tag` option is an error.
2406
2407 "header", "code", and "reduce" are optional, though it is hard to build
2408 a working parser without the first two.  "grammar" must be provided.
2409
2410 ###### includes
2411         #include <fcntl.h>
2412         #include <sys/mman.h>
2413         #include <errno.h>
2414
2415 ###### functions
2416         static int errs;
2417         static void pr_err(char *msg)
2418         {
2419                 errs++;
2420                 fprintf(stderr, "%s\n", msg);
2421         }
2422
2423 ###### load file
2424         struct section *table;
2425         int fd;
2426         int len;
2427         char *file;
2428         fd = open(infile, O_RDONLY);
2429         if (fd < 0) {
2430                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2431                         infile, strerror(errno));
2432                 exit(1);
2433         }
2434         len = lseek(fd, 0, 2);
2435         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2436         table = code_extract(file, file+len, pr_err);
2437
2438         struct code_node *hdr = NULL;
2439         struct code_node *code = NULL;
2440         struct code_node *gram = NULL;
2441         struct code_node *pre_reduce = NULL;
2442         for (s = table; s; s = s->next) {
2443                 struct text sec = s->section;
2444                 if (tag && !strip_tag(&sec, tag))
2445                         continue;
2446                 if (text_is(sec, "header"))
2447                         hdr = s->code;
2448                 else if (text_is(sec, "code"))
2449                         code = s->code;
2450                 else if (text_is(sec, "grammar"))
2451                         gram = s->code;
2452                 else if (text_is(sec, "reduce"))
2453                         pre_reduce = s->code;
2454                 else {
2455                         fprintf(stderr, "Unknown content section: %.*s\n",
2456                                 s->section.len, s->section.txt);
2457                         rv |= 2;
2458                 }
2459         }
2460
2461 ### Processing the input
2462
2463 As we need to append an extention to a filename and then open it for
2464 writing, and we need to do this twice, it helps to have a separate function.
2465
2466 ###### functions
2467
2468         static FILE *open_ext(char *base, char *ext)
2469         {
2470                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2471                 FILE *f;
2472                 strcat(strcpy(fn, base), ext);
2473                 f = fopen(fn, "w");
2474                 free(fn);
2475                 return f;
2476         }
2477
2478 If we can read the grammar, then we analyse and optionally report on it.  If
2479 the report finds conflicts we will exit with an error status.
2480
2481 ###### process input
2482         struct grammar *g = NULL;
2483         if (gram == NULL) {
2484                 fprintf(stderr, "No grammar section provided\n");
2485                 rv |= 2;
2486         } else {
2487                 g = grammar_read(gram);
2488                 if (!g) {
2489                         fprintf(stderr, "Failure to parse grammar\n");
2490                         rv |= 2;
2491                 }
2492         }
2493         if (g) {
2494                 grammar_analyse(g, type);
2495                 if (report)
2496                         if (grammar_report(g, type))
2497                                 rv |= 1;
2498         }
2499
2500 If a "headers" section is defined, we write it out and include a declaration
2501 for the `parse_XX` function so it can be used from separate code.
2502
2503         if (rv == 0 && hdr && outfile) {
2504                 FILE *f = open_ext(outfile, ".h");
2505                 if (f) {
2506                         code_node_print(f, hdr, infile);
2507                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2508                                 name);
2509                         fclose(f);
2510                 } else {
2511                         fprintf(stderr, "Cannot create %s.h\n",
2512                                 outfile);
2513                         rv |= 4;
2514                 }
2515         }
2516
2517 And if all goes well and an output file was provided, we create the `.c`
2518 file with the code section (if any) and the parser tables and function.
2519
2520         if (rv == 0 && outfile) {
2521                 FILE *f = open_ext(outfile, ".c");
2522                 if (f) {
2523                         if (code)
2524                                 code_node_print(f, code, infile);
2525                         gen_parser(f, g, infile, name, pre_reduce);
2526                         fclose(f);
2527                 } else {
2528                         fprintf(stderr, "Cannot create %s.c\n",
2529                                 outfile);
2530                         rv |= 4;
2531                 }
2532         }
2533
2534 And that about wraps it up.  We need to set the locale so that UTF-8 is
2535 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2536
2537 ###### File: parsergen.mk
2538         parsergen : parsergen.o libscanner.o libmdcode.o
2539                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2540
2541 ###### includes
2542         #include <locale.h>
2543
2544 ###### main
2545
2546         int main(int argc, char *argv[])
2547         {
2548                 struct section *s;
2549                 int rv = 0;
2550
2551                 setlocale(LC_ALL,"");
2552
2553                 ## process arguments
2554                 ## load file
2555                 ## process input
2556
2557                 return rv;
2558         }
2559
2560 ## The SHIFT/REDUCE parser
2561
2562 Having analysed the grammar and generated all the tables, we only need
2563 the shift/reduce engine to bring it all together.
2564
2565 ### Goto table lookup
2566
2567 The parser generator has nicely provided us with goto tables sorted by
2568 symbol number.  We need a binary search function to find a symbol in the
2569 table.
2570
2571 ###### parser functions
2572
2573         static int search(const struct state *l, int sym)
2574         {
2575                 int lo = 0;
2576                 int hi = l->go_to_cnt;
2577
2578                 if (hi == 0)
2579                         return -1;
2580                 while (lo + 1 < hi) {
2581                         int mid = (lo + hi) / 2;
2582                         if (l->go_to[mid].sym <= sym)
2583                                 lo = mid;
2584                         else
2585                                 hi = mid;
2586                 }
2587                 if (l->go_to[lo].sym == sym)
2588                         return l->go_to[lo].state;
2589                 else
2590                         return -1;
2591         }
2592
2593 ### Memory allocation
2594
2595 The `scanner` returns tokens in a local variable - we want them in allocated
2596 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2597 make an allocated copy as required.
2598
2599 ###### parser includes
2600         #include <memory.h>
2601
2602 ###### parser functions
2603
2604         static struct token *tok_copy(struct token tk)
2605         {
2606                 struct token *new = malloc(sizeof(*new));
2607                 *new = tk;
2608                 return new;
2609         }
2610
2611 ### The state stack.
2612
2613 The core data structure for the parser is the stack.  This tracks all
2614 the symbols that have been recognised or partially recognised.
2615
2616 The stack usually won't grow very large - maybe a few tens of entries.
2617 So we dynamically grow an array as required but never bother to
2618 shrink it down again.
2619
2620 We keep the stack as two separate allocations.  One, `asn_stack` stores
2621 the "abstract syntax nodes" which are created by each reduction.  When
2622 we call `do_reduce` we need to pass an array of the `asn`s of the body
2623 of the production, and by keeping a separate `asn` stack, we can just
2624 pass a pointer into this stack.
2625
2626 The other allocation stores all other stack fields of which there are
2627 several.  The `state` is the most important one and guides the parsing
2628 process.  The `sym` is nearly unnecessary as it is implicit in the
2629 state.  However when we want to free entries from the `asn_stack`, it
2630 helps to know what type they are so we can call the right freeing
2631 function.  The symbol leads us to the right free function through
2632 `do_free`.
2633
2634 The stack is most properly seen as alternating states and symbols -
2635 states, like the 'DOT' in items, are between symbols.  Each frame in
2636 our stack holds a state and the symbol that was before it.  The
2637 bottom of stack holds the start state but no symbol, as nothing came
2638 before the beginning.  As we need to store some value, `TK_eof` is used
2639 to mark the beginning of the file as well as the end.
2640
2641 ###### parser functions
2642
2643         struct parser {
2644                 struct frame {
2645                         short state;
2646                         short sym;
2647                 } *stack;
2648                 void **asn_stack;
2649                 int stack_size;
2650                 int tos;
2651         };
2652
2653 #### Shift and pop
2654
2655 Two operations are needed on the stack - shift (which is like push) and pop.
2656
2657 Shift applies not only to terminals but also to non-terminals.  When we
2658 reduce a production we will pop off frames corresponding to the body
2659 symbols, then push on a frame for the head of the production.  This last
2660 is exactly the same process as shifting in a terminal so we use the same
2661 function for both.  In both cases we provide the symbol.  The state is
2662 deduced from the current top-of-stack state and the new symbol.
2663
2664 To simplify other code we arrange for `shift` to fail if there is no `goto`
2665 state for the symbol.  This is useful in basic parsing due to our design
2666 that we shift when we can, and reduce when we cannot.  So the `shift`
2667 function reports if it could.
2668
2669 `shift` is also used to push state zero onto the stack, so if the
2670 stack is empty, it always chooses zero as the next state.
2671
2672 So `shift` finds the next state.  If that succeeds it extends the
2673 allocations if needed and pushes all the information onto the stacks.
2674
2675 ###### parser functions
2676
2677         static int shift(struct parser *p,
2678                          short sym, void *asn,
2679                          const struct state states[])
2680         {
2681                 struct frame next = {0};
2682                 int newstate = p->tos
2683                         ? search(&states[p->stack[p->tos-1].state],
2684                                  sym)
2685                         : 0;
2686                 if (newstate < 0)
2687                         return 0;
2688                 if (p->tos >= p->stack_size) {
2689                         p->stack_size += 10;
2690                         p->stack = realloc(p->stack, p->stack_size
2691                                            * sizeof(p->stack[0]));
2692                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2693                                            * sizeof(p->asn_stack[0]));
2694                 }
2695                 next.sym = sym;
2696                 next.state = newstate;
2697
2698                 p->stack[p->tos] = next;
2699                 p->asn_stack[p->tos] = asn;
2700                 p->tos++;
2701                 return 1;
2702         }
2703
2704 `pop` primarily moves the top of stack (`tos`) back down the required
2705 amount and frees any `asn` entries that need to be freed.  It is called
2706 _after_ we reduce a production, just before we `shift` the nonterminal
2707 in.
2708
2709 ###### parser functions
2710
2711         static void pop(struct parser *p, int num,
2712                         void(*do_free)(short sym, void *asn))
2713         {
2714                 int i;
2715
2716                 p->tos -= num;
2717                 for (i = 0; i < num; i++)
2718                         do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2719         }
2720
2721 ### The heart of the parser.
2722
2723 Now we have the parser.  For each token we might shift it, trigger a
2724 reduction, or start error handling.  2D tokens (IN, OUT, EOL) also need
2725 to be handled.
2726
2727 We return whatever `asn` was returned by reducing production zero.
2728
2729 When we find `TK_in` and `TK_out` tokens which report indents we need
2730 to handle them directly as the grammar cannot express what we want to
2731 do with them.
2732
2733 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2734 record how many indents there are following the previous token.
2735
2736 `TK_out` tokens must be canceled against an indent count
2737 within the stack.  If we can reduce some symbols that are all since
2738 the most recent indent, then we do that first.  If the minimum prefix
2739 of the current state then extends back before the most recent indent,
2740 that indent can be cancelled.  If the minimum prefix is shorter then
2741 the indent had ended prematurely and we must start error handling, which
2742 is still a work-in-progress.
2743
2744 `TK_newline` tokens are ignored unless the top stack frame records
2745 that they are permitted.  In that case they will not be considered for
2746 shifting if it is possible to reduce some symbols that are all since
2747 the most recent start of line.  This is how a newline forcibly
2748 terminates any line-like structure - we try to reduce down to at most
2749 one symbol for each line where newlines are allowed.
2750 A consequence of this is that a rule like
2751
2752 ###### Example: newlines - broken
2753
2754         Newlines ->
2755                 | NEWLINE Newlines
2756         IfStatement -> Newlines if ....
2757
2758 cannot work, as the NEWLINE will never be shifted as the empty string
2759 will be reduced first.  Optional sets of newlines need to be include
2760 in the thing that preceed:
2761
2762 ###### Example: newlines - works
2763
2764         If -> if
2765                 | NEWLINE If
2766         IfStatement -> If ....
2767
2768 Here the NEWLINE will be shifted because nothing can be reduced until
2769 the `if` is seen.
2770
2771 We have already discussed the bulk of the handling of a "reduce" action,
2772 with the `pop()` and `shift()` functions doing much of the work.  There
2773 is a little more complexity needed to manage storage for the asn (Abstract
2774 Syntax Node), and also a test of whether the reduction is permitted.
2775
2776 When we try to shift the result of reducing production-zero, it will
2777 fail because there is no next state.  In this case the asn will not have
2778 been stored on the stack, so it get stored in the `ret` variable, and we
2779 report that that input has been accepted.
2780
2781 ###### parser vars
2782
2783         void *ret = NULL;
2784
2785 ###### try reduce
2786
2787         if (states[tos->state].reduce_prod >= 0) {
2788                 void **body;
2789                 void *res;
2790                 const struct state *nextstate = &states[tos->state];
2791                 int prod = nextstate->reduce_prod;
2792                 int size = nextstate->reduce_size;
2793                 int res_size = nextstate->result_size;
2794
2795                 body = p.asn_stack + (p.tos - size);
2796                 res = res_size ? calloc(1, res_size) : NULL;
2797                 res_size = do_reduce(prod, body, config, res);
2798                 if (res_size != nextstate->result_size)
2799                         abort();
2800                 pop(&p, size, do_free);
2801                 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2802                         accepted = 1;
2803                         ret = res;
2804                         parser_trace_action(trace, "Accept");
2805                 } else
2806                         parser_trace_action(trace, "Reduce");
2807                 continue;
2808         }
2809
2810 If we can neither shift nor reduce we have an error to handle.  There
2811 are two possible responses to an error: we can pop single frames off the
2812 stack until we find a frame that can shift `TK_error`, or we can discard
2813 the current look-ahead symbol.
2814
2815 When we first see an error we do the first of these and set a flag to
2816 record that we are processing an error.  If the normal shift/reduce
2817 tests fail to find that anything can be done from that state, we start
2818 dropping tokens until either we manage to shift one, or reach end-of-file.
2819
2820 ###### parser vars
2821
2822         int in_err = 0;
2823
2824 ###### did shift
2825
2826         in_err = 0;
2827
2828 ###### handle error
2829
2830         if (in_err) {
2831                 parser_trace_action(trace, "DISCARD");
2832                 if (tk->num == TK_eof)
2833                         break;
2834                 free(tk);
2835                 tk = NULL;
2836         } else {
2837                 struct token *err_tk;
2838
2839                 parser_trace_action(trace, "ERROR");
2840
2841                 err_tk = tok_copy(*tk);
2842                 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2843                         // discard this state
2844                         pop(&p, 1, do_free);
2845                 if (p.tos == 0) {
2846                         free(err_tk);
2847                         // no state accepted TK_error
2848                         break;
2849                 }
2850                 in_err = 1;
2851         }
2852
2853 ###### parser includes
2854         #include "parser.h"
2855
2856 ###### parser_run
2857
2858         void *parser_run(struct token_state *tokens,
2859                          const struct state states[],
2860                          int (*do_reduce)(int, void**, struct token_config*, void*),
2861                          void (*do_free)(short, void*),
2862                          FILE *trace, const char *non_term[],
2863                          struct token_config *config)
2864         {
2865                 struct parser p = { 0 };
2866                 struct token *tk = NULL;
2867                 int accepted = 0;
2868                 ## parser vars
2869
2870                 shift(&p, TK_eof, NULL, states);
2871                 while (!accepted && p.tos > 0) {
2872                         struct frame *tos = &p.stack[p.tos-1];
2873                         if (!tk)
2874                                 tk = tok_copy(token_next(tokens));
2875                         parser_trace(trace, &p,
2876                                      tk, states, non_term, config->known_count);
2877
2878                         if (tk->num == TK_in) {
2879                                 free(tk);
2880                                 tk = NULL;
2881                                 parser_trace_action(trace, "Record");
2882                                 continue;
2883                         }
2884                         if (tk->num == TK_out) {
2885                                 if (1) {
2886                                         // OK to cancel
2887
2888                                         free(tk);
2889                                         tk = NULL;
2890                                         parser_trace_action(trace, "Cancel");
2891                                         continue;
2892                                 }
2893                                 // fall through to error handling as both SHIFT and REDUCE
2894                                 // will fail.
2895                         }
2896                         if (tk->num == TK_newline) {
2897                                 if (1) {
2898                                         free(tk);
2899                                         tk = NULL;
2900                                         parser_trace_action(trace, "Discard");
2901                                         continue;
2902                                 }
2903                         }
2904                         if (shift(&p, tk->num, tk, states)) {
2905                                 tk = NULL;
2906                                 parser_trace_action(trace, "Shift");
2907                                 ## did shift
2908                                 continue;
2909                         }
2910
2911                         ## try reduce
2912                         ## handle error
2913                 }
2914                 free(tk);
2915                 pop(&p, p.tos, do_free);
2916                 free(p.asn_stack);
2917                 free(p.stack);
2918                 return ret;
2919         }
2920
2921 ###### exported functions
2922         void *parser_run(struct token_state *tokens,
2923                          const struct state states[],
2924                          int (*do_reduce)(int, void**, struct token_config*, void*),
2925                          void (*do_free)(short, void*),
2926                          FILE *trace, const char *non_term[],
2927                          struct token_config *config);
2928
2929 ### Tracing
2930
2931 Being able to visualize the parser in action can be invaluable when
2932 debugging the parser code, or trying to understand how the parse of a
2933 particular grammar progresses.  The stack contains all the important
2934 state, so just printing out the stack every time around the parse loop
2935 can make it possible to see exactly what is happening.
2936
2937 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2938 are easily deduced from the change between consecutive lines, and the
2939 details of each state can be found by cross referencing the states list
2940 in the stack with the "report" that parsergen can generate.
2941
2942 For terminal symbols, we just dump the token.  For non-terminals we
2943 print the name of the symbol.  The look ahead token is reported at the
2944 end inside square brackets.
2945
2946 ###### parser functions
2947
2948         static char *reserved_words[] = {
2949                 [TK_error]        = "ERROR",
2950                 [TK_in]           = "IN",
2951                 [TK_out]          = "OUT",
2952                 [TK_newline]      = "NEWLINE",
2953                 [TK_eof]          = "$eof",
2954         };
2955
2956         void parser_trace(FILE *trace, struct parser *p,
2957                           struct token *tk, const struct state states[],
2958                           const char *non_term[], int knowns)
2959         {
2960                 int i;
2961                 if (!trace)
2962                         return;
2963                 for (i = 0; i < p->tos; i++) {
2964                         struct frame *f = &p->stack[i];
2965                         if (i) {
2966                                 int sym = f->sym;
2967                                 if (sym < TK_reserved &&
2968                                     reserved_words[sym] != NULL)
2969                                         fputs(reserved_words[sym], trace);
2970                                 else if (sym < TK_reserved + knowns) {
2971                                         struct token *t = p->asn_stack[i];
2972                                         text_dump(trace, t->txt, 20);
2973                                 } else
2974                                         fputs(non_term[sym - TK_reserved - knowns],
2975                                               trace);
2976                                 fputs(" ", trace);
2977                         }
2978                         fprintf(trace, "(%d) ", f->state);
2979                 }
2980                 fprintf(trace, "[");
2981                 if (tk->num < TK_reserved &&
2982                     reserved_words[tk->num] != NULL)
2983                         fputs(reserved_words[tk->num], trace);
2984                 else
2985                         text_dump(trace, tk->txt, 20);
2986                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
2987         }
2988
2989         void parser_trace_action(FILE *trace, char *action)
2990         {
2991                 if (trace)
2992                         fprintf(trace, " - %s\n", action);
2993         }
2994
2995 # A Worked Example
2996
2997 The obvious example for a parser is a calculator.
2998
2999 As `scanner` provides number parsing function using `libgmp` it is not much
3000 work to perform arbitrary rational number calculations.
3001
3002 This calculator takes one expression, or an equality test, per line.
3003 The results are printed and if any equality test fails, the program
3004 exits with an error.
3005
3006 ###### File: parsergen.mk
3007         calc.c calc.h : parsergen parsergen.mdc
3008                 ./parsergen --tag calc -o calc parsergen.mdc
3009         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3010                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3011         calctest : calc
3012                 ./calc parsergen.mdc
3013         demos :: calctest
3014
3015 # calc: header
3016
3017         #include "parse_number.h"
3018         // what do we use for a demo-grammar?  A calculator of course.
3019         struct number {
3020                 mpq_t val;
3021                 char tail[2];
3022                 int err;
3023         };
3024
3025 # calc: code
3026
3027         #include <stdlib.h>
3028         #include <unistd.h>
3029         #include <fcntl.h>
3030         #include <sys/mman.h>
3031         #include <stdio.h>
3032         #include <malloc.h>
3033         #include <gmp.h>
3034         #include <string.h>
3035         #include "mdcode.h"
3036         #include "scanner.h"
3037         #include "parser.h"
3038
3039         #include "calc.h"
3040
3041         static void free_number(struct number *n)
3042         {
3043                 mpq_clear(n->val);
3044                 free(n);
3045         }
3046
3047         static int text_is(struct text t, char *s)
3048         {
3049                 return (strlen(s) == t.len &&
3050                         strncmp(s, t.txt, t.len) == 0);
3051         }
3052
3053         int main(int argc, char *argv[])
3054         {
3055                 int fd = open(argv[1], O_RDONLY);
3056                 int len = lseek(fd, 0, 2);
3057                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3058                 struct section *table = code_extract(file, file+len, NULL);
3059                 struct section *s;
3060                 struct token_config config = {
3061                         .ignored = (1 << TK_line_comment)
3062                                  | (1 << TK_in)
3063                                  | (1 << TK_out),
3064                         .number_chars = ".,_+-",
3065                         .word_start = "",
3066                         .word_cont = "",
3067                 };
3068                 for (s = table; s; s = s->next)
3069                         if (text_is(s->section, "example: input"))
3070                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3071                 while (table) {
3072                         struct section *t = table->next;
3073                         code_free(table->code);
3074                         free(table);
3075                         table = t;
3076                 }
3077                 exit(0);
3078         }
3079
3080 # calc: grammar
3081
3082         $LEFT + -
3083         $LEFT * / //
3084
3085         Session -> Session Line
3086                 | Line
3087
3088         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3089                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3090                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3091                                         mpf_clear(fl);
3092                                         }
3093                                      }$
3094                 | Expression = Expression NEWLINE ${
3095                         if (mpq_equal($1.val, $3.val))
3096                                 gmp_printf("Both equal %Qd\n", $1.val);
3097                         else {
3098                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3099                                         $1.val, $3.val);
3100                                 exit(1);
3101                         }
3102                 }$
3103                 | NEWLINE ${ printf("Blank line\n"); }$
3104                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3105
3106         $number
3107         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3108                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3109                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3110                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3111                 | Expression // Expression ${ {
3112                         mpz_t z0, z1, z2;
3113                         mpq_init($0.val);
3114                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3115                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3116                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3117                         mpz_tdiv_q(z0, z1, z2);
3118                         mpq_set_z($0.val, z0);
3119                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3120                 } }$
3121                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3122                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3123
3124 # example: input
3125
3126         355/113
3127         3.1415926535 - 355/113
3128         2 + 4 * 5
3129         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3130         10 * 9 / 2
3131         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3132
3133         355//113
3134
3135         error