ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8 Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
   9 be used to describe the grammar and the parser can selectively ignore
  10 these where they aren't relevant.
  11
  12 There are several distinct sections.
  13
  14 1. `grammar_read` will read a grammar from a literate-code file and
  15    store the productions, symbols, and code fragments.
  16 2. `grammar_analyse` will take that grammar and build LR parsing
  17    states and look-ahead tables.
  18 3. `grammar_report` will present the details of the analysis
  19    in a readable format and will report any conflicts.
  20 4. `parser_generate` will write out C code files with various
  21    tables and with the code fragments arranged in useful places.
  22 5. `parser_run` is a library function intended to be linked together
  23    with the generated parser tables to complete the implementation of
  24    a parser.
  25 6. Finally `calc` is a test grammar for a simple calculator.  The
  26    `parsergen` program built from the C code in this file can extract
  27    that grammar directly from this file and process it.
  28
  29 ###### File: parsergen.c
  30         #include <unistd.h>
  31         #include <stdlib.h>
  32         #include <stdio.h>
  33         ## includes
  34         ## forward declarations
  35         ## declarations
  36         ## functions
  37         ## grammar_read
  38         ## grammar_analyse
  39         ## grammar_report
  40         ## parser_generate
  41         ## main
  42 ###### File: parser.h
  43         ## exported types
  44         ## exported functions
  45 ###### File: libparser.c
  46         #include <unistd.h>
  47         #include <stdlib.h>
  48         #include <stdio.h>
  49         ## parser includes
  50         ## parser functions
  51         ## parser_run
  52 ###### File: parsergen.mk
  53         CFLAGS += -Wall -g
  54         all :: parsergen calc
  55         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  56                 ./md2c parsergen.mdc
  57
  58 ## Reading the grammar
  59
  60 The grammar must be stored in a markdown literate code file as parsed by
  61 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  62 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  63 present, will be literally copied into the generated `.c` and `.h`
  64 files.  The third is required and contains the grammar which is
  65 tokenised with "[scanner][]".  `reduce` can provide variable
  66 declarations and common intialization code for all the reduction code
  67 fragments in the grammar.
  68
  69 If the `--tag` option is given, then any top level header that doesn't
  70 start with the tag is ignored, and the tag is striped from the rest.  So
  71 `--tag Foo` means that the four sections expected are `Foo: header`,
  72 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  73 to extract the sample calculator from this file.
  74
  75 [mdcode]: mdcode.html
  76 [scanner]: scanner.html
  77
  78 ###### includes
  79         #include "mdcode.h"
  80         #include "scanner.h"
  81
  82 ###### parser includes
  83         #include "mdcode.h"
  84         #include "scanner.h"
  85
  86 The grammar contains production sets, precedence/associativity
  87 declarations, general terminal declarations, and data type declarations.
  88 These are all parsed with _ad hoc_ parsing as we don't have a parser
  89 generator yet.
  90
  91 The precedence and associativity can be set for each production, and
  92 can be inherited from symbols.  The data type (either structure or a
  93 reference to a structure) is potentially defined for each non-terminal
  94 and describes what C structure is used to store information about each
  95 symbol.
  96
  97 ###### declarations
  98         enum assoc {Left, Right, Non};
  99         char *assoc_names[] = {"Left","Right","Non"};
 100
 101         struct symbol {
 102                 struct text struct_name;
 103                 int isref;
 104                 enum assoc assoc;
 105                 unsigned short precedence;
 106                 ## symbol fields
 107         };
 108         struct production {
 109                 unsigned short precedence;
 110                 enum assoc assoc;
 111                 ## production fields
 112         };
 113         struct grammar {
 114                 ## grammar fields
 115         };
 116
 117 The strings reported by `mdcode` and `scanner` are `struct text` which
 118 have length rather than being nul terminated.  To help with printing and
 119 comparing we define `text_is` and `prtxt`, which should possibly go in
 120 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 121 strings which might contain control characters.
 122
 123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 124 because that is what we need to detect tags.
 125
 126 ###### functions
 127         static int text_is(struct text t, char *s)
 128         {
 129                 return (strlen(s) == t.len &&
 130                         strncmp(s, t.txt, t.len) == 0);
 131         }
 132         static void prtxt(struct text t)
 133         {
 134                 printf("%.*s", t.len, t.txt);
 135         }
 136
 137         static int strip_tag(struct text *t, char *tag)
 138         {
 139                 int skip = strlen(tag) + 1;
 140                 if (skip >= t->len ||
 141                     strncmp(t->txt, tag, skip-1) != 0 ||
 142                     t->txt[skip-1] != ':')
 143                         return 0;
 144                 while (skip < t->len && t->txt[skip] == ' ')
 145                         skip++;
 146                 t->len -= skip;
 147                 t->txt += skip;
 148                 return 1;
 149         }
 150
 151 ### Symbols
 152
 153 Productions are comprised primarily of symbols - terminal and
 154 non-terminal.  We do not make a syntactic distinction between the two,
 155 though non-terminals must be identifiers while terminals can also be
 156 marks.  Non-terminal symbols are those which appear in the head of a
 157 production, terminal symbols are those which don't.  There are also
 158 "virtual" symbols used for precedence marking discussed later, and
 159 sometimes we won't know what type a symbol is yet.
 160
 161 To help with code safety it is possible to declare the terminal symbols.
 162 If this is done, then any symbol used in a production that does not
 163 appear in the head of any production and is not declared as a terminal
 164 is treated as an error.
 165
 166 ###### forward declarations
 167         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 168         char *symtypes = "UVTN";
 169 ###### symbol fields
 170         enum symtype type;
 171 ###### grammar fields
 172         int terminals_declared;
 173
 174 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 175 table of known symbols and the resulting parser will report them as
 176 `TK_reserved + N`.  A small set of identifiers are reserved for the
 177 different token types that `scanner` can report, and an even smaller set
 178 are reserved for a special token that the parser can generate (`EOL`) as
 179 will be described later.  This latter set cannot use predefined numbers,
 180 so they are marked as `isspecial` for now and will get assigned a number
 181 with the non-terminals later.
 182
 183 ###### declarations
 184
 185         static struct { int num; char *name; } reserved_words[] = {
 186                 { TK_error,        "ERROR" },
 187                 { TK_number,       "NUMBER" },
 188                 { TK_ident,        "IDENTIFIER" },
 189                 { TK_mark,         "MARK" },
 190                 { TK_string,       "STRING" },
 191                 { TK_multi_string, "MULTI_STRING" },
 192                 { TK_in,           "IN" },
 193                 { TK_out,          "OUT" },
 194                 { TK_newline,      "NEWLINE" },
 195                 { TK_eof,          "$eof" },
 196                 { -1,              "EOL" },
 197         };
 198
 199 ###### symbol fields
 200         short num;
 201         unsigned int isspecial:1;
 202
 203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 204 recognised.  The former is automatically expected at the end of the text
 205 being parsed. The latter are always ignored by the parser.
 206
 207 All of these symbols are stored in a simple symbol table.  We use the
 208 `struct text` from `mdcode` to store the name and link them together
 209 into a sorted list using an insertion sort.
 210
 211 We don't have separate `find` and `insert` functions as any symbol we
 212 find needs to be remembered.  We simply expect `find` to always return a
 213 symbol, but its type might be `Unknown`.
 214
 215 ###### includes
 216         #include <string.h>
 217
 218 ###### symbol fields
 219         struct text name;
 220         struct symbol *next;
 221
 222 ###### grammar fields
 223         struct symbol *syms;
 224         int num_syms;
 225
 226 ###### functions
 227         static struct symbol *sym_find(struct grammar *g, struct text s)
 228         {
 229                 struct symbol **l = &g->syms;
 230                 struct symbol *n;
 231                 int cmp = 1;
 232
 233                 while (*l &&
 234                         (cmp = text_cmp((*l)->name, s)) < 0)
 235                                 l = & (*l)->next;
 236                 if (cmp == 0)
 237                         return *l;
 238                 n = calloc(1, sizeof(*n));
 239                 n->name = s;
 240                 n->type = Unknown;
 241                 n->next = *l;
 242                 n->num = -1;
 243                 *l = n;
 244                 return n;
 245         }
 246
 247         static void symbols_init(struct grammar *g)
 248         {
 249                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 250                 int i;
 251                 for (i = 0; i < entries; i++) {
 252                         struct text t;
 253                         struct symbol *s;
 254                         t.txt = reserved_words[i].name;
 255                         t.len = strlen(t.txt);
 256                         s = sym_find(g, t);
 257                         s->type = Terminal;
 258                         s->num = reserved_words[i].num;
 259                         s->isspecial = 1;
 260                 }
 261         }
 262
 263 ### Data types and precedence.
 264
 265 Data type specification, precedence specification, and declaration of
 266 terminals are all introduced by a dollar sign at the start of the line.
 267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 268 precedence, if it is `TERM` then the line declares terminals without
 269 precedence, otherwise it specifies a data type.
 270
 271 The data type name is simply stored and applied to the head of all
 272 subsequent productions.  It must be the name of a structure optionally
 273 preceded by an asterisk which means a reference or "pointer".  So
 274 `$expression` maps to `struct expression` and `$*statement` maps to
 275 `struct statement *`.
 276
 277 Any productions given before the first data type declaration will have
 278 no data type associated with them and can carry no information.  In
 279 order to allow other non-terminals to have no type, the data type
 280 `$void` can be given.  This does *not* mean that `struct void` will be
 281 used, but rather than no type will be associated with subsequent
 282 non-terminals.
 283
 284 The precedence line must contain a list of symbols, either terminal
 285 symbols or virtual symbols.  It can only contain symbols that have not
 286 been seen yet, so precedence declaration must precede the productions
 287 that they affect.
 288
 289 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 290 carry precedence information but are not included in the grammar.  A
 291 production can declare that it inherits the precedence of a given
 292 virtual symbol.
 293
 294 This use of `$$` precludes it from being used as a symbol in the
 295 described language.  Two other symbols: `${` and `}$` are also
 296 unavailable.
 297
 298 Each new precedence line introduces a new precedence level and
 299 declares how it associates.  This level is stored in each symbol
 300 listed and may be inherited by any production which uses the symbol.  A
 301 production inherits from the last symbol which has a precedence.
 302
 303 The symbols on the first precedence line have the lowest precedence.
 304 Subsequent lines introduce symbols with higher precedence and so bind
 305 more tightly.
 306
 307 Note that a declaration line (or "dollar line") can start with either of
 308 two different marks: "$" or "$*".  The `dollar_line()` function defined
 309 here is told which was found in the `isref` argument.
 310
 311 ###### grammar fields
 312         struct text current_type;
 313         int type_isref;
 314         int prec_levels;
 315
 316 ###### declarations
 317         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 318         static const char *known[] = { "$$", "${", "}$" };
 319
 320 ###### functions
 321         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 322         {
 323                 struct token t = token_next(ts);
 324                 char *err;
 325                 enum assoc assoc;
 326                 int term = 0;
 327                 int found;
 328
 329                 if (t.num != TK_ident) {
 330                         err = "type or assoc expected after '$'";
 331                         goto abort;
 332                 }
 333                 if (text_is(t.txt, "LEFT"))
 334                         assoc = Left;
 335                 else if (text_is(t.txt, "RIGHT"))
 336                         assoc = Right;
 337                 else if (text_is(t.txt, "NON"))
 338                         assoc = Non;
 339                 else if (text_is(t.txt, "TERM")) {
 340                         term = 1;
 341                         g->terminals_declared = 1;
 342                 } else {
 343                         g->current_type = t.txt;
 344                         g->type_isref = isref;
 345                         if (text_is(t.txt, "void"))
 346                                 g->current_type.txt = NULL;
 347                         t = token_next(ts);
 348                         if (t.num != TK_newline) {
 349                                 err = "Extra tokens after type name";
 350                                 goto abort;
 351                         }
 352                         return NULL;
 353                 }
 354
 355                 if (isref) {
 356                         err = "$* cannot be followed by a precedence";
 357                         goto abort;
 358                 }
 359
 360                 // This is a precedence or TERM line, need some symbols.
 361                 found = 0;
 362                 g->prec_levels += 1;
 363                 t = token_next(ts);
 364                 while (t.num != TK_newline) {
 365                         enum symtype type = Terminal;
 366                         struct symbol *s;
 367                         if (t.num == TK_virtual) {
 368                                 type = Virtual;
 369                                 t = token_next(ts);
 370                                 if (t.num != TK_ident) {
 371                                         err = "$$ must be followed by a word";
 372                                         goto abort;
 373                                 }
 374                                 if (term) {
 375                                         err = "Virtual symbols not permitted on $TERM line";
 376                                         goto abort;
 377                                 }
 378                         } else if (t.num != TK_ident &&
 379                                    t.num != TK_mark) {
 380                                 err = "Illegal token in precedence line";
 381                                 goto abort;
 382                         }
 383                         s = sym_find(g, t.txt);
 384                         if (s->type != Unknown) {
 385                                 err = "Symbols in precedence/TERM line must not already be known.";
 386                                 goto abort;
 387                         }
 388                         s->type = type;
 389                         if (!term) {
 390                                 s->precedence = g->prec_levels;
 391                                 s->assoc = assoc;
 392                         }
 393                         found += 1;
 394                         t = token_next(ts);
 395                 }
 396                 if (found == 0) {
 397                         err = "No symbols given on precedence/TERM line";
 398                         goto abort;
 399                 }
 400                 return NULL;
 401         abort:
 402                 while (t.num != TK_newline && t.num != TK_eof)
 403                         t = token_next(ts);
 404                 return err;
 405         }
 406
 407 ### Productions
 408
 409 A production either starts with an identifier which is the head
 410 non-terminal, or a vertical bar (`|`) in which case this production
 411 uses the same head as the previous one.  The identifier must be
 412 followed by a `->` mark.  All productions for a given non-terminal must
 413 be grouped together with the `nonterminal ->` given only once.
 414
 415 After this a (possibly empty) sequence of identifiers and marks form
 416 the body of the production.  A virtual symbol may be given after the
 417 body by preceding it with `$$`.  If a virtual symbol is given, the
 418 precedence of the production is that for the virtual symbol.  If none
 419 is given, the precedence is inherited from the last symbol in the
 420 production which has a precedence specified.
 421
 422 After the optional precedence may come the `${` mark.  This indicates
 423 the start of a code fragment.  If present, this must be on the same
 424 line as the start of the production.
 425
 426 All of the text from the `${` through to the matching `}$` is
 427 collected and forms the code-fragment for the production.  It must all
 428 be in one `code_node` of the literate code.  The `}$` must be
 429 at the end of a line.
 430
 431 Text in the code fragment will undergo substitutions where `$N` or
 432 `$<N`,for some numeric `N` (or non-numeric indicator as described
 433 later), will be replaced with a variable holding the parse information
 434 for the particular symbol in the production.  `$0` is the head of the
 435 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 436 for a terminal symbol is `struct token`.  For a non-terminal, it is
 437 whatever has been declared for that symbol.  The `<` may be included and
 438 means that the value (usually a reference) is being moved out, so it
 439 will not automatically be freed.  The effect of using '<' is that the
 440 variable is cleared to all-zeros.
 441
 442 While building productions we will need to add to an array which needs to
 443 grow dynamically.
 444
 445 ###### functions
 446         static void array_add(void *varray, int *cnt, void *new)
 447         {
 448                 void ***array = varray;
 449                 int current = 0;
 450                 const int step = 8;
 451                 current = ((*cnt-1) | (step-1))+1;
 452                 if (*cnt == current) {
 453                         /* must grow */
 454                         current += step;
 455                         *array = realloc(*array, current * sizeof(void*));
 456                 }
 457                 (*array)[*cnt] = new;
 458                 (*cnt) += 1;
 459         }
 460
 461 Collecting the code fragment simply involves reading tokens until we
 462 hit the end token `}$`, and noting the character position of the start and
 463 the end.
 464
 465 ###### functions
 466         static struct text collect_code(struct token_state *state,
 467                                         struct token start)
 468         {
 469                 struct text code;
 470                 struct token t;
 471                 code.txt = start.txt.txt + start.txt.len;
 472                 do
 473                         t = token_next(state);
 474                 while (t.node == start.node &&
 475                        t.num != TK_close && t.num != TK_error &&
 476                        t.num != TK_eof);
 477                 if (t.num == TK_close && t.node == start.node)
 478                         code.len = t.txt.txt - code.txt;
 479                 else
 480                         code.txt = NULL;
 481                 return code;
 482         }
 483
 484 Now we have all the bits we need to parse a full production.
 485
 486 ###### includes
 487         #include <memory.h>
 488
 489 ###### grammar fields
 490         struct production **productions;
 491         int production_count;
 492
 493 ###### production fields
 494         struct symbol  *head;
 495         struct symbol **body;
 496         int             body_size;
 497         struct text     code;
 498         int             code_line;
 499
 500 ###### symbol fields
 501         int first_production;
 502
 503 ###### functions
 504         static char *parse_production(struct grammar *g,
 505                                       struct symbol *head,
 506                                       struct token_state *state)
 507         {
 508                 /* Head has already been parsed. */
 509                 struct token tk;
 510                 char *err;
 511                 struct production p, *pp;
 512
 513                 memset(&p, 0, sizeof(p));
 514                 p.head = head;
 515                 tk = token_next(state);
 516                 while (tk.num == TK_ident || tk.num == TK_mark) {
 517                         struct symbol *bs = sym_find(g, tk.txt);
 518                         if (bs->type == Virtual) {
 519                                 err = "Virtual symbol not permitted in production";
 520                                 goto abort;
 521                         }
 522                         if (bs->precedence) {
 523                                 p.precedence = bs->precedence;
 524                                 p.assoc = bs->assoc;
 525                         }
 526                         array_add(&p.body, &p.body_size, bs);
 527                         tk = token_next(state);
 528                 }
 529                 if (tk.num == TK_virtual) {
 530                         struct symbol *vs;
 531                         tk = token_next(state);
 532                         if (tk.num != TK_ident) {
 533                                 err = "word required after $$";
 534                                 goto abort;
 535                         }
 536                         vs = sym_find(g, tk.txt);
 537                         if (vs->precedence == 0) {
 538                                 err = "symbol after $$ must have precedence";
 539                                 goto abort;
 540                         } else {
 541                                 p.precedence = vs->precedence;
 542                                 p.assoc = vs->assoc;
 543                         }
 544                         tk = token_next(state);
 545                 }
 546                 if (tk.num == TK_open) {
 547                         p.code_line = tk.line;
 548                         p.code = collect_code(state, tk);
 549                         if (p.code.txt == NULL) {
 550                                 err = "code fragment not closed properly";
 551                                 goto abort;
 552                         }
 553                         tk = token_next(state);
 554                 }
 555
 556                 if (tk.num != TK_newline && tk.num != TK_eof) {
 557                         err = "stray tokens at end of line";
 558                         goto abort;
 559                 }
 560                 pp = malloc(sizeof(*pp));
 561                 *pp = p;
 562                 array_add(&g->productions, &g->production_count, pp);
 563                 return NULL;
 564         abort:
 565                 while (tk.num != TK_newline && tk.num != TK_eof)
 566                         tk = token_next(state);
 567                 free(p.body);
 568                 return err;
 569         }
 570
 571 With the ability to parse productions and dollar-lines, we have nearly all
 572 that we need to parse a grammar from a `code_node`.
 573
 574 The head of the first production will effectively be the `start` symbol
 575 of the grammar.  However it won't _actually_ be so.  Processing the
 576 grammar is greatly simplified if the real start symbol only has a single
 577 production, and expects `$eof` as the final terminal.  So when we find
 578 the first explicit production we insert an extra implicit production as
 579 production zero which looks like
 580
 581 ###### Example: production 0
 582         $start -> START $eof
 583
 584 where `START` is the first non-terminal given.
 585
 586 ###### create production zero
 587         struct production *p = calloc(1,sizeof(*p));
 588         struct text start = {"$start",6};
 589         struct text eof = {"$eof",4};
 590         struct text code = {"$0 = $<1;", 9};
 591         p->head = sym_find(g, start);
 592         p->head->type = Nonterminal;
 593         p->head->struct_name = g->current_type;
 594         p->head->isref = g->type_isref;
 595         if (g->current_type.txt)
 596                 p->code = code;
 597         array_add(&p->body, &p->body_size, head);
 598         array_add(&p->body, &p->body_size, sym_find(g, eof));
 599         p->head->first_production = g->production_count;
 600         array_add(&g->productions, &g->production_count, p);
 601
 602 Now we are ready to read in the grammar.  We ignore comments
 603 and strings so that the marks which introduce them can be
 604 used as terminals in the grammar.  We don't ignore numbers
 605 even though we don't allow them as that causes the scanner
 606 to produce errors that the parser is better positioned to handle.
 607 We also ignore indentation, but we expect and use newlines.
 608
 609 To allow comments in the grammar, we explicitly check for "//" as the
 610 first token on the line and those lines are skipped.  "//" can still be
 611 used as a terminal anywhere that a terminal is expected.
 612
 613 ###### grammar_read
 614         static struct grammar *grammar_read(struct code_node *code)
 615         {
 616                 struct token_config conf = {
 617                         .word_start = "",
 618                         .word_cont = "",
 619                         .words_marks = known,
 620                         .known_count = sizeof(known)/sizeof(known[0]),
 621                         .number_chars = "",
 622                         .ignored = (1 << TK_line_comment)
 623                                  | (1 << TK_block_comment)
 624                                  | (0 << TK_number)
 625                                  | (1 << TK_string)
 626                                  | (1 << TK_multi_string)
 627                                  | (1 << TK_in)
 628                                  | (1 << TK_out),
 629                 };
 630
 631                 struct token_state *state = token_open(code, &conf);
 632                 struct token tk;
 633                 struct symbol *head = NULL;
 634                 struct grammar *g;
 635                 char *err = NULL;
 636
 637                 g = calloc(1, sizeof(*g));
 638                 symbols_init(g);
 639
 640                 for (tk = token_next(state); tk.num != TK_eof;
 641                      tk = token_next(state)) {
 642                         if (tk.num == TK_newline)
 643                                 continue;
 644                         if (tk.num == TK_ident) {
 645                                 // new non-terminal
 646                                 head = sym_find(g, tk.txt);
 647                                 if (head->type == Nonterminal)
 648                                         err = "This non-terminal has already be used.";
 649                                 else if (head->type == Virtual)
 650                                         err = "Virtual symbol not permitted in head of production";
 651                                 else {
 652                                         head->type = Nonterminal;
 653                                         head->struct_name = g->current_type;
 654                                         head->isref = g->type_isref;
 655                                         if (g->production_count == 0) {
 656                                                 ## create production zero
 657                                         }
 658                                         head->first_production = g->production_count;
 659                                         tk = token_next(state);
 660                                         if (tk.num == TK_mark &&
 661                                             text_is(tk.txt, "->"))
 662                                                 err = parse_production(g, head, state);
 663                                         else
 664                                                 err = "'->' missing in production";
 665                                 }
 666                         } else if (tk.num == TK_mark
 667                                    && text_is(tk.txt, "|")) {
 668                                 // another production for same non-term
 669                                 if (head)
 670                                         err = parse_production(g, head, state);
 671                                 else
 672                                         err = "First production must have a head";
 673                         } else if (tk.num == TK_mark
 674                                    && text_is(tk.txt, "$")) {
 675                                 err = dollar_line(state, g, 0);
 676                         } else if (tk.num == TK_mark
 677                                    && text_is(tk.txt, "$*")) {
 678                                 err = dollar_line(state, g, 1);
 679                         } else if (tk.num == TK_mark
 680                                    && text_is(tk.txt, "//")) {
 681                                 while (tk.num != TK_newline &&
 682                                        tk.num != TK_eof)
 683                                         tk = token_next(state);
 684                         } else {
 685                                 err = "Unrecognised token at start of line.";
 686                         }
 687                         if (err)
 688                                 goto abort;
 689                 }
 690                 token_close(state);
 691
 692                 struct symbol *s;
 693                 for (s = g->syms; s; s = s->next) {
 694                         if (s->type != Unknown)
 695                                 continue;
 696                         if (!g->terminals_declared) {
 697                                 s->type = Terminal;
 698                                 continue;
 699                         }
 700                         err = "not declared";
 701                         fprintf(stderr, "Token %.*s not declared\n",
 702                                 s->name.len, s->name.txt);
 703                 }
 704                 if (err) {
 705                         free(g); // FIXME free content
 706                         g = NULL;
 707                 }
 708                 return g;
 709         abort:
 710                 fprintf(stderr, "Error at line %d: %s\n",
 711                         tk.line, err);
 712                 token_close(state);
 713                 free(g); // FIXME free content
 714                 return NULL;
 715         }
 716
 717 ## Analysing the grammar
 718
 719 The central task in analysing the grammar is to determine a set of
 720 states to drive the parsing process.  This will require finding various
 721 sets of symbols and of "items".  Some of these sets will need to attach
 722 information to each element in the set, so they are more like maps.
 723
 724 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 725 Many of these will be the same as each other.  To avoid memory wastage,
 726 and to simplify some comparisons of sets, these sets will be stored in a
 727 list of unique sets, each assigned a number.
 728
 729 Once we have the data structures in hand to manage these sets and lists,
 730 we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
 731 sets, and then create the item sets which define the various states.
 732
 733 ### Symbol sets.
 734
 735 Though we don't only store symbols in these sets, they are the main
 736 things we store, so they are called symbol sets or "symsets".
 737
 738 A symset has a size, an array of shorts, and an optional array of data
 739 values, which are also shorts.  If the array of data is not present, we
 740 store an impossible pointer, as `NULL` is used to indicate that no
 741 memory has been allocated yet;
 742
 743 ###### declarations
 744         struct symset {
 745                 short cnt;
 746                 unsigned short *syms, *data;
 747         };
 748         #define NO_DATA ((unsigned short *)1)
 749         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 750         const struct symset INIT_DATASET = { 0, NULL, NULL };
 751
 752 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 753 using an insertion sort.  We don't explicitly record the amount of
 754 allocated space as it can be derived directly from the current `cnt`
 755 using `((cnt - 1) | 7) + 1`.
 756
 757 ###### functions
 758         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 759         {
 760                 int i;
 761                 int current = ((s->cnt-1) | 7) + 1;
 762                 if (current == s->cnt) {
 763                         current += 8;
 764                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 765                         if (s->data != NO_DATA)
 766                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 767                 }
 768                 i = s->cnt;
 769                 while (i > 0 && s->syms[i-1] > key) {
 770                         s->syms[i] = s->syms[i-1];
 771                         if (s->data != NO_DATA)
 772                                 s->data[i] = s->data[i-1];
 773                         i--;
 774                 }
 775                 s->syms[i] = key;
 776                 if (s->data != NO_DATA)
 777                         s->data[i] = val;
 778                 s->cnt += 1;
 779         }
 780
 781 Finding a symbol (or item) in a `symset` uses a simple binary search.
 782 We return the index where the value was found (so data can be accessed),
 783 or `-1` to indicate failure.
 784
 785         static int symset_find(struct symset *ss, unsigned short key)
 786         {
 787                 int lo = 0;
 788                 int hi = ss->cnt;
 789
 790                 if (hi == 0)
 791                         return -1;
 792                 while (lo + 1 < hi) {
 793                         int mid = (lo + hi) / 2;
 794                         if (ss->syms[mid] <= key)
 795                                 lo = mid;
 796                         else
 797                                 hi = mid;
 798                 }
 799                 if (ss->syms[lo] == key)
 800                         return lo;
 801                 return -1;
 802         }
 803
 804 We will often want to form the union of two symsets.  When we do, we
 805 will often want to know if anything changed (as that might mean there is
 806 more work to do).  So `symset_union` reports whether anything was added
 807 to the first set.  We use a slow quadratic approach as these sets are
 808 rarely large.  If profiling shows this to be a problem it can be
 809 optimised later.
 810
 811         static int symset_union(struct symset *a, struct symset *b)
 812         {
 813                 int i;
 814                 int added = 0;
 815                 for (i = 0; i < b->cnt; i++)
 816                         if (symset_find(a, b->syms[i]) < 0) {
 817                                 unsigned short data = 0;
 818                                 if (b->data != NO_DATA)
 819                                         data = b->data[i];
 820                                 symset_add(a, b->syms[i], data);
 821                                 added++;
 822                         }
 823                 return added;
 824         }
 825
 826 And of course we must be able to free a symset.
 827
 828         static void symset_free(struct symset ss)
 829         {
 830                 free(ss.syms);
 831                 if (ss.data != NO_DATA)
 832                         free(ss.data);
 833         }
 834
 835 ### Symset Storage
 836
 837 Some symsets are simply stored somewhere appropriate in the `grammar`
 838 data structure, others needs a bit of indirection.  This applies
 839 particularly to "LA" sets.  These will be paired with "items" in an
 840 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 841 to be stored in the `data` field, so we need a mapping from a small
 842 (short) number to an LA `symset`.
 843
 844 As mentioned earlier this involves creating a list of unique symsets.
 845
 846 For now, we just use a linear list sorted by insertion.  A skiplist
 847 would be more efficient and may be added later.
 848
 849 ###### declarations
 850
 851         struct setlist {
 852                 struct symset ss;
 853                 int num;
 854                 struct setlist *next;
 855         };
 856
 857 ###### grammar fields
 858         struct setlist *sets;
 859         int nextset;
 860
 861 ###### functions
 862
 863         static int ss_cmp(struct symset a, struct symset b)
 864         {
 865                 int i;
 866                 int diff = a.cnt - b.cnt;
 867                 if (diff)
 868                         return diff;
 869                 for (i = 0; i < a.cnt; i++) {
 870                         diff = (int)a.syms[i] - (int)b.syms[i];
 871                         if (diff)
 872                                 return diff;
 873                 }
 874                 return 0;
 875         }
 876
 877         static int save_set(struct grammar *g, struct symset ss)
 878         {
 879                 struct setlist **sl = &g->sets;
 880                 int cmp = 1;
 881                 struct setlist *s;
 882
 883                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 884                         sl = & (*sl)->next;
 885                 if (cmp == 0) {
 886                         symset_free(ss);
 887                         return (*sl)->num;
 888                 }
 889
 890                 s = malloc(sizeof(*s));
 891                 s->ss = ss;
 892                 s->num = g->nextset;
 893                 g->nextset += 1;
 894                 s->next = *sl;
 895                 *sl = s;
 896                 return s->num;
 897         }
 898
 899 Finding a set by number is currently performed by a simple linear
 900 search.  If this turns out to hurt performance, we can store the sets in
 901 a dynamic array like the productions.
 902
 903         static struct symset set_find(struct grammar *g, int num)
 904         {
 905                 struct setlist *sl = g->sets;
 906                 while (sl && sl->num != num)
 907                         sl = sl->next;
 908                 return sl->ss;
 909         }
 910
 911 ### Setting `nullable`
 912
 913 We set `nullable` on the head symbol for any production for which all
 914 the body symbols (if any) are nullable.  As this is a recursive
 915 definition, any change in the `nullable` setting means that we need to
 916 re-evaluate where it needs to be set.
 917
 918 We simply loop around performing the same calculations until no more
 919 changes happen.
 920
 921 ###### symbol fields
 922         int nullable;
 923
 924 ###### functions
 925         static void set_nullable(struct grammar *g)
 926         {
 927                 int check_again = 1;
 928                 while (check_again) {
 929                         int p;
 930                         check_again = 0;
 931                         for (p = 0; p < g->production_count; p++) {
 932                                 struct production *pr = g->productions[p];
 933                                 int s;
 934
 935                                 if (pr->head->nullable)
 936                                         continue;
 937                                 for (s = 0; s < pr->body_size; s++)
 938                                         if (! pr->body[s]->nullable)
 939                                                 break;
 940                                 if (s == pr->body_size) {
 941                                         pr->head->nullable = 1;
 942                                         check_again = 1;
 943                                 }
 944                         }
 945                 }
 946         }
 947
 948 ### Building the `first` sets
 949
 950 When calculating what can follow a particular non-terminal, we will need
 951 to know what the "first" terminal in any subsequent non-terminal might
 952 be.  So we calculate the `first` set for every non-terminal and store
 953 them in an array.  We don't bother recording the "first" set for
 954 terminals as they are trivial.
 955
 956 As the "first" for one symbol might depend on the "first" of another, we
 957 repeat the calculations until no changes happen, just like with
 958 `set_nullable`.  This makes use of the fact that `symset_union` reports
 959 if any change happens.
 960
 961 The core of this, which finds the "first" of part of a production body,
 962 will be reused for computing the "follow" sets, so we split that out
 963 into a separate function, `add_first`, which also reports if it got all
 964 the way to the end of the production without finding a non-nullable
 965 symbol.
 966
 967 ###### grammar fields
 968         struct symset *first;
 969
 970 ###### functions
 971
 972         static int add_first(struct production *pr, int start,
 973                              struct symset *target, struct grammar *g,
 974                              int *to_end)
 975         {
 976                 int s;
 977                 int changed = 0;
 978                 for (s = start; s < pr->body_size; s++) {
 979                         struct symbol *bs = pr->body[s];
 980                         if (bs->type == Terminal) {
 981                                 if (symset_find(target, bs->num) < 0) {
 982                                         symset_add(target, bs->num, 0);
 983                                         changed = 1;
 984                                 }
 985                                 break;
 986                         } else if (symset_union(target, &g->first[bs->num]))
 987                                 changed = 1;
 988                         if (!bs->nullable)
 989                                 break;
 990                 }
 991                 if (to_end)
 992                         *to_end = (s == pr->body_size);
 993                 return changed;
 994         }
 995
 996         static void build_first(struct grammar *g)
 997         {
 998                 int check_again = 1;
 999                 int s;
1000                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1001                 for (s = 0; s < g->num_syms; s++)
1002                         g->first[s] = INIT_SYMSET;
1003
1004                 while (check_again) {
1005                         int p;
1006                         check_again = 0;
1007                         for (p = 0; p < g->production_count; p++) {
1008                                 struct production *pr = g->productions[p];
1009                                 struct symset *head = &g->first[pr->head->num];
1010
1011                                 if (add_first(pr, 0, head, g, NULL))
1012                                         check_again = 1;
1013                         }
1014                 }
1015         }
1016
1017 ### Building the `follow` sets.
1018
1019 There are two different situations when we will want to generate
1020 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1021 single "follow" set for each non-terminal in the grammar.  That is what
1022 is happening here.  If we are doing an LALR or LR analysis we will want
1023 to generate a separate "LA" set for each item.  We do that later in
1024 state generation.
1025
1026 There are two parts to generating a "follow" set.  Firstly we look at
1027 every place that any non-terminal appears in the body of any production,
1028 and we find the set of possible "first" symbols after there.  This is
1029 done using `add_first` above and only needs to be done once as the
1030 "first" sets are now stable and will not change.
1031
1032 ###### grammar fields
1033         struct symset *follow;
1034
1035 ###### follow code
1036
1037         for (p = 0; p < g->production_count; p++) {
1038                 struct production *pr = g->productions[p];
1039                 int b;
1040
1041                 for (b = 0; b < pr->body_size - 1; b++) {
1042                         struct symbol *bs = pr->body[b];
1043                         if (bs->type == Terminal)
1044                                 continue;
1045                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1046                 }
1047         }
1048
1049 The second part is to add the "follow" set of the head of a production
1050 to the "follow" sets of the final symbol in the production, and any
1051 other symbol which is followed only by `nullable` symbols.  As this
1052 depends on "follow" itself we need to repeatedly perform the process
1053 until no further changes happen.
1054
1055 Rather than 2 nested loops, one that repeats the whole process until
1056 there is no change, and one that iterates of the set of productions, we
1057 combine these two functions into a single loop.
1058
1059 ###### follow code
1060
1061         for (check_again = 0, p = 0;
1062              p < g->production_count;
1063              p < g->production_count-1
1064                 ? p++ : check_again ? (check_again = 0, p = 0)
1065                               : p++) {
1066                 struct production *pr = g->productions[p];
1067                 int b;
1068
1069                 for (b = pr->body_size - 1; b >= 0; b--) {
1070                         struct symbol *bs = pr->body[b];
1071                         if (bs->type == Terminal)
1072                                 break;
1073                         if (symset_union(&g->follow[bs->num],
1074                                          &g->follow[pr->head->num]))
1075                                 check_again = 1;
1076                         if (!bs->nullable)
1077                                 break;
1078                 }
1079         }
1080
1081 We now just need to create and initialise the `follow` list to get a
1082 complete function.
1083
1084 ###### functions
1085         static void build_follow(struct grammar *g)
1086         {
1087                 int s, check_again, p;
1088                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1089                 for (s = 0; s < g->num_syms; s++)
1090                         g->follow[s] = INIT_SYMSET;
1091                 ## follow code
1092         }
1093
1094 ### Building itemsets and states
1095
1096 There are three different levels of detail that can be chosen for
1097 building the itemsets and states for the LR grammar.  They are:
1098
1099 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1100    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1101    already computed,
1102 2. LALR(1) where we build look-ahead sets with each item and merge
1103    the LA sets when we find two paths to the same "kernel" set of items.
1104 3. LR(1) where different look-ahead for any item in the set means
1105    a different item set must be created.
1106
1107 ###### forward declarations
1108         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1109
1110 We need to be able to look through existing states to see if a newly
1111 generated state already exists.  For now we use a simple sorted linked
1112 list.
1113
1114 An item is a pair of numbers: the production number and the position of
1115 "DOT", which is an index into the body of the production.  As the
1116 numbers are not enormous we can combine them into a single "short" and
1117 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1118 production number (so 2000 productions with maximum of 31 symbols in the
1119 body).
1120
1121 Comparing the itemsets will be a little different to comparing symsets
1122 as we want to do the lookup after generating the "kernel" of an
1123 itemset, so we need to ignore the offset=zero items which are added during
1124 completion.
1125
1126 To facilitate this, we modify the "DOT" number so that "0" sorts to
1127 the end of the list in the symset, and then only compare items before
1128 the first "0".
1129
1130 ###### declarations
1131         static inline unsigned short item_num(int production, int dot)
1132         {
1133                 return production | ((31-dot) << 11);
1134         }
1135         static inline int item_prod(unsigned short item)
1136         {
1137                 return item & 0x7ff;
1138         }
1139         static inline int item_dot(unsigned short item)
1140         {
1141                 return (31-(item >> 11)) & 0x1f;
1142         }
1143
1144 For LR(1) analysis we need to compare not just the itemset in a state
1145 but also the LA sets.  As we assign each unique LA set a number, we
1146 can just compare the symset and the data values together.
1147
1148 ###### functions
1149         static int itemset_cmp(struct symset a, struct symset b,
1150                                enum grammar_type type)
1151         {
1152                 int i;
1153                 int av, bv;
1154
1155                 for (i = 0;
1156                      i < a.cnt && i < b.cnt &&
1157                      item_dot(a.syms[i]) > 0 &&
1158                      item_dot(b.syms[i]) > 0;
1159                      i++) {
1160                         int diff = a.syms[i] - b.syms[i];
1161                         if (diff)
1162                                 return diff;
1163                         if (type == LR1) {
1164                                 diff = a.data[i] - b.data[i];
1165                                 if (diff)
1166                                         return diff;
1167                         }
1168                 }
1169                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1170                         av = -1;
1171                 else
1172                         av = a.syms[i];
1173                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1174                         bv = -1;
1175                 else
1176                         bv = b.syms[i];
1177                 if (av - bv)
1178                         return av - bv;
1179                 if (type < LR1 || av == -1)
1180                         return 0;
1181                 return
1182                         a.data[i] - b.data[i];
1183         }
1184
1185 It will be helpful to know if an itemset has been "completed" or not,
1186 particularly for LALR where itemsets get merged, at which point they
1187 need to be consider for completion again.  So a `completed` flag is
1188 needed.
1189
1190 And now we can build the list of itemsets.  The lookup routine returns
1191 both a success flag and a pointer to where in the list an insert should
1192 happen, so we don't need to search a second time.
1193
1194 Each state will often have at most one reducible production, but
1195 occasionally will have more.  If there are more we will need a full
1196 action table (merged in the go to table).  If there is one, we record it
1197 in the state.  If none, we record that too.  These special values will
1198 be needed both in the generator and in the parser, so they need to be
1199 declared twice.
1200
1201 ###### exported types
1202         enum {
1203                 NONE_REDUCIBLE = -1,
1204                 MANY_REDUCIBLE = -2,
1205         };
1206 ###### declarations
1207         enum {
1208                 NONE_REDUCIBLE = -1,
1209                 MANY_REDUCIBLE = -2,
1210         };
1211         struct itemset {
1212                 struct itemset *next;
1213                 short state;
1214                 short reducible_prod; /* or NONE_REDUCIBLE or MANY_REDUCIBLE */
1215                 struct symset items;
1216                 struct symset go_to;
1217                 enum assoc assoc;
1218                 unsigned short precedence;
1219                 char completed;
1220         };
1221
1222 ###### grammar fields
1223         struct itemset *items;
1224         int states;
1225
1226 ###### functions
1227         static int itemset_find(struct grammar *g, struct itemset ***where,
1228                                 struct symset kernel, enum grammar_type type)
1229         {
1230                 struct itemset **ip;
1231
1232                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1233                         struct itemset *i = *ip;
1234                         int diff;
1235                         diff = itemset_cmp(i->items, kernel, type);
1236                         if (diff < 0)
1237                                 continue;
1238                         if (diff > 0)
1239                                 break;
1240                         /* found */
1241                         *where = ip;
1242                         return 1;
1243                 }
1244                 *where = ip;
1245                 return 0;
1246         }
1247
1248 Adding an itemset may require merging the LA sets if LALR analysis is
1249 happening.  If any new LA set adds any symbols that weren't in the old
1250 LA set, we clear the `completed` flag so that the dependants of this
1251 itemset will be recalculated and their LA sets updated.
1252
1253 `add_itemset` must consume the symsets it is passed, either by adding
1254 them to a data structure, of freeing them.
1255
1256         static int add_itemset(struct grammar *g, struct symset ss,
1257                                enum assoc assoc, unsigned short precedence,
1258                                enum grammar_type type)
1259         {
1260                 struct itemset **where, *is;
1261                 int i;
1262                 int found = itemset_find(g, &where, ss, type);
1263                 if (!found) {
1264                         is = calloc(1, sizeof(*is));
1265                         is->state = g->states;
1266                         g->states += 1;
1267                         is->items = ss;
1268                         is->assoc = assoc;
1269                         is->precedence = precedence;
1270                         is->next = *where;
1271                         is->go_to = INIT_DATASET;
1272                         *where = is;
1273                         return is->state;
1274                 }
1275                 is = *where;
1276                 if (type != LALR) {
1277                         symset_free(ss);
1278                         return is->state;
1279                 }
1280                 for (i = 0; i < ss.cnt; i++) {
1281                         struct symset temp = INIT_SYMSET, s;
1282                         if (ss.data[i] == is->items.data[i])
1283                                 continue;
1284                         s = set_find(g, is->items.data[i]);
1285                         symset_union(&temp, &s);
1286                         s = set_find(g, ss.data[i]);
1287                         if (symset_union(&temp, &s)) {
1288                                 is->items.data[i] = save_set(g, temp);
1289                                 is->completed = 0;
1290                         } else
1291                                 symset_free(temp);
1292                 }
1293                 symset_free(ss);
1294                 return is->state;
1295         }
1296
1297 #### The build
1298
1299 To build all the itemsets, we first insert the initial itemset made from
1300 production zero, complete each itemset, and then generate new itemsets
1301 from old until no new ones can be made.
1302
1303 Completing an itemset means finding all the items where "DOT" is
1304 followed by a nonterminal and adding "DOT=0" items for every production
1305 from that non-terminal - providing each item hasn't already been added.
1306 When we add such an item it might get inserted before the current
1307 one, so to make sure we process it we reset the loop counter to the
1308 start.
1309
1310 If LA sets are needed, the LA set for each new item is found using
1311 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1312 may be supplemented by the LA set for the item which produced the new
1313 item.
1314
1315 We also collect a set of all symbols which follow "DOT" (in `done`) as
1316 this is used in the next stage.
1317
1318 When itemsets are created we assign a precedence to the itemset from the
1319 complete item, if there is one.  We ignore the possibility of there
1320 being two and don't (currently) handle precedence in such grammars.
1321 When completing a grammar we ignore any item where DOT is followed by a
1322 terminal with a precedence lower than that for the itemset.  Unless the
1323 terminal has right associativity, we also ignore items where the
1324 terminal has the same precedence.  The result is that unwanted items are
1325 still in the itemset, but the terminal doesn't get into the go to set,
1326 so the item is ineffective.
1327
1328 ###### complete itemset
1329         for (i = 0; i < is->items.cnt; i++) {
1330                 int p = item_prod(is->items.syms[i]);
1331                 int bs = item_dot(is->items.syms[i]);
1332                 struct production *pr = g->productions[p];
1333                 int p2;
1334                 struct symbol *s;
1335                 struct symset LA = INIT_SYMSET;
1336                 unsigned short sn = 0;
1337
1338                 if (i == 0)
1339                         is->reducible_prod = NONE_REDUCIBLE;
1340                 if (bs == pr->body_size) {
1341                         if (is->reducible_prod == NONE_REDUCIBLE)
1342                                 is->reducible_prod = p;
1343                         else
1344                                 is->reducible_prod = MANY_REDUCIBLE;
1345                         continue;
1346                 }
1347                 s = pr->body[bs];
1348                 if (s->precedence && is->precedence &&
1349                     is->precedence > s->precedence)
1350                         /* This terminal has a low precedence and
1351                          * shouldn't be shifted
1352                          */
1353                         continue;
1354                 if (s->precedence && is->precedence &&
1355                     is->precedence == s->precedence && s->assoc != Right)
1356                         /* This terminal has a matching precedence and is
1357                          * not Right-associative, so we mustn't shift it.
1358                          */
1359                         continue;
1360                 if (symset_find(&done, s->num) < 0)
1361                         symset_add(&done, s->num, 0);
1362
1363                 if (s->type != Nonterminal)
1364                         continue;
1365                 check_again = 1;
1366                 if (type >= LALR) {
1367                         // Need the LA set.
1368                         int to_end;
1369                         add_first(pr, bs+1, &LA, g, &to_end);
1370                         if (to_end) {
1371                                 struct symset ss = set_find(g, is->items.data[i]);
1372                                 symset_union(&LA, &ss);
1373                         }
1374                         sn = save_set(g, LA);
1375                         LA = set_find(g, sn);
1376                 }
1377
1378                 /* Add productions for this symbol */
1379                 for (p2 = s->first_production;
1380                      p2 < g->production_count &&
1381                       g->productions[p2]->head == s;
1382                      p2++) {
1383                         int itm = item_num(p2, 0);
1384                         int pos = symset_find(&is->items, itm);
1385                         if (pos < 0) {
1386                                 symset_add(&is->items, itm, sn);
1387                                 /* Will have re-ordered, so start
1388                                  * from beginning again */
1389                                 i = -1;
1390                         } else if (type >= LALR) {
1391                                 struct symset ss = set_find(g, is->items.data[pos]);
1392                                 struct symset tmp = INIT_SYMSET;
1393                                 struct symset *la = &LA;
1394
1395                                 symset_union(&tmp, &ss);
1396                                 if (symset_union(&tmp, la)) {
1397                                         is->items.data[pos] = save_set(g, tmp);
1398                                         i = -1;
1399                                 } else
1400                                         symset_free(tmp);
1401                         }
1402                 }
1403         }
1404
1405 For each symbol we found (and placed in `done`) we collect all the items
1406 for which this symbol is next, and create a new itemset with "DOT"
1407 advanced over the symbol.  This is then added to the collection of
1408 itemsets (or merged with a pre-existing itemset).
1409
1410 ###### derive itemsets
1411         // Now we have a completed itemset, so we need to
1412         // compute all the 'next' itemsets and create them
1413         // if they don't exist.
1414         for (i = 0; i < done.cnt; i++) {
1415                 int j;
1416                 unsigned short state;
1417                 struct symbol *sym = g->symtab[done.syms[i]];
1418                 enum assoc assoc = Non;
1419                 unsigned short precedence = 0;
1420                 struct symset newitemset = INIT_SYMSET;
1421                 if (type >= LALR)
1422                         newitemset = INIT_DATASET;
1423
1424                 for (j = 0; j < is->items.cnt; j++) {
1425                         int itm = is->items.syms[j];
1426                         int p = item_prod(itm);
1427                         int bp = item_dot(itm);
1428                         struct production *pr = g->productions[p];
1429                         unsigned short la = 0;
1430                         int pos;
1431
1432                         if (bp == pr->body_size)
1433                                 continue;
1434                         if (pr->body[bp] != sym)
1435                                 continue;
1436
1437                         bp += 1;
1438                         if (type >= LALR)
1439                                 la = is->items.data[j];
1440                         if (bp == pr->body_size &&
1441                             pr->precedence > 0 &&
1442                             pr->precedence > precedence) {
1443                                 // new itemset is reducible and has a precedence.
1444                                 precedence = pr->precedence;
1445                                 assoc = pr->assoc;
1446                         }
1447                         pos = symset_find(&newitemset, item_num(p, bp));
1448
1449                         if (pos < 0)
1450                                 symset_add(&newitemset, item_num(p, bp), la);
1451                         else if (type >= LALR) {
1452                                 // Need to merge la set.
1453                                 int la2 = newitemset.data[pos];
1454                                 if (la != la2) {
1455                                         struct symset ss = set_find(g, la2);
1456                                         struct symset LA = INIT_SYMSET;
1457                                         symset_union(&LA, &ss);
1458                                         ss = set_find(g, la);
1459                                         if (symset_union(&LA, &ss))
1460                                                 newitemset.data[pos] = save_set(g, LA);
1461                                         else
1462                                                 symset_free(LA);
1463                                 }
1464                         }
1465                 }
1466                 state = add_itemset(g, newitemset, assoc, precedence, type);
1467                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1468                         symset_add(&is->go_to, done.syms[i], state);
1469         }
1470
1471 All that is left is to create the initial itemset from production zero, and
1472 with `TK_eof` as the LA set.
1473
1474 ###### functions
1475         static void build_itemsets(struct grammar *g, enum grammar_type type)
1476         {
1477                 struct symset first = INIT_SYMSET;
1478                 struct itemset *is;
1479                 int check_again;
1480                 unsigned short la = 0;
1481                 if (type >= LALR) {
1482                         // LA set just has eof
1483                         struct symset eof = INIT_SYMSET;
1484                         symset_add(&eof, TK_eof, 0);
1485                         la = save_set(g, eof);
1486                         first = INIT_DATASET;
1487                 }
1488                 // production 0, offset 0 (with no data)
1489                 symset_add(&first, item_num(0, 0), la);
1490                 add_itemset(g, first, Non, 0, type);
1491                 for (check_again = 0, is = g->items;
1492                      is;
1493                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1494                         int i;
1495                         struct symset done = INIT_SYMSET;
1496                         if (is->completed)
1497                                 continue;
1498                         is->completed = 1;
1499                         check_again = 1;
1500                         ## complete itemset
1501                         ## derive itemsets
1502                         symset_free(done);
1503                 }
1504         }
1505
1506 ### Completing the analysis.
1507
1508 The exact process of analysis depends on which level was requested.  For
1509 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1510 `LR1` we need first, but not follow.  For `SLR` we need both.
1511
1512 We don't build the "action" tables that you might expect as the parser
1513 doesn't use them.  They will be calculated to some extent if needed for
1514 a report.
1515
1516 Once we have built everything we allocate arrays for the two lists:
1517 symbols and itemsets.  This allows more efficient access during
1518 reporting.  The symbols are grouped as terminals, then non-terminals,
1519 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1520 Special terminals -- meaning just EOL -- are included with the
1521 non-terminals so that they are not expected by the scanner.
1522
1523 ###### grammar fields
1524         struct symbol **symtab;
1525         struct itemset **statetab;
1526         int first_nonterm;
1527
1528 ###### grammar_analyse
1529
1530         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1531         {
1532                 struct symbol *s;
1533                 struct itemset *is;
1534                 int snum = TK_reserved;
1535                 for (s = g->syms; s; s = s->next)
1536                         if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1537                                 s->num = snum;
1538                                 snum++;
1539                         }
1540                 g->first_nonterm = snum;
1541                 for (s = g->syms; s; s = s->next)
1542                         if (s->num < 0 && s->type != Virtual) {
1543                                 s->num = snum;
1544                                 snum++;
1545                         }
1546                 for (s = g->syms; s; s = s->next)
1547                         if (s->num < 0) {
1548                                 s->num = snum;
1549                                 snum++;
1550                         }
1551                 g->num_syms = snum;
1552                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1553                 for (s = g->syms; s; s = s->next)
1554                         g->symtab[s->num] = s;
1555
1556                 set_nullable(g);
1557                 if (type >= SLR)
1558                         build_first(g);
1559
1560                 if (type == SLR)
1561                         build_follow(g);
1562
1563                 build_itemsets(g, type);
1564
1565                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1566                 for (is = g->items; is ; is = is->next)
1567                         g->statetab[is->state] = is;
1568         }
1569
1570 ## Reporting on the Grammar
1571
1572 The purpose of the report is to give the grammar developer insight into
1573 how the grammar parser will work.  It is basically a structured dump of
1574 all the tables that have been generated, plus a description of any conflicts.
1575
1576 ###### grammar_report
1577         static int grammar_report(struct grammar *g, enum grammar_type type)
1578         {
1579                 report_symbols(g);
1580                 if (g->follow)
1581                         report_follow(g);
1582                 report_itemsets(g);
1583                 return report_conflicts(g, type);
1584         }
1585
1586 Firstly we have the complete list of symbols, together with the
1587 "FIRST" set if that was generated.  We add a mark to each symbol to
1588 show if it is nullable (`.`).
1589
1590 ###### functions
1591
1592         static void report_symbols(struct grammar *g)
1593         {
1594                 int n;
1595                 if (g->first)
1596                         printf("SYMBOLS + FIRST:\n");
1597                 else
1598                         printf("SYMBOLS:\n");
1599
1600                 for (n = 0; n < g->num_syms; n++) {
1601                         struct symbol *s = g->symtab[n];
1602                         if (!s)
1603                                 continue;
1604
1605                         printf(" %c%3d%c: ",
1606                                s->nullable ? '.':' ',
1607                                s->num, symtypes[s->type]);
1608                         prtxt(s->name);
1609                         if (s->precedence)
1610                                 printf(" (%d%s)", s->precedence,
1611                                        assoc_names[s->assoc]);
1612
1613                         if (g->first && s->type == Nonterminal) {
1614                                 int j;
1615                                 char c = ':';
1616                                 for (j = 0; j < g->first[n].cnt; j++) {
1617                                         printf("%c ", c);
1618                                         c = ',';
1619                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1620                                 }
1621                         }
1622                         printf("\n");
1623                 }
1624         }
1625
1626 Then we have the follow sets if they were computed.
1627
1628         static void report_follow(struct grammar *g)
1629         {
1630                 int n;
1631                 printf("FOLLOW:\n");
1632                 for (n = 0; n < g->num_syms; n++)
1633                         if (g->follow[n].cnt) {
1634                                 int j;
1635                                 char c = ':';
1636                                 printf("  ");
1637                                 prtxt(g->symtab[n]->name);
1638                                 for (j = 0; j < g->follow[n].cnt; j++) {
1639                                         printf("%c ", c);
1640                                         c = ',';
1641                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1642                                 }
1643                                 printf("\n");
1644                         }
1645         }
1646
1647 And finally the item sets.  These include the GO TO tables and, for
1648 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1649 it up a bit.  First the items, with production number and associativity.
1650
1651         static void report_item(struct grammar *g, int itm)
1652         {
1653                 int p = item_prod(itm);
1654                 int dot = item_dot(itm);
1655                 struct production *pr = g->productions[p];
1656                 int i;
1657
1658                 printf("    ");
1659                 prtxt(pr->head->name);
1660                 printf(" ->");
1661                 for (i = 0; i < pr->body_size; i++) {
1662                         printf(" %s", (dot == i ? ". ": ""));
1663                         prtxt(pr->body[i]->name);
1664                 }
1665                 if (dot == pr->body_size)
1666                         printf(" .");
1667                 printf(" [%d]", p);
1668                 if (pr->precedence && dot == pr->body_size)
1669                         printf(" (%d%s)", pr->precedence,
1670                                assoc_names[pr->assoc]);
1671                 if (dot < pr->body_size &&
1672                     pr->body[dot]->precedence) {
1673                         struct symbol *s = pr->body[dot];
1674                         printf(" [%d%s]", s->precedence,
1675                                assoc_names[s->assoc]);
1676                 }
1677                 printf("\n");
1678         }
1679
1680 The LA sets which are (possibly) reported with each item:
1681
1682         static void report_la(struct grammar *g, int lanum)
1683         {
1684                 struct symset la = set_find(g, lanum);
1685                 int i;
1686                 char c = ':';
1687
1688                 printf("        LOOK AHEAD(%d)", lanum);
1689                 for (i = 0; i < la.cnt; i++) {
1690                         printf("%c ", c);
1691                         c = ',';
1692                         prtxt(g->symtab[la.syms[i]]->name);
1693                 }
1694                 printf("\n");
1695         }
1696
1697 Then the go to sets:
1698
1699         static void report_goto(struct grammar *g, struct symset gt)
1700         {
1701                 int i;
1702                 printf("    GOTO:\n");
1703
1704                 for (i = 0; i < gt.cnt; i++) {
1705                         printf("      ");
1706                         prtxt(g->symtab[gt.syms[i]]->name);
1707                         printf(" -> %d\n", gt.data[i]);
1708                 }
1709         }
1710
1711 Now we can report all the item sets complete with items, LA sets, and GO TO.
1712
1713         static void report_itemsets(struct grammar *g)
1714         {
1715                 int s;
1716                 printf("ITEM SETS(%d)\n", g->states);
1717                 for (s = 0; s < g->states; s++) {
1718                         int j;
1719                         struct itemset *is = g->statetab[s];
1720                         printf("  Itemset %d:", s);
1721                         if (is->precedence)
1722                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1723                         printf("\n");
1724                         for (j = 0; j < is->items.cnt; j++) {
1725                                 report_item(g, is->items.syms[j]);
1726                                 if (is->items.data != NO_DATA)
1727                                         report_la(g, is->items.data[j]);
1728                         }
1729                         report_goto(g, is->go_to);
1730                 }
1731         }
1732
1733 ### Reporting conflicts
1734
1735 Conflict detection varies a lot among different analysis levels.  However
1736 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1737 LR1 are also similar as they have FOLLOW or LA sets.
1738
1739 ###### functions
1740
1741         ## conflict functions
1742
1743         static int report_conflicts(struct grammar *g, enum grammar_type type)
1744         {
1745                 int cnt = 0;
1746                 printf("Conflicts:\n");
1747                 if (type < SLR)
1748                         cnt = conflicts_lr0(g, type);
1749                 else
1750                         cnt = conflicts_slr(g, type);
1751                 if (cnt == 0)
1752                         printf(" - no conflicts\n");
1753                 return cnt;
1754         }
1755
1756 LR0 conflicts are any state which have both a reducible item and
1757 a shiftable item, or two reducible items.
1758
1759 LR05 conflicts only occur if two possible reductions exist,
1760 as shifts always over-ride reductions.
1761
1762 ###### conflict functions
1763         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1764         {
1765                 int i;
1766                 int cnt = 0;
1767                 for (i = 0; i < g->states; i++) {
1768                         struct itemset *is = g->statetab[i];
1769                         int last_reduce = -1;
1770                         int prev_reduce = -1;
1771                         int last_shift = -1;
1772                         int j;
1773                         if (!is)
1774                                 continue;
1775                         for (j = 0; j < is->items.cnt; j++) {
1776                                 int itm = is->items.syms[j];
1777                                 int p = item_prod(itm);
1778                                 int bp = item_dot(itm);
1779                                 struct production *pr = g->productions[p];
1780
1781                                 if (bp == pr->body_size) {
1782                                         prev_reduce = last_reduce;
1783                                         last_reduce = j;
1784                                         continue;
1785                                 }
1786                                 if (pr->body[bp]->type == Terminal)
1787                                         last_shift = j;
1788                         }
1789                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1790                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1791                                 report_item(g, is->items.syms[last_shift]);
1792                                 report_item(g, is->items.syms[last_reduce]);
1793                                 cnt++;
1794                         }
1795                         if (prev_reduce >= 0) {
1796                                 printf("  State %d has 2 (or more) reducible items\n", i);
1797                                 report_item(g, is->items.syms[prev_reduce]);
1798                                 report_item(g, is->items.syms[last_reduce]);
1799                                 cnt++;
1800                         }
1801                 }
1802                 return cnt;
1803         }
1804
1805 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1806 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1807 in the source of the look ahead set.
1808
1809 We build two datasets to reflect the "action" table: one which maps
1810 terminals to items where that terminal could be shifted and another
1811 which maps terminals to items that could be reduced when the terminal
1812 is in look-ahead.  We report when we get conflicts between the two.
1813
1814         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1815         {
1816                 int i;
1817                 int cnt = 0;
1818
1819                 for (i = 0; i < g->states; i++) {
1820                         struct itemset *is = g->statetab[i];
1821                         struct symset shifts = INIT_DATASET;
1822                         struct symset reduce = INIT_DATASET;
1823                         int j;
1824                         if (!is)
1825                                 continue;
1826                         /* First collect the shifts */
1827                         for (j = 0; j < is->items.cnt; j++) {
1828                                 unsigned short itm = is->items.syms[j];
1829                                 int p = item_prod(itm);
1830                                 int bp = item_dot(itm);
1831                                 struct production *pr = g->productions[p];
1832                                 struct symbol *s;
1833
1834                                 if (bp >= pr->body_size ||
1835                                     pr->body[bp]->type != Terminal)
1836                                         /* not shiftable */
1837                                         continue;
1838
1839                                 s = pr->body[bp];
1840                                 if (s->precedence && is->precedence)
1841                                         /* Precedence resolves this, so no conflict */
1842                                         continue;
1843
1844                                 if (symset_find(&shifts, s->num) < 0)
1845                                         symset_add(&shifts, s->num, itm);
1846                         }
1847                         /* Now look for reductions and conflicts */
1848                         for (j = 0; j < is->items.cnt; j++) {
1849                                 unsigned short itm = is->items.syms[j];
1850                                 int p = item_prod(itm);
1851                                 int bp = item_dot(itm);
1852                                 struct production *pr = g->productions[p];
1853
1854                                 if (bp < pr->body_size)
1855                                         continue;
1856                                 /* reducible */
1857                                 struct symset la;
1858                                 if (type == SLR)
1859                                         la = g->follow[pr->head->num];
1860                                 else
1861                                         la = set_find(g, is->items.data[j]);
1862                                 int k;
1863                                 for (k = 0; k < la.cnt; k++) {
1864                                         int pos = symset_find(&shifts, la.syms[k]);
1865                                         if (pos >= 0) {
1866                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1867                                                 cnt++;
1868                                                         prtxt(g->symtab[la.syms[k]]->name);
1869                                                 printf(":\n");
1870                                                 report_item(g, shifts.data[pos]);
1871                                                 report_item(g, itm);
1872                                         }
1873                                         pos = symset_find(&reduce, la.syms[k]);
1874                                         if (pos < 0) {
1875                                                 symset_add(&reduce, la.syms[k], itm);
1876                                                 continue;
1877                                         }
1878                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1879                                         prtxt(g->symtab[la.syms[k]]->name);
1880                                         printf(":\n");
1881                                         report_item(g, itm);
1882                                         report_item(g, reduce.data[pos]);
1883                                         cnt++;
1884                                 }
1885                         }
1886                         symset_free(shifts);
1887                         symset_free(reduce);
1888                 }
1889                 return cnt;
1890         }
1891
1892
1893 ## Generating the parser
1894
1895 The exported part of the parser is the `parse_XX` function, where the name
1896 `XX` is based on the name of the parser files.
1897
1898 This takes a `code_node`, a partially initialized `token_config`, and an
1899 optional `FILE` to send tracing to.  The `token_config` gets the list of
1900 known words added and then is used with the `code_node` to initialize the
1901 scanner.
1902
1903 `parse_XX` then calls the library function `parser_run` to actually
1904 complete the parse.  This needs the `states` table, the `reductions`
1905 table and functions to call the various pieces of code provided in the
1906 grammar file, so they are generated first.
1907
1908 ###### parser_generate
1909
1910         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1911                                struct code_node *pre_reduce)
1912         {
1913                 gen_known(f, g);
1914                 gen_non_term(f, g);
1915                 gen_action(f, g);
1916                 gen_goto(f, g);
1917                 gen_states(f, g);
1918                 gen_reductions(f, g);
1919                 gen_reduce(f, g, file, pre_reduce);
1920                 gen_free(f, g);
1921
1922                 fprintf(f, "#line 0 \"gen_parser\"\n");
1923                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1924                         name);
1925                 fprintf(f, "{\n");
1926                 fprintf(f, "\tstruct token_state *tokens;\n");
1927                 fprintf(f, "\tconfig->words_marks = known;\n");
1928                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1929                 fprintf(f, "\ttokens = token_open(code, config);\n");
1930                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n");
1931                 fprintf(f, "\ttoken_close(tokens);\n");
1932                 fprintf(f, "\treturn rv;\n");
1933                 fprintf(f, "}\n\n");
1934         }
1935
1936 ### Known words table
1937
1938 The known words table is simply an array of terminal symbols.
1939 The table of nonterminals used for tracing is a similar array.
1940
1941 ###### functions
1942
1943         static void gen_known(FILE *f, struct grammar *g)
1944         {
1945                 int i;
1946                 fprintf(f, "#line 0 \"gen_known\"\n");
1947                 fprintf(f, "static const char *known[] = {\n");
1948                 for (i = TK_reserved;
1949                      i < g->num_syms && g->symtab[i]->type == Terminal;
1950                      i++)
1951                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1952                                 g->symtab[i]->name.txt);
1953                 fprintf(f, "};\n\n");
1954         }
1955
1956         static void gen_non_term(FILE *f, struct grammar *g)
1957         {
1958                 int i;
1959                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1960                 fprintf(f, "static const char *non_term[] = {\n");
1961                 for (i = TK_reserved;
1962                      i < g->num_syms;
1963                      i++)
1964                         if (g->symtab[i]->type == Nonterminal ||
1965                             g->symtab[i]->isspecial)
1966                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1967                                         g->symtab[i]->name.txt);
1968                 fprintf(f, "};\n\n");
1969         }
1970
1971 ### The Action table
1972
1973 For this parser we do not have a separate action table.  The action
1974 table normally maps terminals to one of the actions SHIFT, REDUCE(p), or
1975 ACCEPT.  We can find which terminals to SHIFT via a lookup in the go to
1976 table, and most states have at most one reducible production and in that
1977 case to effect that reduction for any non-shiftable terminal in the
1978 look-ahead.
1979
1980 However, with SLR, LALR, and canonical-LR grammars, there may
1981 occasionally be multiple possible reductions which we choose between
1982 based on the look-ahead.  To handle this possibility we allow reductions
1983 to be stored in the go to table.  An extra flag indicates if an entry is
1984 a state to go to, or a production to reduce.  We add the entries with
1985 that flag here, if necessary.
1986
1987 ###### functions
1988         #define IS_REDUCTION (0x8000)
1989
1990         static void gen_action(FILE *f, struct grammar *g)
1991         {
1992                 int i, j, k;
1993                 for (i = 0; i < g->states; i++) {
1994                         struct itemset *is = g->statetab[i];
1995                         struct symset *gt = &is->go_to;
1996
1997                         if (is->reducible_prod != MANY_REDUCIBLE)
1998                                 continue;
1999                         for (j = 0; j < is->items.cnt; j++) {
2000                                 int itm = is->items.syms[j];
2001                                 int p = item_prod(itm);
2002                                 int bp = item_dot(itm);
2003                                 struct production *pr = g->productions[p];
2004                                 struct symset la;
2005                                 if (bp != pr->body_size)
2006                                         continue;
2007                                 /* This is a reducible item */
2008                                 if (g->follow)
2009                                         la = g->follow[pr->head->num];
2010                                 else
2011                                         la = set_find(g, is->items.data[j]);
2012                                 for (k = 0 ; k < la.cnt; k++)
2013                                         symset_add(gt, la.syms[k], p | IS_REDUCTION);
2014                         }
2015                 }
2016         }
2017
2018 ### States, reductions, and the go to tables.
2019
2020 For each state we record the go to table and the reducible production if
2021 there is one, the details of which are in a separate table of
2022 reductions.  Some of the details of the reducible production are stored
2023 in the `do_reduce` function to come later.  In the go to table we store
2024 the production number and in the reductions table: the body size (useful
2025 for stack management), the resulting symbol (useful for knowing how to
2026 free data later), and the size of the resulting asn object (useful for
2027 preallocation space.
2028
2029 The go to table is stored in a simple array of `sym` and corresponding
2030 `state`.
2031
2032 ###### exported types
2033
2034         struct lookup {
2035                 short sym;
2036                 short state_or_reduction:15;
2037                 unsigned short is_reduction:1;
2038         };
2039         struct reduction {
2040                 short size;
2041                 short sym;
2042                 short result_size;
2043         };
2044         struct state {
2045                 short reduce_prod;      // or NONE_REDUCIBLE or MANY_REDUCIBLE
2046                 short go_to_cnt;
2047                 const struct lookup * go_to; // includes reductions
2048         };
2049
2050 ###### functions
2051
2052         static void gen_goto(FILE *f, struct grammar *g)
2053         {
2054                 int i;
2055                 fprintf(f, "#line 0 \"gen_goto\"\n");
2056                 for (i = 0; i < g->states; i++) {
2057                         struct symset gt = g->statetab[i]->go_to;
2058                         int j;
2059
2060                         if (gt.cnt == 0)
2061                                 continue;
2062                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2063                                 i);
2064                         for (j = 0; j < gt.cnt; j++) {
2065                                 if (gt.data[j] & IS_REDUCTION)
2066                                         fprintf(f, "\t{ %d, %d, 1 },\n",
2067                                                 gt.syms[j], gt.data[j]&~IS_REDUCTION);
2068                                 else
2069                                         fprintf(f, "\t{ %d, %d, 0 },\n",
2070                                                 gt.syms[j], gt.data[j]);
2071                         }
2072                         fprintf(f, "};\n");
2073                 }
2074         }
2075
2076         static void gen_reductions(FILE *f, struct grammar *g)
2077         {
2078                 int i;
2079                 fprintf(f, "#line 0 \"gen_reductions\"\n");
2080                 fprintf(f, "static const struct reduction reductions[] = {\n");
2081                 for (i = 0; i < g->production_count; i++) {
2082                         struct production *pr = g->productions[i];
2083                         struct symbol *hd = pr->head;
2084                         fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num);
2085                         if (hd->struct_name.txt == NULL)
2086                                 fprintf(f, "0 },\n");
2087                         else
2088                                 fprintf(f, "sizeof(struct %.*s%s) },\n",
2089                                         hd->struct_name.len,
2090                                         hd->struct_name.txt,
2091                                         hd->isref ? "*" : "");
2092                 }
2093                 fprintf(f, "};\n\n");
2094         }
2095
2096         static void gen_states(FILE *f, struct grammar *g)
2097         {
2098                 int i;
2099                 fprintf(f, "#line 0 \"gen_states\"\n");
2100                 fprintf(f, "static const struct state states[] = {\n");
2101                 for (i = 0; i < g->states; i++) {
2102                         struct itemset *is = g->statetab[i];
2103                         int prod = is->reducible_prod;
2104
2105                         if (is->go_to.cnt)
2106                                 fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
2107                                         i, prod, is->go_to.cnt, i);
2108                         else
2109                                 fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
2110                 }
2111                 fprintf(f, "};\n\n");
2112         }
2113
2114 ### The `do_reduce` function and the code
2115
2116 When the parser engine decides to reduce a production, it calls
2117 `do_reduce` which runs the code that was included with the production,
2118 if any.
2119
2120 This code needs to be able to store data somewhere so we record the size
2121 of the data expected with each state so it can be allocated before
2122 `do_reduce` is called.
2123
2124 In order for the code to access "global" context, we pass in the
2125 "config" pointer that was passed to the parser function.  If the `struct
2126 token_config` is embedded in some larger structure, the reducing code
2127 can access the larger structure using pointer manipulation.  Performing
2128 that pointer manipulation, and any other common code or variables for
2129 all reduce actions, can be provided in code node called "reduce" which
2130 is passed around in `pre_reduce` which you might have already noticed.
2131
2132 The code fragments require translation when written out.  Any `$N` needs
2133 to be converted to a reference either to that buffer (if `$0`) or to the
2134 structure returned by a previous reduction.  These pointers need to be
2135 cast to the appropriate type for each access.  All this is handled in
2136 `gen_code`.
2137
2138 `gen_code` also allows symbol references to contain a '`<`' as in
2139 '`$<2`'.  This is particularly useful for references (or pointers), but
2140 can be used with structures too.  The `<` implies that the value is
2141 being moved out, so the object will not be automatically freed.  It is
2142 equivalent to assigning `NULL` to the pointer or filling a structure
2143 with zeros.
2144
2145 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2146 and possibly a number.  A number by itself (other than zero) selects a
2147 symbol from the body of the production.  A sequence of letters selects
2148 the shortest symbol in the body which contains those letters in the
2149 given order.  If a number follows the letters, then a later occurrence
2150 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2151 attached to the second occurrence of the shortest symbol which contains
2152 an `A` followed by a `B`.  If there is no unique shortest system, or if
2153 the number given is too large, then the symbol reference is not
2154 transformed, and will cause an error when the code is compiled.
2155
2156 ###### functions
2157
2158         static int textchr(struct text t, char c, int s)
2159         {
2160                 int i;
2161                 for (i = s; i < t.len; i++)
2162                         if (t.txt[i] == c)
2163                                 return i;
2164                 return -1;
2165         }
2166
2167         static int subseq_match(char *seq, int slen, struct text name)
2168         {
2169                 int st = 0;
2170                 while (slen > 0) {
2171                         st = textchr(name, *seq, st);
2172                         if (st < 0)
2173                                 return 0;
2174                         slen -= 1;
2175                         seq += 1;
2176                         st += 1;
2177                 }
2178                 return 1;
2179         }
2180
2181         static int choose_sym(char **namep, int len, struct production *p)
2182         {
2183                 char *name = *namep;
2184                 char *nam = name;
2185                 int namlen;
2186                 int n = 0;
2187                 int i, s, slen;
2188                 char c;
2189
2190                 c = *name;
2191                 while (len > 0 &&
2192                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2193                         name += 1;
2194                         len -= 1;
2195                         c = *name;
2196                 }
2197                 namlen = name-nam;
2198                 while (len > 0 && (c >= '0' && c <= '9')) {
2199                         name += 1;
2200                         len -= 1;
2201                         n = n * 10 + (c - '0');
2202                         c = *name;
2203                 }
2204                 if (namlen == 0) {
2205                         if (name == *namep || n > p->body_size)
2206                                 return -1;
2207                         *namep = name;
2208                         return n;
2209                 }
2210                 slen = 0; s = -1;
2211                 for (i = 0; i < p->body_size; i++) {
2212                         if (!subseq_match(nam, namlen, p->body[i]->name))
2213                                 continue;
2214                         if (slen == 0 || p->body[i]->name.len < slen) {
2215                                 s = i;
2216                                 slen = p->body[i]->name.len;
2217                         }
2218                         if (s >= 0 && p->body[i] != p->body[s] &&
2219                             p->body[i]->name.len == p->body[s]->name.len)
2220                                 /* not unique, so s cannot be used */
2221                                 s = -1;
2222                 }
2223                 if (s < 0)
2224                         return -1;
2225                 if (n == 0)
2226                         n = 1;
2227                 for (i = 0; i < p->body_size; i++)
2228                         if (p->body[i] == p->body[s]) {
2229                                 n -= 1;
2230                                 if (n == 0)
2231                                         break;
2232                         }
2233                 if (n > 0 || i == p->body_size)
2234                         return -1;
2235                 *namep = name;
2236                 return i + 1;
2237         }
2238
2239         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2240         {
2241                 char *c;
2242                 char *used = calloc(1, p->body_size);
2243                 int i;
2244
2245                 fprintf(f, "\t\t\t");
2246                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2247                         int n;
2248                         int use = 0;
2249                         if (*c != '$') {
2250                                 fputc(*c, f);
2251                                 if (*c == '\n')
2252                                         fputs("\t\t\t", f);
2253                                 continue;
2254                         }
2255                         c++;
2256                         if (*c == '<') {
2257                                 use = 1;
2258                                 c++;
2259                         }
2260                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2261                         if (n < 0) {
2262                                 fputc('$', f);
2263                                 if (use)
2264                                         fputc('<', f);
2265                                 fputc(*c, f);
2266                                 continue;
2267                         }
2268                         if (n == 0)
2269                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2270                                         p->head->struct_name.len,
2271                                         p->head->struct_name.txt,
2272                                         p->head->isref ? "*":"");
2273                         else if (p->body[n-1]->type == Terminal)
2274                                 fprintf(f, "(*(struct token *)body[%d])",
2275                                         n-1);
2276                         else if (p->body[n-1]->struct_name.txt == NULL)
2277                                 fprintf(f, "$%d", n);
2278                         else {
2279                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2280                                         p->body[n-1]->struct_name.len,
2281                                         p->body[n-1]->struct_name.txt,
2282                                         p->body[n-1]->isref ? "*":"", n-1);
2283                                 used[n-1] = use;
2284                         }
2285                         c -= 1;
2286                 }
2287                 for (i = 0; i < p->body_size; i++) {
2288                         if (p->body[i]->struct_name.txt &&
2289                             used[i]) {
2290                                 // assume this has been copied out
2291                                 if (p->body[i]->isref)
2292                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;", i);
2293                                 else
2294                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2295                         }
2296                 }
2297                 fputs("\n", f);
2298                 free(used);
2299         }
2300
2301 ###### functions
2302
2303         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2304                                struct code_node *pre_reduce)
2305         {
2306                 int i;
2307                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2308                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2309                 fprintf(f, "{\n");
2310                 fprintf(f, "\tint ret_size = 0;\n");
2311                 if (pre_reduce)
2312                         code_node_print(f, pre_reduce, file);
2313
2314                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2315                 fprintf(f, "\tswitch(prod) {\n");
2316                 for (i = 0; i < g->production_count; i++) {
2317                         struct production *p = g->productions[i];
2318                         fprintf(f, "\tcase %d:\n", i);
2319
2320                         if (p->code.txt) {
2321                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2322                                 gen_code(p, f, g);
2323                                 fprintf(f, "#line 7 \"gen_reduce\"\n");
2324                         }
2325
2326                         if (p->head->struct_name.txt)
2327                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2328                                         p->head->struct_name.len,
2329                                         p->head->struct_name.txt,
2330                                         p->head->isref ? "*":"");
2331
2332                         fprintf(f, "\t\tbreak;\n");
2333                 }
2334                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2335         }
2336
2337 ### `do_free`
2338
2339 As each non-terminal can potentially cause a different type of data
2340 structure to be allocated and filled in, we need to be able to free it when
2341 done.
2342
2343 It is particularly important to have fine control over freeing during error
2344 recovery where individual stack frames might need to be freed.
2345
2346 For this, the grammar author is required to defined a `free_XX` function for
2347 each structure that is used by a non-terminal.  `do_free` will call whichever
2348 is appropriate given a symbol number, and will call `free` (as is
2349 appropriate for tokens) on any terminal symbol.
2350
2351 ###### functions
2352
2353         static void gen_free(FILE *f, struct grammar *g)
2354         {
2355                 int i;
2356
2357                 fprintf(f, "#line 0 \"gen_free\"\n");
2358                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2359                 fprintf(f, "{\n");
2360                 fprintf(f, "\tif (!asn) return;\n");
2361                 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2362                 /* Need to handle special terminals too */
2363                 for (i = 0; i < g->num_syms; i++) {
2364                         struct symbol *s = g->symtab[i];
2365                         if (i >= g->first_nonterm && s->type == Terminal &&
2366                             s->isspecial)
2367                                 fprintf(f, " || sym == %d", s->num);
2368                 }
2369                 fprintf(f, ") {\n");
2370                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2371                 fprintf(f, "\tswitch(sym) {\n");
2372
2373                 for (i = 0; i < g->num_syms; i++) {
2374                         struct symbol *s = g->symtab[i];
2375                         if (!s ||
2376                             s->type != Nonterminal ||
2377                             s->struct_name.txt == NULL)
2378                                 continue;
2379
2380                         fprintf(f, "\tcase %d:\n", s->num);
2381                         if (s->isref) {
2382                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2383                                         s->struct_name.len,
2384                                         s->struct_name.txt);
2385                                 fprintf(f, "\t\tfree(asn);\n");
2386                         } else
2387                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2388                                         s->struct_name.len,
2389                                         s->struct_name.txt);
2390                         fprintf(f, "\t\tbreak;\n");
2391                 }
2392                 fprintf(f, "\t}\n}\n\n");
2393         }
2394
2395 ## The main routine.
2396
2397 There are three key parts to the "main" function of parsergen: processing
2398 the arguments, loading the grammar file, and dealing with the grammar.
2399
2400 ### Argument processing.
2401
2402 Command line options allow the selection of analysis mode, name of output
2403 file, and whether or not a report should be generated.  By default we create
2404 a report only if no code output was requested.
2405
2406 The `parse_XX` function name uses the basename of the output file which
2407 should not have a suffix (`.c`).  `.c` is added to the given name for the
2408 code, and `.h` is added for the header (if header text is specifed in the
2409 grammar file).
2410
2411 ###### includes
2412         #include <getopt.h>
2413
2414 ###### declarations
2415         static const struct option long_options[] = {
2416                 { "LR0",        0, NULL, '0' },
2417                 { "LR05",       0, NULL, '5' },
2418                 { "SLR",        0, NULL, 'S' },
2419                 { "LALR",       0, NULL, 'L' },
2420                 { "LR1",        0, NULL, '1' },
2421                 { "tag",        1, NULL, 't' },
2422                 { "report",     0, NULL, 'R' },
2423                 { "output",     1, NULL, 'o' },
2424                 { NULL,         0, NULL, 0   }
2425         };
2426         const char *options = "05SL1t:Ro:";
2427
2428 ###### process arguments
2429         int opt;
2430         char *outfile = NULL;
2431         char *infile;
2432         char *name;
2433         char *tag = NULL;
2434         int report = 1;
2435         enum grammar_type type = LR05;
2436         while ((opt = getopt_long(argc, argv, options,
2437                                   long_options, NULL)) != -1) {
2438                 switch(opt) {
2439                 case '0':
2440                         type = LR0; break;
2441                 case '5':
2442                         type = LR05; break;
2443                 case 'S':
2444                         type = SLR; break;
2445                 case 'L':
2446                         type = LALR; break;
2447                 case '1':
2448                         type = LR1; break;
2449                 case 'R':
2450                         report = 2; break;
2451                 case 'o':
2452                         outfile = optarg; break;
2453                 case 't':
2454                         tag = optarg; break;
2455                 default:
2456                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2457                         exit(1);
2458                 }
2459         }
2460         if (optind < argc)
2461                 infile = argv[optind++];
2462         else {
2463                 fprintf(stderr, "No input file given\n");
2464                 exit(1);
2465         }
2466         if (outfile && report == 1)
2467                 report = 0;
2468         name = outfile;
2469         if (name && strchr(name, '/'))
2470                 name = strrchr(name, '/')+1;
2471
2472         if (optind < argc) {
2473                 fprintf(stderr, "Excess command line arguments\n");
2474                 exit(1);
2475         }
2476
2477 ### Loading the grammar file
2478
2479 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2480 map it.
2481
2482 Once we have extracted the code (with `mdcode`) we expect to find three
2483 or four sections: header, code, grammar, reduce.
2484 Anything else that is not excluded by the `--tag` option is an error.
2485
2486 "header", "code", and "reduce" are optional, though it is hard to build
2487 a working parser without the first two.  "grammar" must be provided.
2488
2489 ###### includes
2490         #include <fcntl.h>
2491         #include <sys/mman.h>
2492         #include <errno.h>
2493
2494 ###### functions
2495         static int errs;
2496         static void pr_err(char *msg)
2497         {
2498                 errs++;
2499                 fprintf(stderr, "%s\n", msg);
2500         }
2501
2502 ###### load file
2503         struct section *table;
2504         int fd;
2505         int len;
2506         char *file;
2507         fd = open(infile, O_RDONLY);
2508         if (fd < 0) {
2509                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2510                         infile, strerror(errno));
2511                 exit(1);
2512         }
2513         len = lseek(fd, 0, 2);
2514         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2515         table = code_extract(file, file+len, pr_err);
2516
2517         struct code_node *hdr = NULL;
2518         struct code_node *code = NULL;
2519         struct code_node *gram = NULL;
2520         struct code_node *pre_reduce = NULL;
2521         for (s = table; s; s = s->next) {
2522                 struct text sec = s->section;
2523                 if (tag && !strip_tag(&sec, tag))
2524                         continue;
2525                 if (text_is(sec, "header"))
2526                         hdr = s->code;
2527                 else if (text_is(sec, "code"))
2528                         code = s->code;
2529                 else if (text_is(sec, "grammar"))
2530                         gram = s->code;
2531                 else if (text_is(sec, "reduce"))
2532                         pre_reduce = s->code;
2533                 else {
2534                         fprintf(stderr, "Unknown content section: %.*s\n",
2535                                 s->section.len, s->section.txt);
2536                         rv |= 2;
2537                 }
2538         }
2539
2540 ### Processing the input
2541
2542 As we need to append an extention to a filename and then open it for
2543 writing, and we need to do this twice, it helps to have a separate function.
2544
2545 ###### functions
2546
2547         static FILE *open_ext(char *base, char *ext)
2548         {
2549                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2550                 FILE *f;
2551                 strcat(strcpy(fn, base), ext);
2552                 f = fopen(fn, "w");
2553                 free(fn);
2554                 return f;
2555         }
2556
2557 If we can read the grammar, then we analyse and optionally report on it.  If
2558 the report finds conflicts we will exit with an error status.
2559
2560 ###### process input
2561         struct grammar *g = NULL;
2562         if (gram == NULL) {
2563                 fprintf(stderr, "No grammar section provided\n");
2564                 rv |= 2;
2565         } else {
2566                 g = grammar_read(gram);
2567                 if (!g) {
2568                         fprintf(stderr, "Failure to parse grammar\n");
2569                         rv |= 2;
2570                 }
2571         }
2572         if (g) {
2573                 grammar_analyse(g, type);
2574                 if (report)
2575                         if (grammar_report(g, type))
2576                                 rv |= 1;
2577         }
2578
2579 If a "headers" section is defined, we write it out and include a declaration
2580 for the `parse_XX` function so it can be used from separate code.
2581
2582         if (rv == 0 && hdr && outfile) {
2583                 FILE *f = open_ext(outfile, ".h");
2584                 if (f) {
2585                         code_node_print(f, hdr, infile);
2586                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2587                                 name);
2588                         fclose(f);
2589                 } else {
2590                         fprintf(stderr, "Cannot create %s.h\n",
2591                                 outfile);
2592                         rv |= 4;
2593                 }
2594         }
2595
2596 And if all goes well and an output file was provided, we create the `.c`
2597 file with the code section (if any) and the parser tables and function.
2598
2599         if (rv == 0 && outfile) {
2600                 FILE *f = open_ext(outfile, ".c");
2601                 if (f) {
2602                         if (code)
2603                                 code_node_print(f, code, infile);
2604                         gen_parser(f, g, infile, name, pre_reduce);
2605                         fclose(f);
2606                 } else {
2607                         fprintf(stderr, "Cannot create %s.c\n",
2608                                 outfile);
2609                         rv |= 4;
2610                 }
2611         }
2612
2613 And that about wraps it up.  We need to set the locale so that UTF-8 is
2614 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2615
2616 ###### File: parsergen.mk
2617         parsergen : parsergen.o libscanner.o libmdcode.o
2618                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2619
2620 ###### includes
2621         #include <locale.h>
2622
2623 ###### main
2624
2625         int main(int argc, char *argv[])
2626         {
2627                 struct section *s;
2628                 int rv = 0;
2629
2630                 setlocale(LC_ALL,"");
2631
2632                 ## process arguments
2633                 ## load file
2634                 ## process input
2635
2636                 return rv;
2637         }
2638
2639 ## The SHIFT/REDUCE parser
2640
2641 Having analysed the grammar and generated all the tables, we only need
2642 the shift/reduce engine to bring it all together.
2643
2644 ### Go to table lookup
2645
2646 The parser generator has nicely provided us with go to tables sorted by
2647 symbol number.  We need a binary search function to find a symbol in the
2648 table.
2649
2650 ###### parser functions
2651
2652         static int search(const struct state *l, int sym, int reduction)
2653         {
2654                 int lo = 0;
2655                 int hi = l->go_to_cnt;
2656
2657                 if (hi == 0)
2658                         return -1;
2659                 while (lo + 1 < hi) {
2660                         int mid = (lo + hi) / 2;
2661                         if (l->go_to[mid].sym <= sym)
2662                                 lo = mid;
2663                         else
2664                                 hi = mid;
2665                 }
2666                 if (l->go_to[lo].sym != sym)
2667                         return -1;
2668                 if (!reduction == !l->go_to[lo].is_reduction)
2669                         return l->go_to[lo].state_or_reduction;
2670                 return -1;
2671         }
2672
2673 ### Memory allocation
2674
2675 The `scanner` returns tokens in a local variable - we want them in allocated
2676 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2677 make an allocated copy as required.
2678
2679 ###### parser includes
2680         #include <memory.h>
2681
2682 ###### parser functions
2683
2684         static struct token *tok_copy(struct token tk)
2685         {
2686                 struct token *new = malloc(sizeof(*new));
2687                 *new = tk;
2688                 return new;
2689         }
2690
2691 ### The state stack.
2692
2693 The core data structure for the parser is the stack.  This tracks all
2694 the symbols that have been recognised or partially recognised.
2695
2696 The stack usually won't grow very large - maybe a few tens of entries.
2697 So we dynamically grow an array as required but never bother to
2698 shrink it down again.
2699
2700 We keep the stack as two separate allocations.  One, `asn_stack`, stores
2701 the "abstract syntax nodes" which are created by each reduction.  When
2702 we call `do_reduce` we need to pass an array of the `asn`s of the body
2703 of the production, and by keeping a separate `asn` stack, we can just
2704 pass a pointer into this stack.
2705
2706 The other allocation stores all other stack fields of which there are
2707 two.  The `state` is the most important one and guides the parsing
2708 process.  The `sym` is nearly unnecessary as it is implicit in the
2709 state.  However when we want to free entries from the `asn_stack`, it
2710 helps to know what type they are so we can call the right freeing
2711 function.  The symbol leads us to the right free function through
2712 `do_free`.
2713
2714 The stack is most properly seen as alternating states and symbols -
2715 states, like the 'DOT' in items, are between symbols.  Each frame in
2716 our stack holds a state and the symbol that was before it.  The
2717 bottom of stack holds the start state but no symbol, as nothing came
2718 before the beginning.  As we need to store some value, `TK_eof` is used
2719 to mark the beginning of the file as well as the end.
2720
2721 ###### parser functions
2722
2723         struct parser {
2724                 struct frame {
2725                         short state;
2726                         short sym;
2727                 } *stack;
2728                 void **asn_stack;
2729                 int stack_size;
2730                 int tos;
2731
2732                 ## parser state
2733         };
2734
2735 #### Shift and pop
2736
2737 Two operations are needed on the stack - shift (which is like push) and pop.
2738
2739 Shift applies not only to terminals but also to non-terminals.  When we
2740 reduce a production we will pop off frames corresponding to the body
2741 symbols, then push on a frame for the head of the production.  This last
2742 is exactly the same process as shifting in a terminal so we use the same
2743 function for both.  In both cases we provide the symbol.  The state is
2744 deduced from the current top-of-stack state and the new symbol.
2745
2746 To simplify other code we arrange for `shift` to fail if there is no `go to`
2747 state for the symbol.  This is useful in basic parsing due to our design
2748 that we shift when we can, and reduce when we cannot.  So the `shift`
2749 function reports if it could.
2750
2751 `shift` is also used to push state zero onto the stack, so if the
2752 stack is empty, it always chooses zero as the next state.
2753
2754 So `shift` finds the next state.  If that succeeds it extends the
2755 allocations if needed and pushes all the information onto the stacks.
2756
2757 ###### parser functions
2758
2759         static int shift(struct parser *p,
2760                          short sym, void *asn,
2761                          const struct state states[])
2762         {
2763                 struct frame next = {0};
2764                 int newstate = p->tos
2765                         ? search(&states[p->stack[p->tos-1].state],
2766                                  sym, 0)
2767                         : 0;
2768                 if (newstate < 0)
2769                         return 0;
2770
2771                 if (p->tos >= p->stack_size) {
2772                         p->stack_size += 10;
2773                         p->stack = realloc(p->stack, p->stack_size
2774                                            * sizeof(p->stack[0]));
2775                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2776                                            * sizeof(p->asn_stack[0]));
2777                 }
2778                 next.sym = sym;
2779                 next.state = newstate;
2780
2781                 p->stack[p->tos] = next;
2782                 p->asn_stack[p->tos] = asn;
2783                 p->tos++;
2784                 return 1;
2785         }
2786
2787 `pop` primarily moves the top of stack (`tos`) back down the required
2788 amount and frees any `asn` entries that need to be freed.  It is called
2789 _after_ we reduce a production, just before we `shift` the nonterminal
2790 in.
2791
2792 ###### parser functions
2793
2794         static void pop(struct parser *p, int num,
2795                         void(*do_free)(short sym, void *asn))
2796         {
2797                 int i;
2798
2799                 p->tos -= num;
2800                 for (i = 0; i < num; i++)
2801                         do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2802         }
2803
2804 ### The heart of the parser.
2805
2806 Now we have the parser.  For each token we might shift it, trigger a
2807 reduction, or start error handling.  2D tokens (IN, OUT, NEWLINE, EOL)
2808 might also be ignored.  Ignoring tokens is combined with shifting.
2809
2810 ###### parser vars
2811
2812         struct parser p = { 0 };
2813         struct token *tk = NULL;
2814         int accepted = 0;
2815
2816 ###### heart of parser
2817
2818         shift(&p, TK_eof, NULL, states);
2819         while (!accepted && p.tos > 0) {
2820                 struct frame *tos = &p.stack[p.tos-1];
2821                 if (!tk)
2822                         tk = tok_copy(token_next(tokens));
2823                 parser_trace(trace, &p,
2824                              tk, states, non_term, config->known_count);
2825
2826                 ## try shift or ignore
2827                 ## try reduce
2828                 ## handle error
2829         }
2830
2831 Indents are ignored unless they can be shifted onto the stack
2832 immediately or nothing can be shifted (in which case we reduce, and try
2833 again).  The end of an indented section - the OUT token - is ignored
2834 precisely when the indent was ignored.  To keep track of this we need a
2835 small stack of flags, which is easily stored as bits in an `unsigned
2836 long`.  This will never overflow and the scanner only allows 20 levels
2837 of indentation.
2838
2839 ###### parser state
2840         unsigned long ignored_indents;
2841
2842 NEWLINE is ignored when in an indented section of text which was not
2843 explicitly expected by the grammar.  So if the most recent indent is
2844 ignored, so is any NEWLINE token.
2845
2846 If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL
2847 token instead.  If that succeeds, we make a new copy of the NEWLINE
2848 token and continue.  This allows a NEWLINE to appear to be preceded by
2849 an indefinite number of EOL tokens.
2850
2851 The token number for `EOL` cannot be statically declared, so when the
2852 parser starts we need to look through the array of non-terminals to find
2853 the EOL.
2854
2855 ###### parser state
2856         int tk_eol;
2857
2858 ###### find eol
2859         p.tk_eol = 0;
2860         while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2861                 p.tk_eol += 1;
2862         p.tk_eol += TK_reserved + config->known_count;
2863
2864 For other tokens, we shift the next token if that is possible, otherwise
2865 we try to reduce a production.
2866
2867 ###### try shift or ignore
2868
2869         if ((tk->num == TK_newline || tk->num == TK_out) &&
2870             (p.ignored_indents & 1)) {
2871                 /* indented, so ignore OUT and NEWLINE */
2872                 if (tk->num == TK_out)
2873                         p.ignored_indents >>= 1;
2874                 free(tk);
2875                 tk = NULL;
2876                 parser_trace_action(trace, "Ignore");
2877                 continue;
2878         }
2879
2880         if (shift(&p, tk->num, tk, states)) {
2881                 if (tk->num == TK_out)
2882                         p.ignored_indents >>= 1;
2883                 if (tk->num == TK_in)
2884                         p.ignored_indents <<= 1;
2885
2886                 parser_trace_action(trace, "Shift");
2887                 tk = NULL;
2888                 ## did shift
2889                 continue;
2890         } else if (tk->num == TK_newline &&
2891                    shift(&p, p.tk_eol, tk, states)) {
2892                 tk = tok_copy(*tk);
2893                 parser_trace_action(trace, "ShiftEOL");
2894                 continue;
2895         }
2896
2897         if (tk->num == TK_in) {
2898                 int should_reduce;
2899                 if (states[tos->state].reduce_prod == MANY_REDUCIBLE)
2900                         /* Only reduce if TK_in in lookahead */
2901                         should_reduce = (search(&states[tos->state], TK_in, 1) >= 0);
2902                 else
2903                         /* Only reduce if we cannot shift anything */
2904                         should_reduce = (states[tos->state].go_to_cnt == 0);
2905                 if (!should_reduce) {
2906                         /* No indent expected here and reduce is not indicated,
2907                          * so ignore IN
2908                          */
2909                         free(tk);
2910                         tk = NULL;
2911                         p.ignored_indents <<= 1;
2912                         p.ignored_indents |= 1;
2913                         parser_trace_action(trace, "Ignore");
2914                         continue;
2915                 }
2916         }
2917
2918 We have already discussed the bulk of the handling of a "reduce" action,
2919 with the `pop()` and `shift()` functions doing much of the work.  There
2920 is a little more complexity needed to manage storage for the asn (Abstract
2921 Syntax Node), and also a test of whether the reduction is permitted.
2922
2923 When we try to shift the result of reducing production-zero, it will
2924 fail because there is no next state.  In this case the asn will not have
2925 been stored on the stack, so it get stored in the `ret` variable, and we
2926 report that that input has been accepted.
2927
2928 ###### parser vars
2929
2930         void *ret = NULL;
2931         int reduction;
2932
2933 ###### try reduce
2934
2935         reduction = states[tos->state].reduce_prod;
2936         if (reduction == MANY_REDUCIBLE)
2937                 reduction = search(&states[tos->state], tk->num, 1);
2938         if (reduction >= 0) {
2939                 void **body;
2940                 void *res;
2941                 int size = reductions[reduction].size;
2942                 int res_size = reductions[reduction].result_size;
2943
2944                 body = p.asn_stack + (p.tos - size);
2945                 res = res_size ? calloc(1, res_size) : NULL;
2946                 res_size = do_reduce(reduction, body, config, res);
2947                 if (res_size != reductions[reduction].result_size)
2948                         abort();
2949                 pop(&p, size, do_free);
2950                 if (!shift(&p, reductions[reduction].sym, res, states)) {
2951                         accepted = 1;
2952                         ret = res;
2953                         parser_trace_action(trace, "Accept");
2954                 } else
2955                         parser_trace_action(trace, "Reduce");
2956                 continue;
2957         }
2958
2959 If we can neither shift nor reduce we have an error to handle.  There
2960 are two possible responses to an error: we can pop single frames off the
2961 stack until we find a frame that can shift `TK_error`, or we can discard
2962 the current look-ahead symbol.
2963
2964 When we first see an error we do the first of these and set a flag to
2965 record that we are processing an error.  If the normal shift/reduce
2966 tests fail to find that anything can be done from that state, we start
2967 dropping tokens until either we manage to shift one, or reach end-of-file.
2968
2969 ###### parser vars
2970
2971         int in_err = 0;
2972
2973 ###### did shift
2974
2975         in_err = 0;
2976
2977 ###### handle error
2978
2979         if (in_err) {
2980                 parser_trace_action(trace, "DISCARD");
2981                 if (tk->num == TK_eof)
2982                         break;
2983                 free(tk);
2984                 tk = NULL;
2985         } else {
2986                 struct token *err_tk;
2987
2988                 parser_trace_action(trace, "ERROR");
2989
2990                 err_tk = tok_copy(*tk);
2991                 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2992                         // discard this state
2993                         pop(&p, 1, do_free);
2994                 if (p.tos == 0) {
2995                         free(err_tk);
2996                         // no state accepted TK_error
2997                         break;
2998                 }
2999                 in_err = 1;
3000         }
3001
3002 ###### parser includes
3003         #include "parser.h"
3004
3005 ###### parser_run
3006
3007         void *parser_run(struct token_state *tokens,
3008                          const struct state states[],
3009                          const struct reduction reductions[],
3010                          int (*do_reduce)(int, void**, struct token_config*, void*),
3011                          void (*do_free)(short, void*),
3012                          FILE *trace, const char *non_term[],
3013                          struct token_config *config)
3014         {
3015                 ## parser vars
3016
3017                 ## find eol
3018
3019                 ## heart of parser
3020
3021                 free(tk);
3022                 pop(&p, p.tos, do_free);
3023                 free(p.asn_stack);
3024                 free(p.stack);
3025                 return ret;
3026         }
3027
3028 ###### exported functions
3029         void *parser_run(struct token_state *tokens,
3030                          const struct state states[],
3031                          const struct reduction reductions[],
3032                          int (*do_reduce)(int, void**, struct token_config*, void*),
3033                          void (*do_free)(short, void*),
3034                          FILE *trace, const char *non_term[],
3035                          struct token_config *config);
3036
3037 ### Tracing
3038
3039 Being able to visualize the parser in action can be invaluable when
3040 debugging the parser code, or trying to understand how the parse of a
3041 particular grammar progresses.  The stack contains all the important
3042 state, so just printing out the stack every time around the parse loop
3043 can make it possible to see exactly what is happening.
3044
3045 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3046 are easily deduced from the change between consecutive lines, and the
3047 details of each state can be found by cross referencing the states list
3048 in the stack with the "report" that parsergen can generate.
3049
3050 For terminal symbols, we just dump the token.  For non-terminals we
3051 print the name of the symbol.  The look ahead token is reported at the
3052 end inside square brackets.
3053
3054 ###### parser functions
3055
3056         static char *reserved_words[] = {
3057                 [TK_error]        = "ERROR",
3058                 [TK_in]           = "IN",
3059                 [TK_out]          = "OUT",
3060                 [TK_newline]      = "NEWLINE",
3061                 [TK_eof]          = "$eof",
3062         };
3063
3064         void parser_trace(FILE *trace, struct parser *p,
3065                           struct token *tk, const struct state states[],
3066                           const char *non_term[], int knowns)
3067         {
3068                 int i;
3069                 if (!trace)
3070                         return;
3071                 for (i = 0; i < p->tos; i++) {
3072                         struct frame *f = &p->stack[i];
3073                         if (i) {
3074                                 int sym = f->sym;
3075                                 if (sym < TK_reserved &&
3076                                     reserved_words[sym] != NULL)
3077                                         fputs(reserved_words[sym], trace);
3078                                 else if (sym < TK_reserved + knowns) {
3079                                         struct token *t = p->asn_stack[i];
3080                                         text_dump(trace, t->txt, 20);
3081                                 } else
3082                                         fputs(non_term[sym - TK_reserved - knowns],
3083                                               trace);
3084                                 fputs(" ", trace);
3085                         }
3086                         fprintf(trace, "(%d) ", f->state);
3087                 }
3088                 fprintf(trace, "[");
3089                 if (tk->num < TK_reserved &&
3090                     reserved_words[tk->num] != NULL)
3091                         fputs(reserved_words[tk->num], trace);
3092                 else
3093                         text_dump(trace, tk->txt, 20);
3094                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3095         }
3096
3097         void parser_trace_action(FILE *trace, char *action)
3098         {
3099                 if (trace)
3100                         fprintf(trace, " - %s\n", action);
3101         }
3102
3103 # A Worked Example
3104
3105 The obvious example for a parser is a calculator.
3106
3107 As `scanner` provides number parsing function using `libgmp` it is not much
3108 work to perform arbitrary rational number calculations.
3109
3110 This calculator takes one expression, or an equality test, per line.
3111 The results are printed and if any equality test fails, the program
3112 exits with an error.
3113
3114 ###### File: parsergen.mk
3115         calc.c calc.h : parsergen parsergen.mdc
3116                 ./parsergen --tag calc -o calc parsergen.mdc
3117         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3118                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3119         calctest : calc
3120                 ./calc parsergen.mdc
3121         demos :: calctest
3122
3123 # calc: header
3124
3125         #include "parse_number.h"
3126         // what do we use for a demo-grammar?  A calculator of course.
3127         struct number {
3128                 mpq_t val;
3129                 char tail[2];
3130                 int err;
3131         };
3132
3133 # calc: code
3134
3135         #include <stdlib.h>
3136         #include <unistd.h>
3137         #include <fcntl.h>
3138         #include <sys/mman.h>
3139         #include <stdio.h>
3140         #include <malloc.h>
3141         #include <gmp.h>
3142         #include <string.h>
3143         #include "mdcode.h"
3144         #include "scanner.h"
3145         #include "parser.h"
3146
3147         #include "calc.h"
3148
3149         static void free_number(struct number *n)
3150         {
3151                 mpq_clear(n->val);
3152                 free(n);
3153         }
3154
3155         static int text_is(struct text t, char *s)
3156         {
3157                 return (strlen(s) == t.len &&
3158                         strncmp(s, t.txt, t.len) == 0);
3159         }
3160
3161         int main(int argc, char *argv[])
3162         {
3163                 int fd = open(argv[1], O_RDONLY);
3164                 int len = lseek(fd, 0, 2);
3165                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3166                 struct section *table = code_extract(file, file+len, NULL);
3167                 struct section *s;
3168                 struct token_config config = {
3169                         .ignored = (1 << TK_line_comment)
3170                                  | (1 << TK_in)
3171                                  | (1 << TK_out),
3172                         .number_chars = ".,_+-",
3173                         .word_start = "",
3174                         .word_cont = "",
3175                 };
3176                 for (s = table; s; s = s->next)
3177                         if (text_is(s->section, "example: input"))
3178                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3179                 while (table) {
3180                         struct section *t = table->next;
3181                         code_free(table->code);
3182                         free(table);
3183                         table = t;
3184                 }
3185                 exit(0);
3186         }
3187
3188 # calc: grammar
3189
3190         $LEFT + -
3191         $LEFT * / //
3192
3193         Session -> Session Line
3194                 | Line
3195
3196         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3197                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3198                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3199                                         mpf_clear(fl);
3200                                         }
3201                                      }$
3202                 | Expression = Expression NEWLINE ${
3203                         if (mpq_equal($1.val, $3.val))
3204                                 gmp_printf("Both equal %Qd\n", $1.val);
3205                         else {
3206                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3207                                         $1.val, $3.val);
3208                                 exit(1);
3209                         }
3210                 }$
3211                 | NEWLINE ${ printf("Blank line\n"); }$
3212                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3213
3214         $number
3215         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3216                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3217                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3218                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3219                 | Expression // Expression ${ {
3220                         mpz_t z0, z1, z2;
3221                         mpq_init($0.val);
3222                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3223                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3224                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3225                         mpz_tdiv_q(z0, z1, z2);
3226                         mpq_set_z($0.val, z0);
3227                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3228                 } }$
3229                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3230                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3231
3232 # example: input
3233
3234         355/113
3235         3.1415926535 - 355/113
3236         2 + 4 * 5
3237         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3238         10 * 9 / 2
3239         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3240
3241         355//113
3242
3243         error
3244
3245 ## A second example
3246
3247 I had originally thought that the LR05 style grammar would suit me there
3248 should never be a sequence of token that means either of two different
3249 things depending on what comes follows it.  Without that ambiguity, the
3250 extra strength of LALR, or even SLR, would not be needed.  However I
3251 discovered this was too simplistic.  There are times when it is quite
3252 reasonable for the empty string, or maybe a NEWLINE, to have different
3253 meanings depending on the next token.  For this, LR05 is not sufficient,
3254 so I added full action-table support.
3255
3256 This example tests the selection of REDUCE actions based on the
3257 look-ahead symbol, using a trivial grammar where the empty string can
3258 reasonably mean different things depending on what it precedes.
3259
3260
3261 ####### File: parsergen.mk
3262         demos :: do_lalr_demo
3263         lalr_demo.c lalr_demo.h : parsergen parsergen.mdc
3264                 ./parsergen --LALR --tag demo -o lalr_demo parsergen.mdc
3265         lalr_demo: lalr_demo.o libparser.o libscanner.o libmdcode.o
3266                 $(CC) -o lalr_demo lalr_demo.c libparser.o libscanner.o libmdcode.o -licuuc
3267         do_lalr_demo: lalr_demo
3268                 NOTRACE=1 ./lalr_demo
3269                 @echo 'Should produce "start of line, empty sign, empty sigl"'
3270
3271 ####### demo: grammar
3272
3273         $TERM $ @ + -
3274         line -> ${ printf("start of line"); }$
3275                 | line term
3276         term -> sigl IDENTIFIER
3277                 | sign NUMBER
3278         sigl ->   $
3279                 | @
3280                 | ${ printf(", empty sigl"); }$
3281         sign ->   +
3282                 | -
3283                 | ${ printf(", empty sign"); }$
3284
3285 ####### demo: header
3286
3287         //
3288
3289 ####### demo: code
3290
3291         #include <stdlib.h>
3292         #include <stdio.h>
3293         #include "mdcode.h"
3294         #include "scanner.h"
3295         #include "parser.h"
3296         #include "lalr_demo.h"
3297         int main(int argc, char *argv[])
3298         {
3299                 char test[] = "````\n$a -1 5 b\n";
3300                 struct section *table = code_extract(test, test+sizeof(test)-1, NULL);
3301                 struct token_config config = {
3302                         .ignored = (1 << TK_line_comment)
3303                                  | (1 << TK_newline)
3304                                  | (1 << TK_in)
3305                                  | (1 << TK_out),
3306                         .number_chars = "",
3307                         .word_start = "",
3308                         .word_cont = "",
3309                 };
3310                 parse_lalr_demo(table->code, &config, getenv("NOTRACE") ? NULL : stderr);
3311                 printf("\n");
3312                 exit(0);
3313         }