ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8 Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
   9 be used to describe the grammar and the parser can selectively ignore
  10 these where they aren't relevant.
  11
  12 There are several distinct sections.
  13
  14 1. `grammar_read` will read a grammar from a literate-code file and
  15    store the productions, symbols, and code fragments.
  16 2. `grammar_analyse` will take that grammar and build LR parsing
  17    states and look-ahead tables.
  18 3. `grammar_report` will present the details of the analysis
  19    in a readable format and will report any conflicts.
  20 4. `parser_generate` will write out C code files with various
  21    tables and with the code fragments arranged in useful places.
  22 5. `parser_run` is a library function intended to be linked together
  23    with the generated parser tables to complete the implementation of
  24    a parser.
  25 6. Finally `calc` is a test grammar for a simple calculator.  The
  26    `parsergen` program built from the C code in this file can extract
  27    that grammar directly from this file and process it.
  28
  29 ###### File: parsergen.c
  30         #include <unistd.h>
  31         #include <stdlib.h>
  32         #include <stdio.h>
  33         ## includes
  34         ## forward declarations
  35         ## declarations
  36         ## functions
  37         ## grammar_read
  38         ## grammar_analyse
  39         ## grammar_report
  40         ## parser_generate
  41         ## main
  42 ###### File: parser.h
  43         ## exported types
  44         ## exported functions
  45 ###### File: libparser.c
  46         #include <unistd.h>
  47         #include <stdlib.h>
  48         #include <stdio.h>
  49         ## parser includes
  50         ## parser functions
  51         ## parser_run
  52 ###### File: parsergen.mk
  53         CFLAGS += -Wall -g
  54         all :: parsergen calc
  55         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  56                 ./md2c parsergen.mdc
  57
  58 ## Reading the grammar
  59
  60 The grammar must be stored in a markdown literate code file as parsed by
  61 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  62 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  63 present, will be literally copied into the generated `.c` and `.h`
  64 files.  The third is required and contains the grammar which is
  65 tokenised with "[scanner][]".  `reduce` can provide variable
  66 declarations and common intialization code for all the reduction code
  67 fragments in the grammar.
  68
  69 If the `--tag` option is given, then any top level header that doesn't
  70 start with the tag is ignored, and the tag is striped from the rest.  So
  71 `--tag Foo` means that the four sections expected are `Foo: header`,
  72 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  73 to extract the sample calculator from this file.
  74
  75 [mdcode]: mdcode.html
  76 [scanner]: scanner.html
  77
  78 ###### includes
  79         #include "mdcode.h"
  80         #include "scanner.h"
  81
  82 ###### parser includes
  83         #include "mdcode.h"
  84         #include "scanner.h"
  85
  86 The grammar contains production sets, precedence/associativity
  87 declarations, general terminal declarations, and data type declarations.
  88 These are all parsed with _ad hoc_ parsing as we don't have a parser
  89 generator yet.
  90
  91 The precedence and associativity can be set for each production, and
  92 can be inherited from symbols.  The data type (either structure or a
  93 reference to a structure) is potentially defined for each non-terminal
  94 and describes what C structure is used to store information about each
  95 symbol.
  96
  97 ###### declarations
  98         enum assoc {Left, Right, Non};
  99         char *assoc_names[] = {"Left","Right","Non"};
 100
 101         struct symbol {
 102                 struct text struct_name;
 103                 int isref;
 104                 enum assoc assoc;
 105                 unsigned short precedence;
 106                 ## symbol fields
 107         };
 108         struct production {
 109                 unsigned short precedence;
 110                 enum assoc assoc;
 111                 ## production fields
 112         };
 113         struct grammar {
 114                 ## grammar fields
 115         };
 116
 117 The strings reported by `mdcode` and `scanner` are `struct text` which
 118 have length rather than being nul terminated.  To help with printing and
 119 comparing we define `text_is` and `prtxt`, which should possibly go in
 120 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 121 strings which might contain control characters.
 122
 123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 124 because that is what we need to detect tags.
 125
 126 ###### functions
 127         static int text_is(struct text t, char *s)
 128         {
 129                 return (strlen(s) == t.len &&
 130                         strncmp(s, t.txt, t.len) == 0);
 131         }
 132         static void prtxt(struct text t)
 133         {
 134                 printf("%.*s", t.len, t.txt);
 135         }
 136
 137         static int strip_tag(struct text *t, char *tag)
 138         {
 139                 int skip = strlen(tag) + 1;
 140                 if (skip >= t->len ||
 141                     strncmp(t->txt, tag, skip-1) != 0 ||
 142                     t->txt[skip-1] != ':')
 143                         return 0;
 144                 while (skip < t->len && t->txt[skip] == ' ')
 145                         skip++;
 146                 t->len -= skip;
 147                 t->txt += skip;
 148                 return 1;
 149         }
 150
 151 ### Symbols
 152
 153 Productions are comprised primarily of symbols - terminal and
 154 non-terminal.  We do not make a syntactic distinction between the two,
 155 though non-terminals must be identifiers while terminals can also be
 156 marks.  Non-terminal symbols are those which appear in the head of a
 157 production, terminal symbols are those which don't.  There are also
 158 "virtual" symbols used for precedence marking discussed later, and
 159 sometimes we won't know what type a symbol is yet.
 160
 161 To help with code safety it is possible to declare the terminal symbols.
 162 If this is done, then any symbol used in a production that does not
 163 appear in the head of any production and is not declared as a terminal
 164 is treated as an error.
 165
 166 ###### forward declarations
 167         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 168         char *symtypes = "UVTN";
 169 ###### symbol fields
 170         enum symtype type;
 171 ###### grammar fields
 172         int terminals_declared;
 173
 174 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 175 table of known symbols and the resulting parser will report them as
 176 `TK_reserved + N`.  A small set of identifiers are reserved for the
 177 different token types that `scanner` can report, and an even smaller set
 178 are reserved for a special token that the parser can generate (`EOL`) as
 179 will be described later.  This latter set cannot use predefined numbers,
 180 so they are marked as `isspecial` for now and will get assigned a number
 181 with the non-terminals later.
 182
 183 ###### declarations
 184
 185         static struct { int num; char *name; } reserved_words[] = {
 186                 { TK_error,        "ERROR" },
 187                 { TK_number,       "NUMBER" },
 188                 { TK_ident,        "IDENTIFIER" },
 189                 { TK_mark,         "MARK" },
 190                 { TK_string,       "STRING" },
 191                 { TK_multi_string, "MULTI_STRING" },
 192                 { TK_in,           "IN" },
 193                 { TK_out,          "OUT" },
 194                 { TK_newline,      "NEWLINE" },
 195                 { TK_eof,          "$eof" },
 196                 { -1,              "EOL" },
 197         };
 198
 199 ###### symbol fields
 200         short num;
 201         unsigned int isspecial:1;
 202
 203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 204 recognised.  The former is automatically expected at the end of the text
 205 being parsed. The latter are always ignored by the parser.
 206
 207 All of these symbols are stored in a simple symbol table.  We use the
 208 `struct text` from `mdcode` to store the name and link them together
 209 into a sorted list using an insertion sort.
 210
 211 We don't have separate `find` and `insert` functions as any symbol we
 212 find needs to be remembered.  We simply expect `find` to always return a
 213 symbol, but its type might be `Unknown`.
 214
 215 ###### includes
 216         #include <string.h>
 217
 218 ###### symbol fields
 219         struct text name;
 220         struct symbol *next;
 221
 222 ###### grammar fields
 223         struct symbol *syms;
 224         int num_syms;
 225
 226 ###### functions
 227         static struct symbol *sym_find(struct grammar *g, struct text s)
 228         {
 229                 struct symbol **l = &g->syms;
 230                 struct symbol *n;
 231                 int cmp = 1;
 232
 233                 while (*l &&
 234                         (cmp = text_cmp((*l)->name, s)) < 0)
 235                                 l = & (*l)->next;
 236                 if (cmp == 0)
 237                         return *l;
 238                 n = calloc(1, sizeof(*n));
 239                 n->name = s;
 240                 n->type = Unknown;
 241                 n->next = *l;
 242                 n->num = -1;
 243                 *l = n;
 244                 return n;
 245         }
 246
 247         static void symbols_init(struct grammar *g)
 248         {
 249                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 250                 int i;
 251                 for (i = 0; i < entries; i++) {
 252                         struct text t;
 253                         struct symbol *s;
 254                         t.txt = reserved_words[i].name;
 255                         t.len = strlen(t.txt);
 256                         s = sym_find(g, t);
 257                         s->type = Terminal;
 258                         s->num = reserved_words[i].num;
 259                         s->isspecial = 1;
 260                 }
 261         }
 262
 263 ### Data types and precedence.
 264
 265 Data type specification, precedence specification, and declaration of
 266 terminals are all introduced by a dollar sign at the start of the line.
 267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 268 precedence, if it is `TERM` then the line declares terminals without
 269 precedence, otherwise it specifies a data type.
 270
 271 The data type name is simply stored and applied to the head of all
 272 subsequent productions.  It must be the name of a structure optionally
 273 preceded by an asterisk which means a reference or "pointer".  So
 274 `$expression` maps to `struct expression` and `$*statement` maps to
 275 `struct statement *`.
 276
 277 Any productions given before the first data type declaration will have
 278 no data type associated with them and can carry no information.  In
 279 order to allow other non-terminals to have no type, the data type
 280 `$void` can be given.  This does *not* mean that `struct void` will be
 281 used, but rather than no type will be associated with subsequent
 282 non-terminals.
 283
 284 The precedence line must contain a list of symbols, either terminal
 285 symbols or virtual symbols.  It can only contain symbols that have not
 286 been seen yet, so precedence declaration must precede the productions
 287 that they affect.
 288
 289 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 290 carry precedence information but are not included in the grammar.  A
 291 production can declare that it inherits the precedence of a given
 292 virtual symbol.
 293
 294 This use of `$$` precludes it from being used as a symbol in the
 295 described language.  Two other symbols: `${` and `}$` are also
 296 unavailable.
 297
 298 Each new precedence line introduces a new precedence level and
 299 declares how it associates.  This level is stored in each symbol
 300 listed and may be inherited by any production which uses the symbol.  A
 301 production inherits from the last symbol which has a precedence.
 302
 303 The symbols on the first precedence line have the lowest precedence.
 304 Subsequent lines introduce symbols with higher precedence and so bind
 305 more tightly.
 306
 307 Note that a declaration line (or "dollar line") can start with either of
 308 two different marks: "$" or "$*".  The `dollar_line()` function defined
 309 here is told which was found in the `isref` argument.
 310
 311 ###### grammar fields
 312         struct text current_type;
 313         int type_isref;
 314         int prec_levels;
 315
 316 ###### declarations
 317         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 318         static const char *known[] = { "$$", "${", "}$" };
 319
 320 ###### functions
 321         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 322         {
 323                 struct token t = token_next(ts);
 324                 char *err;
 325                 enum assoc assoc;
 326                 int term = 0;
 327                 int found;
 328
 329                 if (t.num != TK_ident) {
 330                         err = "type or assoc expected after '$'";
 331                         goto abort;
 332                 }
 333                 if (text_is(t.txt, "LEFT"))
 334                         assoc = Left;
 335                 else if (text_is(t.txt, "RIGHT"))
 336                         assoc = Right;
 337                 else if (text_is(t.txt, "NON"))
 338                         assoc = Non;
 339                 else if (text_is(t.txt, "TERM")) {
 340                         term = 1;
 341                         g->terminals_declared = 1;
 342                 } else {
 343                         g->current_type = t.txt;
 344                         g->type_isref = isref;
 345                         if (text_is(t.txt, "void"))
 346                                 g->current_type.txt = NULL;
 347                         t = token_next(ts);
 348                         if (t.num != TK_newline) {
 349                                 err = "Extra tokens after type name";
 350                                 goto abort;
 351                         }
 352                         return NULL;
 353                 }
 354
 355                 if (isref) {
 356                         err = "$* cannot be followed by a precedence";
 357                         goto abort;
 358                 }
 359
 360                 // This is a precedence or TERM line, need some symbols.
 361                 found = 0;
 362                 g->prec_levels += 1;
 363                 t = token_next(ts);
 364                 while (t.num != TK_newline) {
 365                         enum symtype type = Terminal;
 366                         struct symbol *s;
 367                         if (t.num == TK_virtual) {
 368                                 type = Virtual;
 369                                 t = token_next(ts);
 370                                 if (t.num != TK_ident) {
 371                                         err = "$$ must be followed by a word";
 372                                         goto abort;
 373                                 }
 374                                 if (term) {
 375                                         err = "Virtual symbols not permitted on $TERM line";
 376                                         goto abort;
 377                                 }
 378                         } else if (t.num != TK_ident &&
 379                                    t.num != TK_mark) {
 380                                 err = "Illegal token in precedence line";
 381                                 goto abort;
 382                         }
 383                         s = sym_find(g, t.txt);
 384                         if (s->type != Unknown) {
 385                                 err = "Symbols in precedence/TERM line must not already be known.";
 386                                 goto abort;
 387                         }
 388                         s->type = type;
 389                         if (!term) {
 390                                 s->precedence = g->prec_levels;
 391                                 s->assoc = assoc;
 392                         }
 393                         found += 1;
 394                         t = token_next(ts);
 395                 }
 396                 if (found == 0) {
 397                         err = "No symbols given on precedence/TERM line";
 398                         goto abort;
 399                 }
 400                 return NULL;
 401         abort:
 402                 while (t.num != TK_newline && t.num != TK_eof)
 403                         t = token_next(ts);
 404                 return err;
 405         }
 406
 407 ### Productions
 408
 409 A production either starts with an identifier which is the head
 410 non-terminal, or a vertical bar (`|`) in which case this production
 411 uses the same head as the previous one.  The identifier must be
 412 followed by a `->` mark.  All productions for a given non-terminal must
 413 be grouped together with the `nonterminal ->` given only once.
 414
 415 After this a (possibly empty) sequence of identifiers and marks form
 416 the body of the production.  A virtual symbol may be given after the
 417 body by preceding it with `$$`.  If a virtual symbol is given, the
 418 precedence of the production is that for the virtual symbol.  If none
 419 is given, the precedence is inherited from the last symbol in the
 420 production which has a precedence specified.
 421
 422 After the optional precedence may come the `${` mark.  This indicates
 423 the start of a code fragment.  If present, this must be on the same
 424 line as the start of the production.
 425
 426 All of the text from the `${` through to the matching `}$` is
 427 collected and forms the code-fragment for the production.  It must all
 428 be in one `code_node` of the literate code.  The `}$` must be
 429 at the end of a line.
 430
 431 Text in the code fragment will undergo substitutions where `$N` or
 432 `$<N`,for some numeric `N` (or non-numeric indicator as described
 433 later), will be replaced with a variable holding the parse information
 434 for the particular symbol in the production.  `$0` is the head of the
 435 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 436 for a terminal symbol is `struct token`.  For a non-terminal, it is
 437 whatever has been declared for that symbol.  The `<` may be included and
 438 means that the value (usually a reference) is being moved out, so it
 439 will not automatically be freed.  The effect of using '<' is that the
 440 variable is cleared to all-zeros.
 441
 442 While building productions we will need to add to an array which needs to
 443 grow dynamically.
 444
 445 ###### functions
 446         static void array_add(void *varray, int *cnt, void *new)
 447         {
 448                 void ***array = varray;
 449                 int current = 0;
 450                 const int step = 8;
 451                 current = ((*cnt-1) | (step-1))+1;
 452                 if (*cnt == current) {
 453                         /* must grow */
 454                         current += step;
 455                         *array = realloc(*array, current * sizeof(void*));
 456                 }
 457                 (*array)[*cnt] = new;
 458                 (*cnt) += 1;
 459         }
 460
 461 Collecting the code fragment simply involves reading tokens until we
 462 hit the end token `}$`, and noting the character position of the start and
 463 the end.
 464
 465 ###### functions
 466         static struct text collect_code(struct token_state *state,
 467                                         struct token start)
 468         {
 469                 struct text code;
 470                 struct token t;
 471                 code.txt = start.txt.txt + start.txt.len;
 472                 do
 473                         t = token_next(state);
 474                 while (t.node == start.node &&
 475                        t.num != TK_close && t.num != TK_error &&
 476                        t.num != TK_eof);
 477                 if (t.num == TK_close && t.node == start.node)
 478                         code.len = t.txt.txt - code.txt;
 479                 else
 480                         code.txt = NULL;
 481                 return code;
 482         }
 483
 484 Now we have all the bits we need to parse a full production.
 485
 486 ###### includes
 487         #include <memory.h>
 488
 489 ###### grammar fields
 490         struct production **productions;
 491         int production_count;
 492
 493 ###### production fields
 494         struct symbol  *head;
 495         struct symbol **body;
 496         int             body_size;
 497         struct text     code;
 498         int             code_line;
 499
 500 ###### symbol fields
 501         int first_production;
 502
 503 ###### functions
 504         static char *parse_production(struct grammar *g,
 505                                       struct symbol *head,
 506                                       struct token_state *state)
 507         {
 508                 /* Head has already been parsed. */
 509                 struct token tk;
 510                 char *err;
 511                 struct production p, *pp;
 512
 513                 memset(&p, 0, sizeof(p));
 514                 p.head = head;
 515                 tk = token_next(state);
 516                 while (tk.num == TK_ident || tk.num == TK_mark) {
 517                         struct symbol *bs = sym_find(g, tk.txt);
 518                         if (bs->type == Virtual) {
 519                                 err = "Virtual symbol not permitted in production";
 520                                 goto abort;
 521                         }
 522                         if (bs->precedence) {
 523                                 p.precedence = bs->precedence;
 524                                 p.assoc = bs->assoc;
 525                         }
 526                         array_add(&p.body, &p.body_size, bs);
 527                         tk = token_next(state);
 528                 }
 529                 if (tk.num == TK_virtual) {
 530                         struct symbol *vs;
 531                         tk = token_next(state);
 532                         if (tk.num != TK_ident) {
 533                                 err = "word required after $$";
 534                                 goto abort;
 535                         }
 536                         vs = sym_find(g, tk.txt);
 537                         if (vs->precedence == 0) {
 538                                 err = "symbol after $$ must have precedence";
 539                                 goto abort;
 540                         } else {
 541                                 p.precedence = vs->precedence;
 542                                 p.assoc = vs->assoc;
 543                         }
 544                         tk = token_next(state);
 545                 }
 546                 if (tk.num == TK_open) {
 547                         p.code_line = tk.line;
 548                         p.code = collect_code(state, tk);
 549                         if (p.code.txt == NULL) {
 550                                 err = "code fragment not closed properly";
 551                                 goto abort;
 552                         }
 553                         tk = token_next(state);
 554                 }
 555
 556                 if (tk.num != TK_newline && tk.num != TK_eof) {
 557                         err = "stray tokens at end of line";
 558                         goto abort;
 559                 }
 560                 pp = malloc(sizeof(*pp));
 561                 *pp = p;
 562                 array_add(&g->productions, &g->production_count, pp);
 563                 return NULL;
 564         abort:
 565                 while (tk.num != TK_newline && tk.num != TK_eof)
 566                         tk = token_next(state);
 567                 free(p.body);
 568                 return err;
 569         }
 570
 571 With the ability to parse productions and dollar-lines, we have nearly all
 572 that we need to parse a grammar from a `code_node`.
 573
 574 The head of the first production will effectively be the `start` symbol
 575 of the grammar.  However it won't _actually_ be so.  Processing the
 576 grammar is greatly simplified if the real start symbol only has a single
 577 production, and expects `$eof` as the final terminal.  So when we find
 578 the first explicit production we insert an extra implicit production as
 579 production zero which looks like
 580
 581 ###### Example: production 0
 582         $start -> START $eof
 583
 584 where `START` is the first non-terminal given.
 585
 586 ###### create production zero
 587         struct production *p = calloc(1,sizeof(*p));
 588         struct text start = {"$start",6};
 589         struct text eof = {"$eof",4};
 590         struct text code = {"$0 = $<1;", 9};
 591         p->head = sym_find(g, start);
 592         p->head->type = Nonterminal;
 593         p->head->struct_name = g->current_type;
 594         p->head->isref = g->type_isref;
 595         if (g->current_type.txt)
 596                 p->code = code;
 597         array_add(&p->body, &p->body_size, head);
 598         array_add(&p->body, &p->body_size, sym_find(g, eof));
 599         p->head->first_production = g->production_count;
 600         array_add(&g->productions, &g->production_count, p);
 601
 602 Now we are ready to read in the grammar.  We ignore comments
 603 and strings so that the marks which introduce them can be
 604 used as terminals in the grammar.  We don't ignore numbers
 605 even though we don't allow them as that causes the scanner
 606 to produce errors that the parser is better positioned to handle.
 607 We also ignore indentation, but we expect and use newlines.
 608
 609 To allow comments in the grammar, we explicitly check for "//" as the
 610 first token on the line and those lines are skipped.  "//" can still be
 611 used as a terminal anywhere that a terminal is expected.
 612
 613 ###### grammar_read
 614         static struct grammar *grammar_read(struct code_node *code)
 615         {
 616                 struct token_config conf = {
 617                         .word_start = "",
 618                         .word_cont = "",
 619                         .words_marks = known,
 620                         .known_count = sizeof(known)/sizeof(known[0]),
 621                         .number_chars = "",
 622                         .ignored = (1 << TK_line_comment)
 623                                  | (1 << TK_block_comment)
 624                                  | (0 << TK_number)
 625                                  | (1 << TK_string)
 626                                  | (1 << TK_multi_string)
 627                                  | (1 << TK_in)
 628                                  | (1 << TK_out),
 629                 };
 630
 631                 struct token_state *state = token_open(code, &conf);
 632                 struct token tk;
 633                 struct symbol *head = NULL;
 634                 struct grammar *g;
 635                 char *err = NULL;
 636
 637                 g = calloc(1, sizeof(*g));
 638                 symbols_init(g);
 639
 640                 for (tk = token_next(state); tk.num != TK_eof;
 641                      tk = token_next(state)) {
 642                         if (tk.num == TK_newline)
 643                                 continue;
 644                         if (tk.num == TK_ident) {
 645                                 // new non-terminal
 646                                 head = sym_find(g, tk.txt);
 647                                 if (head->type == Nonterminal)
 648                                         err = "This non-terminal has already be used.";
 649                                 else if (head->type == Virtual)
 650                                         err = "Virtual symbol not permitted in head of production";
 651                                 else {
 652                                         head->type = Nonterminal;
 653                                         head->struct_name = g->current_type;
 654                                         head->isref = g->type_isref;
 655                                         if (g->production_count == 0) {
 656                                                 ## create production zero
 657                                         }
 658                                         head->first_production = g->production_count;
 659                                         tk = token_next(state);
 660                                         if (tk.num == TK_mark &&
 661                                             text_is(tk.txt, "->"))
 662                                                 err = parse_production(g, head, state);
 663                                         else
 664                                                 err = "'->' missing in production";
 665                                 }
 666                         } else if (tk.num == TK_mark
 667                                    && text_is(tk.txt, "|")) {
 668                                 // another production for same non-term
 669                                 if (head)
 670                                         err = parse_production(g, head, state);
 671                                 else
 672                                         err = "First production must have a head";
 673                         } else if (tk.num == TK_mark
 674                                    && text_is(tk.txt, "$")) {
 675                                 err = dollar_line(state, g, 0);
 676                         } else if (tk.num == TK_mark
 677                                    && text_is(tk.txt, "$*")) {
 678                                 err = dollar_line(state, g, 1);
 679                         } else if (tk.num == TK_mark
 680                                    && text_is(tk.txt, "//")) {
 681                                 while (tk.num != TK_newline &&
 682                                        tk.num != TK_eof)
 683                                         tk = token_next(state);
 684                         } else {
 685                                 err = "Unrecognised token at start of line.";
 686                         }
 687                         if (err)
 688                                 goto abort;
 689                 }
 690                 token_close(state);
 691
 692                 struct symbol *s;
 693                 for (s = g->syms; s; s = s->next) {
 694                         if (s->type != Unknown)
 695                                 continue;
 696                         if (!g->terminals_declared) {
 697                                 s->type = Terminal;
 698                                 continue;
 699                         }
 700                         err = "not declared";
 701                         fprintf(stderr, "Token %.*s not declared\n",
 702                                 s->name.len, s->name.txt);
 703                 }
 704                 if (err) {
 705                         free(g); // FIXME free content
 706                         g = NULL;
 707                 }
 708                 return g;
 709         abort:
 710                 fprintf(stderr, "Error at line %d: %s\n",
 711                         tk.line, err);
 712                 token_close(state);
 713                 free(g); // FIXME free content
 714                 return NULL;
 715         }
 716
 717 ## Analysing the grammar
 718
 719 The central task in analysing the grammar is to determine a set of
 720 states to drive the parsing process.  This will require finding various
 721 sets of symbols and of "items".  Some of these sets will need to attach
 722 information to each element in the set, so they are more like maps.
 723
 724 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 725 Many of these will be the same as each other.  To avoid memory wastage,
 726 and to simplify some comparisons of sets, these sets will be stored in a
 727 list of unique sets, each assigned a number.
 728
 729 Once we have the data structures in hand to manage these sets and lists,
 730 we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
 731 sets, and then create the item sets which define the various states.
 732
 733 ### Symbol sets.
 734
 735 Though we don't only store symbols in these sets, they are the main
 736 things we store, so they are called symbol sets or "symsets".
 737
 738 A symset has a size, an array of shorts, and an optional array of data
 739 values, which are also shorts.  If the array of data is not present, we
 740 store an impossible pointer, as `NULL` is used to indicate that no
 741 memory has been allocated yet;
 742
 743 ###### declarations
 744         struct symset {
 745                 short cnt;
 746                 unsigned short *syms, *data;
 747         };
 748         #define NO_DATA ((unsigned short *)1)
 749         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 750         const struct symset INIT_DATASET = { 0, NULL, NULL };
 751
 752 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 753 using an insertion sort.  We don't explicitly record the amount of
 754 allocated space as it can be derived directly from the current `cnt`
 755 using `((cnt - 1) | 7) + 1`.
 756
 757 ###### functions
 758         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 759         {
 760                 int i;
 761                 int current = ((s->cnt-1) | 7) + 1;
 762                 if (current == s->cnt) {
 763                         current += 8;
 764                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 765                         if (s->data != NO_DATA)
 766                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 767                 }
 768                 i = s->cnt;
 769                 while (i > 0 && s->syms[i-1] > key) {
 770                         s->syms[i] = s->syms[i-1];
 771                         if (s->data != NO_DATA)
 772                                 s->data[i] = s->data[i-1];
 773                         i--;
 774                 }
 775                 s->syms[i] = key;
 776                 if (s->data != NO_DATA)
 777                         s->data[i] = val;
 778                 s->cnt += 1;
 779         }
 780
 781 Finding a symbol (or item) in a `symset` uses a simple binary search.
 782 We return the index where the value was found (so data can be accessed),
 783 or `-1` to indicate failure.
 784
 785         static int symset_find(struct symset *ss, unsigned short key)
 786         {
 787                 int lo = 0;
 788                 int hi = ss->cnt;
 789
 790                 if (hi == 0)
 791                         return -1;
 792                 while (lo + 1 < hi) {
 793                         int mid = (lo + hi) / 2;
 794                         if (ss->syms[mid] <= key)
 795                                 lo = mid;
 796                         else
 797                                 hi = mid;
 798                 }
 799                 if (ss->syms[lo] == key)
 800                         return lo;
 801                 return -1;
 802         }
 803
 804 We will often want to form the union of two symsets.  When we do, we
 805 will often want to know if anything changed (as that might mean there is
 806 more work to do).  So `symset_union` reports whether anything was added
 807 to the first set.  We use a slow quadratic approach as these sets are
 808 rarely large.  If profiling shows this to be a problem it can be
 809 optimised later.
 810
 811         static int symset_union(struct symset *a, struct symset *b)
 812         {
 813                 int i;
 814                 int added = 0;
 815                 for (i = 0; i < b->cnt; i++)
 816                         if (symset_find(a, b->syms[i]) < 0) {
 817                                 unsigned short data = 0;
 818                                 if (b->data != NO_DATA)
 819                                         data = b->data[i];
 820                                 symset_add(a, b->syms[i], data);
 821                                 added++;
 822                         }
 823                 return added;
 824         }
 825
 826 And of course we must be able to free a symset.
 827
 828         static void symset_free(struct symset ss)
 829         {
 830                 free(ss.syms);
 831                 if (ss.data != NO_DATA)
 832                         free(ss.data);
 833         }
 834
 835 ### Symset Storage
 836
 837 Some symsets are simply stored somewhere appropriate in the `grammar`
 838 data structure, others needs a bit of indirection.  This applies
 839 particularly to "LA" sets.  These will be paired with "items" in an
 840 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 841 to be stored in the `data` field, so we need a mapping from a small
 842 (short) number to an LA `symset`.
 843
 844 As mentioned earlier this involves creating a list of unique symsets.
 845
 846 For now, we just use a linear list sorted by insertion.  A skiplist
 847 would be more efficient and may be added later.
 848
 849 ###### declarations
 850
 851         struct setlist {
 852                 struct symset ss;
 853                 int num;
 854                 struct setlist *next;
 855         };
 856
 857 ###### grammar fields
 858         struct setlist *sets;
 859         int nextset;
 860
 861 ###### functions
 862
 863         static int ss_cmp(struct symset a, struct symset b)
 864         {
 865                 int i;
 866                 int diff = a.cnt - b.cnt;
 867                 if (diff)
 868                         return diff;
 869                 for (i = 0; i < a.cnt; i++) {
 870                         diff = (int)a.syms[i] - (int)b.syms[i];
 871                         if (diff)
 872                                 return diff;
 873                 }
 874                 return 0;
 875         }
 876
 877         static int save_set(struct grammar *g, struct symset ss)
 878         {
 879                 struct setlist **sl = &g->sets;
 880                 int cmp = 1;
 881                 struct setlist *s;
 882
 883                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 884                         sl = & (*sl)->next;
 885                 if (cmp == 0) {
 886                         symset_free(ss);
 887                         return (*sl)->num;
 888                 }
 889
 890                 s = malloc(sizeof(*s));
 891                 s->ss = ss;
 892                 s->num = g->nextset;
 893                 g->nextset += 1;
 894                 s->next = *sl;
 895                 *sl = s;
 896                 return s->num;
 897         }
 898
 899 Finding a set by number is currently performed by a simple linear
 900 search.  If this turns out to hurt performance, we can store the sets in
 901 a dynamic array like the productions.
 902
 903         static struct symset set_find(struct grammar *g, int num)
 904         {
 905                 struct setlist *sl = g->sets;
 906                 while (sl && sl->num != num)
 907                         sl = sl->next;
 908                 return sl->ss;
 909         }
 910
 911 ### Setting `nullable`
 912
 913 We set `nullable` on the head symbol for any production for which all
 914 the body symbols (if any) are nullable.  As this is a recursive
 915 definition, any change in the `nullable` setting means that we need to
 916 re-evaluate where it needs to be set.
 917
 918 We simply loop around performing the same calculations until no more
 919 changes happen.
 920
 921 ###### symbol fields
 922         int nullable;
 923
 924 ###### functions
 925         static void set_nullable(struct grammar *g)
 926         {
 927                 int check_again = 1;
 928                 while (check_again) {
 929                         int p;
 930                         check_again = 0;
 931                         for (p = 0; p < g->production_count; p++) {
 932                                 struct production *pr = g->productions[p];
 933                                 int s;
 934
 935                                 if (pr->head->nullable)
 936                                         continue;
 937                                 for (s = 0; s < pr->body_size; s++)
 938                                         if (! pr->body[s]->nullable)
 939                                                 break;
 940                                 if (s == pr->body_size) {
 941                                         pr->head->nullable = 1;
 942                                         check_again = 1;
 943                                 }
 944                         }
 945                 }
 946         }
 947
 948 ### Building the `first` sets
 949
 950 When calculating what can follow a particular non-terminal, we will need
 951 to know what the "first" terminal in any subsequent non-terminal might
 952 be.  So we calculate the `first` set for every non-terminal and store
 953 them in an array.  We don't bother recording the "first" set for
 954 terminals as they are trivial.
 955
 956 As the "first" for one symbol might depend on the "first" of another, we
 957 repeat the calculations until no changes happen, just like with
 958 `set_nullable`.  This makes use of the fact that `symset_union` reports
 959 if any change happens.
 960
 961 The core of this, which finds the "first" of part of a production body,
 962 will be reused for computing the "follow" sets, so we split that out
 963 into a separate function, `add_first`, which also reports if it got all
 964 the way to the end of the production without finding a non-nullable
 965 symbol.
 966
 967 ###### grammar fields
 968         struct symset *first;
 969
 970 ###### functions
 971
 972         static int add_first(struct production *pr, int start,
 973                              struct symset *target, struct grammar *g,
 974                              int *to_end)
 975         {
 976                 int s;
 977                 int changed = 0;
 978                 for (s = start; s < pr->body_size; s++) {
 979                         struct symbol *bs = pr->body[s];
 980                         if (bs->type == Terminal) {
 981                                 if (symset_find(target, bs->num) < 0) {
 982                                         symset_add(target, bs->num, 0);
 983                                         changed = 1;
 984                                 }
 985                                 break;
 986                         } else if (symset_union(target, &g->first[bs->num]))
 987                                 changed = 1;
 988                         if (!bs->nullable)
 989                                 break;
 990                 }
 991                 if (to_end)
 992                         *to_end = (s == pr->body_size);
 993                 return changed;
 994         }
 995
 996         static void build_first(struct grammar *g)
 997         {
 998                 int check_again = 1;
 999                 int s;
1000                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1001                 for (s = 0; s < g->num_syms; s++)
1002                         g->first[s] = INIT_SYMSET;
1003
1004                 while (check_again) {
1005                         int p;
1006                         check_again = 0;
1007                         for (p = 0; p < g->production_count; p++) {
1008                                 struct production *pr = g->productions[p];
1009                                 struct symset *head = &g->first[pr->head->num];
1010
1011                                 if (add_first(pr, 0, head, g, NULL))
1012                                         check_again = 1;
1013                         }
1014                 }
1015         }
1016
1017 ### Building the `follow` sets.
1018
1019 There are two different situations when we will want to generate
1020 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1021 single "follow" set for each non-terminal in the grammar.  That is what
1022 is happening here.  If we are doing an LALR or LR analysis we will want
1023 to generate a separate "LA" set for each item.  We do that later in
1024 state generation.
1025
1026 There are two parts to generating a "follow" set.  Firstly we look at
1027 every place that any non-terminal appears in the body of any production,
1028 and we find the set of possible "first" symbols after there.  This is
1029 done using `add_first` above and only needs to be done once as the
1030 "first" sets are now stable and will not change.
1031
1032 ###### grammar fields
1033         struct symset *follow;
1034
1035 ###### follow code
1036
1037         for (p = 0; p < g->production_count; p++) {
1038                 struct production *pr = g->productions[p];
1039                 int b;
1040
1041                 for (b = 0; b < pr->body_size - 1; b++) {
1042                         struct symbol *bs = pr->body[b];
1043                         if (bs->type == Terminal)
1044                                 continue;
1045                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1046                 }
1047         }
1048
1049 The second part is to add the "follow" set of the head of a production
1050 to the "follow" sets of the final symbol in the production, and any
1051 other symbol which is followed only by `nullable` symbols.  As this
1052 depends on "follow" itself we need to repeatedly perform the process
1053 until no further changes happen.
1054
1055 Rather than 2 nested loops, one that repeats the whole process until
1056 there is no change, and one that iterates of the set of productions, we
1057 combine these two functions into a single loop.
1058
1059 ###### follow code
1060
1061         for (check_again = 0, p = 0;
1062              p < g->production_count;
1063              p < g->production_count-1
1064                 ? p++ : check_again ? (check_again = 0, p = 0)
1065                               : p++) {
1066                 struct production *pr = g->productions[p];
1067                 int b;
1068
1069                 for (b = pr->body_size - 1; b >= 0; b--) {
1070                         struct symbol *bs = pr->body[b];
1071                         if (bs->type == Terminal)
1072                                 break;
1073                         if (symset_union(&g->follow[bs->num],
1074                                          &g->follow[pr->head->num]))
1075                                 check_again = 1;
1076                         if (!bs->nullable)
1077                                 break;
1078                 }
1079         }
1080
1081 We now just need to create and initialise the `follow` list to get a
1082 complete function.
1083
1084 ###### functions
1085         static void build_follow(struct grammar *g)
1086         {
1087                 int s, check_again, p;
1088                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1089                 for (s = 0; s < g->num_syms; s++)
1090                         g->follow[s] = INIT_SYMSET;
1091                 ## follow code
1092         }
1093
1094 ### Building itemsets and states
1095
1096 There are three different levels of detail that can be chosen for
1097 building the itemsets and states for the LR grammar.  They are:
1098
1099 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1100    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1101    already computed,
1102 2. LALR(1) where we build look-ahead sets with each item and merge
1103    the LA sets when we find two paths to the same "kernel" set of items.
1104 3. LR(1) where different look-ahead for any item in the set means
1105    a different item set must be created.
1106
1107 ###### forward declarations
1108         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1109
1110 We need to be able to look through existing states to see if a newly
1111 generated state already exists.  For now we use a simple sorted linked
1112 list.
1113
1114 An item is a pair of numbers: the production number and the position of
1115 "DOT", which is an index into the body of the production.  As the
1116 numbers are not enormous we can combine them into a single "short" and
1117 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1118 production number (so 2000 productions with maximum of 31 symbols in the
1119 body).
1120
1121 Comparing the itemsets will be a little different to comparing symsets
1122 as we want to do the lookup after generating the "kernel" of an
1123 itemset, so we need to ignore the offset=zero items which are added during
1124 completion.
1125
1126 To facilitate this, we modify the "DOT" number so that "0" sorts to
1127 the end of the list in the symset, and then only compare items before
1128 the first "0".
1129
1130 ###### declarations
1131         static inline unsigned short item_num(int production, int dot)
1132         {
1133                 return production | ((31-dot) << 11);
1134         }
1135         static inline int item_prod(unsigned short item)
1136         {
1137                 return item & 0x7ff;
1138         }
1139         static inline int item_dot(unsigned short item)
1140         {
1141                 return (31-(item >> 11)) & 0x1f;
1142         }
1143
1144 For LR(1) analysis we need to compare not just the itemset in a state
1145 but also the LA sets.  As we assign each unique LA set a number, we
1146 can just compare the symset and the data values together.
1147
1148 ###### functions
1149         static int itemset_cmp(struct symset a, struct symset b,
1150                                enum grammar_type type)
1151         {
1152                 int i;
1153                 int av, bv;
1154
1155                 for (i = 0;
1156                      i < a.cnt && i < b.cnt &&
1157                      item_dot(a.syms[i]) > 0 &&
1158                      item_dot(b.syms[i]) > 0;
1159                      i++) {
1160                         int diff = a.syms[i] - b.syms[i];
1161                         if (diff)
1162                                 return diff;
1163                         if (type == LR1) {
1164                                 diff = a.data[i] - b.data[i];
1165                                 if (diff)
1166                                         return diff;
1167                         }
1168                 }
1169                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1170                         av = -1;
1171                 else
1172                         av = a.syms[i];
1173                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1174                         bv = -1;
1175                 else
1176                         bv = b.syms[i];
1177                 if (av - bv)
1178                         return av - bv;
1179                 if (type < LR1 || av == -1)
1180                         return 0;
1181                 return
1182                         a.data[i] - b.data[i];
1183         }
1184
1185 It will be helpful to know if an itemset has been "completed" or not,
1186 particularly for LALR where itemsets get merged, at which point they
1187 need to be consider for completion again.  So a `completed` flag is
1188 needed.
1189
1190 And now we can build the list of itemsets.  The lookup routine returns
1191 both a success flag and a pointer to where in the list an insert should
1192 happen, so we don't need to search a second time.
1193
1194 Each state will often have at most one reducible production, but
1195 occasionally will have more.  If there are more we will need a full
1196 action table (merged in the go to table).  If there is one, we record it
1197 in the state.  If none, we record that too.  These special values will
1198 be needed both in the generator and in the parser, so they need to be
1199 declared twice.
1200
1201 ###### exported types
1202         enum {
1203                 NONE_REDUCIBLE = -1,
1204                 MANY_REDUCIBLE = -2,
1205         };
1206 ###### declarations
1207         enum {
1208                 NONE_REDUCIBLE = -1,
1209                 MANY_REDUCIBLE = -2,
1210         };
1211         struct itemset {
1212                 struct itemset *next;
1213                 short state;
1214                 short reducible_prod; /* or NONE_REDUCIBLE or MANY_REDUCIBLE */
1215                 struct symset items;
1216                 struct symset go_to;
1217                 enum assoc assoc;
1218                 unsigned short precedence;
1219                 char completed;
1220         };
1221
1222 ###### grammar fields
1223         struct itemset *items;
1224         int states;
1225
1226 ###### functions
1227         static int itemset_find(struct grammar *g, struct itemset ***where,
1228                                 struct symset kernel, enum grammar_type type)
1229         {
1230                 struct itemset **ip;
1231
1232                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1233                         struct itemset *i = *ip;
1234                         int diff;
1235                         diff = itemset_cmp(i->items, kernel, type);
1236                         if (diff < 0)
1237                                 continue;
1238                         if (diff > 0)
1239                                 break;
1240                         /* found */
1241                         *where = ip;
1242                         return 1;
1243                 }
1244                 *where = ip;
1245                 return 0;
1246         }
1247
1248 Adding an itemset may require merging the LA sets if LALR analysis is
1249 happening.  If any new LA set adds any symbols that weren't in the old
1250 LA set, we clear the `completed` flag so that the dependants of this
1251 itemset will be recalculated and their LA sets updated.
1252
1253 `add_itemset` must consume the symsets it is passed, either by adding
1254 them to a data structure, of freeing them.
1255
1256         static int add_itemset(struct grammar *g, struct symset ss,
1257                                enum assoc assoc, unsigned short precedence,
1258                                enum grammar_type type)
1259         {
1260                 struct itemset **where, *is;
1261                 int i;
1262                 int found = itemset_find(g, &where, ss, type);
1263                 if (!found) {
1264                         is = calloc(1, sizeof(*is));
1265                         is->state = g->states;
1266                         g->states += 1;
1267                         is->items = ss;
1268                         is->assoc = assoc;
1269                         is->precedence = precedence;
1270                         is->next = *where;
1271                         is->go_to = INIT_DATASET;
1272                         *where = is;
1273                         return is->state;
1274                 }
1275                 is = *where;
1276                 if (type != LALR) {
1277                         symset_free(ss);
1278                         return is->state;
1279                 }
1280                 for (i = 0; i < ss.cnt; i++) {
1281                         struct symset temp = INIT_SYMSET, s;
1282                         if (ss.data[i] == is->items.data[i])
1283                                 continue;
1284                         s = set_find(g, is->items.data[i]);
1285                         symset_union(&temp, &s);
1286                         s = set_find(g, ss.data[i]);
1287                         if (symset_union(&temp, &s)) {
1288                                 is->items.data[i] = save_set(g, temp);
1289                                 is->completed = 0;
1290                         } else
1291                                 symset_free(temp);
1292                 }
1293                 symset_free(ss);
1294                 return is->state;
1295         }
1296
1297 #### The build
1298
1299 To build all the itemsets, we first insert the initial itemset made from
1300 production zero, complete each itemset, and then generate new itemsets
1301 from old until no new ones can be made.
1302
1303 Completing an itemset means finding all the items where "DOT" is
1304 followed by a nonterminal and adding "DOT=0" items for every production
1305 from that non-terminal - providing each item hasn't already been added.
1306 When we add such an item it might get inserted before the current
1307 one, so to make sure we process it we reset the loop counter to the
1308 start.
1309
1310 If LA sets are needed, the LA set for each new item is found using
1311 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1312 may be supplemented by the LA set for the item which produced the new
1313 item.
1314
1315 We also collect a set of all symbols which follow "DOT" (in `done`) as
1316 this is used in the next stage.
1317
1318 When itemsets are created we assign a precedence to the itemset from the
1319 complete item, if there is one.  We ignore the possibility of there
1320 being two and don't (currently) handle precedence in such grammars.
1321 When completing a grammar we ignore any item where DOT is followed by a
1322 terminal with a precedence lower than that for the itemset.  Unless the
1323 terminal has right associativity, we also ignore items where the
1324 terminal has the same precedence.  The result is that unwanted items are
1325 still in the itemset, but the terminal doesn't get into the go to set,
1326 so the item is ineffective.
1327
1328 ###### complete itemset
1329         for (i = 0; i < is->items.cnt; i++) {
1330                 int p = item_prod(is->items.syms[i]);
1331                 int bs = item_dot(is->items.syms[i]);
1332                 struct production *pr = g->productions[p];
1333                 int p2;
1334                 struct symbol *s;
1335                 struct symset LA = INIT_SYMSET;
1336                 unsigned short sn = 0;
1337
1338                 if (i == 0)
1339                         is->reducible_prod = NONE_REDUCIBLE;
1340                 if (bs == pr->body_size) {
1341                         if (is->reducible_prod == NONE_REDUCIBLE)
1342                                 is->reducible_prod = p;
1343                         else
1344                                 is->reducible_prod = MANY_REDUCIBLE;
1345                         continue;
1346                 }
1347                 s = pr->body[bs];
1348                 if (s->precedence && is->precedence &&
1349                     is->precedence > s->precedence)
1350                         /* This terminal has a low precedence and
1351                          * shouldn't be shifted
1352                          */
1353                         continue;
1354                 if (s->precedence && is->precedence &&
1355                     is->precedence == s->precedence && s->assoc != Right)
1356                         /* This terminal has a matching precedence and is
1357                          * not Right-associative, so we mustn't shift it.
1358                          */
1359                         continue;
1360                 if (symset_find(&done, s->num) < 0)
1361                         symset_add(&done, s->num, 0);
1362
1363                 if (s->type != Nonterminal)
1364                         continue;
1365                 check_again = 1;
1366                 if (type >= LALR) {
1367                         // Need the LA set.
1368                         int to_end;
1369                         add_first(pr, bs+1, &LA, g, &to_end);
1370                         if (to_end) {
1371                                 struct symset ss = set_find(g, is->items.data[i]);
1372                                 symset_union(&LA, &ss);
1373                         }
1374                         sn = save_set(g, LA);
1375                         LA = set_find(g, sn);
1376                 }
1377
1378                 /* Add productions for this symbol */
1379                 for (p2 = s->first_production;
1380                      p2 < g->production_count &&
1381                       g->productions[p2]->head == s;
1382                      p2++) {
1383                         int itm = item_num(p2, 0);
1384                         int pos = symset_find(&is->items, itm);
1385                         if (pos < 0) {
1386                                 symset_add(&is->items, itm, sn);
1387                                 /* Will have re-ordered, so start
1388                                  * from beginning again */
1389                                 i = -1;
1390                         } else if (type >= LALR) {
1391                                 struct symset ss = set_find(g, is->items.data[pos]);
1392                                 struct symset tmp = INIT_SYMSET;
1393                                 struct symset *la = &LA;
1394
1395                                 symset_union(&tmp, &ss);
1396                                 if (symset_union(&tmp, la)) {
1397                                         is->items.data[pos] = save_set(g, tmp);
1398                                         i = -1;
1399                                 } else
1400                                         symset_free(tmp);
1401                         }
1402                 }
1403         }
1404
1405 For each symbol we found (and placed in `done`) we collect all the items
1406 for which this symbol is next, and create a new itemset with "DOT"
1407 advanced over the symbol.  This is then added to the collection of
1408 itemsets (or merged with a pre-existing itemset).
1409
1410 ###### derive itemsets
1411         // Now we have a completed itemset, so we need to
1412         // compute all the 'next' itemsets and create them
1413         // if they don't exist.
1414         for (i = 0; i < done.cnt; i++) {
1415                 int j;
1416                 unsigned short state;
1417                 struct symbol *sym = g->symtab[done.syms[i]];
1418                 enum assoc assoc = Non;
1419                 unsigned short precedence = 0;
1420                 struct symset newitemset = INIT_SYMSET;
1421                 if (type >= LALR)
1422                         newitemset = INIT_DATASET;
1423
1424                 for (j = 0; j < is->items.cnt; j++) {
1425                         int itm = is->items.syms[j];
1426                         int p = item_prod(itm);
1427                         int bp = item_dot(itm);
1428                         struct production *pr = g->productions[p];
1429                         unsigned short la = 0;
1430                         int pos;
1431
1432                         if (bp == pr->body_size)
1433                                 continue;
1434                         if (pr->body[bp] != sym)
1435                                 continue;
1436
1437                         bp += 1;
1438                         if (type >= LALR)
1439                                 la = is->items.data[j];
1440                         if (bp == pr->body_size &&
1441                             pr->precedence > 0 &&
1442                             pr->precedence > precedence) {
1443                                 // new itemset is reducible and has a precedence.
1444                                 precedence = pr->precedence;
1445                                 assoc = pr->assoc;
1446                         }
1447                         pos = symset_find(&newitemset, item_num(p, bp));
1448
1449                         if (pos < 0)
1450                                 symset_add(&newitemset, item_num(p, bp), la);
1451                         else if (type >= LALR) {
1452                                 // Need to merge la set.
1453                                 int la2 = newitemset.data[pos];
1454                                 if (la != la2) {
1455                                         struct symset ss = set_find(g, la2);
1456                                         struct symset LA = INIT_SYMSET;
1457                                         symset_union(&LA, &ss);
1458                                         ss = set_find(g, la);
1459                                         if (symset_union(&LA, &ss))
1460                                                 newitemset.data[pos] = save_set(g, LA);
1461                                         else
1462                                                 symset_free(LA);
1463                                 }
1464                         }
1465                 }
1466                 state = add_itemset(g, newitemset, assoc, precedence, type);
1467                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1468                         symset_add(&is->go_to, done.syms[i], state);
1469         }
1470
1471 All that is left is to create the initial itemset from production zero, and
1472 with `TK_eof` as the LA set.
1473
1474 ###### functions
1475         static void build_itemsets(struct grammar *g, enum grammar_type type)
1476         {
1477                 struct symset first = INIT_SYMSET;
1478                 struct itemset *is;
1479                 int check_again;
1480                 unsigned short la = 0;
1481                 if (type >= LALR) {
1482                         // LA set just has eof
1483                         struct symset eof = INIT_SYMSET;
1484                         symset_add(&eof, TK_eof, 0);
1485                         la = save_set(g, eof);
1486                         first = INIT_DATASET;
1487                 }
1488                 // production 0, offset 0 (with no data)
1489                 symset_add(&first, item_num(0, 0), la);
1490                 add_itemset(g, first, Non, 0, type);
1491                 for (check_again = 0, is = g->items;
1492                      is;
1493                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1494                         int i;
1495                         struct symset done = INIT_SYMSET;
1496                         if (is->completed)
1497                                 continue;
1498                         is->completed = 1;
1499                         check_again = 1;
1500                         ## complete itemset
1501                         ## derive itemsets
1502                         symset_free(done);
1503                 }
1504         }
1505
1506 ### Completing the analysis.
1507
1508 The exact process of analysis depends on which level was requested.  For
1509 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1510 `LR1` we need first, but not follow.  For `SLR` we need both.
1511
1512 We don't build the "action" tables that you might expect as the parser
1513 doesn't use them.  They will be calculated to some extent if needed for
1514 a report.
1515
1516 Once we have built everything we allocate arrays for the two lists:
1517 symbols and itemsets.  This allows more efficient access during
1518 reporting.  The symbols are grouped as terminals, then non-terminals,
1519 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1520 Special terminals -- meaning just EOL -- are included with the
1521 non-terminals so that they are not expected by the scanner.
1522
1523 ###### grammar fields
1524         struct symbol **symtab;
1525         struct itemset **statetab;
1526         int first_nonterm;
1527
1528 ###### grammar_analyse
1529
1530         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1531         {
1532                 struct symbol *s;
1533                 struct itemset *is;
1534                 int snum = TK_reserved;
1535                 for (s = g->syms; s; s = s->next)
1536                         if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1537                                 s->num = snum;
1538                                 snum++;
1539                         }
1540                 g->first_nonterm = snum;
1541                 for (s = g->syms; s; s = s->next)
1542                         if (s->num < 0 && s->type != Virtual) {
1543                                 s->num = snum;
1544                                 snum++;
1545                         }
1546                 for (s = g->syms; s; s = s->next)
1547                         if (s->num < 0) {
1548                                 s->num = snum;
1549                                 snum++;
1550                         }
1551                 g->num_syms = snum;
1552                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1553                 for (s = g->syms; s; s = s->next)
1554                         g->symtab[s->num] = s;
1555
1556                 set_nullable(g);
1557                 if (type >= SLR)
1558                         build_first(g);
1559
1560                 if (type == SLR)
1561                         build_follow(g);
1562
1563                 build_itemsets(g, type);
1564
1565                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1566                 for (is = g->items; is ; is = is->next)
1567                         g->statetab[is->state] = is;
1568         }
1569
1570 ## Reporting on the Grammar
1571
1572 The purpose of the report is to give the grammar developer insight into
1573 how the grammar parser will work.  It is basically a structured dump of
1574 all the tables that have been generated, plus a description of any conflicts.
1575
1576 ###### grammar_report
1577         static int grammar_report(struct grammar *g, enum grammar_type type)
1578         {
1579                 report_symbols(g);
1580                 if (g->follow)
1581                         report_follow(g);
1582                 report_itemsets(g);
1583                 return report_conflicts(g, type);
1584         }
1585
1586 Firstly we have the complete list of symbols, together with the
1587 "FIRST" set if that was generated.  We add a mark to each symbol to
1588 show if it is nullable (`.`).
1589
1590 ###### functions
1591
1592         static void report_symbols(struct grammar *g)
1593         {
1594                 int n;
1595                 if (g->first)
1596                         printf("SYMBOLS + FIRST:\n");
1597                 else
1598                         printf("SYMBOLS:\n");
1599
1600                 for (n = 0; n < g->num_syms; n++) {
1601                         struct symbol *s = g->symtab[n];
1602                         if (!s)
1603                                 continue;
1604
1605                         printf(" %c%3d%c: ",
1606                                s->nullable ? '.':' ',
1607                                s->num, symtypes[s->type]);
1608                         prtxt(s->name);
1609                         if (s->precedence)
1610                                 printf(" (%d%s)", s->precedence,
1611                                        assoc_names[s->assoc]);
1612
1613                         if (g->first && s->type == Nonterminal) {
1614                                 int j;
1615                                 char c = ':';
1616                                 for (j = 0; j < g->first[n].cnt; j++) {
1617                                         printf("%c ", c);
1618                                         c = ',';
1619                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1620                                 }
1621                         }
1622                         printf("\n");
1623                 }
1624         }
1625
1626 Then we have the follow sets if they were computed.
1627
1628         static void report_follow(struct grammar *g)
1629         {
1630                 int n;
1631                 printf("FOLLOW:\n");
1632                 for (n = 0; n < g->num_syms; n++)
1633                         if (g->follow[n].cnt) {
1634                                 int j;
1635                                 char c = ':';
1636                                 printf("  ");
1637                                 prtxt(g->symtab[n]->name);
1638                                 for (j = 0; j < g->follow[n].cnt; j++) {
1639                                         printf("%c ", c);
1640                                         c = ',';
1641                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1642                                 }
1643                                 printf("\n");
1644                         }
1645         }
1646
1647 And finally the item sets.  These include the GO TO tables and, for
1648 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1649 it up a bit.  First the items, with production number and associativity.
1650
1651         static void report_item(struct grammar *g, int itm)
1652         {
1653                 int p = item_prod(itm);
1654                 int dot = item_dot(itm);
1655                 struct production *pr = g->productions[p];
1656                 int i;
1657
1658                 printf("    ");
1659                 prtxt(pr->head->name);
1660                 printf(" ->");
1661                 for (i = 0; i < pr->body_size; i++) {
1662                         printf(" %s", (dot == i ? ". ": ""));
1663                         prtxt(pr->body[i]->name);
1664                 }
1665                 if (dot == pr->body_size)
1666                         printf(" .");
1667                 printf(" [%d]", p);
1668                 if (pr->precedence && dot == pr->body_size)
1669                         printf(" (%d%s)", pr->precedence,
1670                                assoc_names[pr->assoc]);
1671                 if (dot < pr->body_size &&
1672                     pr->body[dot]->precedence) {
1673                         struct symbol *s = pr->body[dot];
1674                         printf(" [%d%s]", s->precedence,
1675                                assoc_names[s->assoc]);
1676                 }
1677                 printf("\n");
1678         }
1679
1680 The LA sets which are (possibly) reported with each item:
1681
1682         static void report_la(struct grammar *g, int lanum)
1683         {
1684                 struct symset la = set_find(g, lanum);
1685                 int i;
1686                 char c = ':';
1687
1688                 printf("        LOOK AHEAD(%d)", lanum);
1689                 for (i = 0; i < la.cnt; i++) {
1690                         printf("%c ", c);
1691                         c = ',';
1692                         prtxt(g->symtab[la.syms[i]]->name);
1693                 }
1694                 printf("\n");
1695         }
1696
1697 Then the go to sets:
1698
1699         static void report_goto(struct grammar *g, struct symset gt)
1700         {
1701                 int i;
1702                 printf("    GOTO:\n");
1703
1704                 for (i = 0; i < gt.cnt; i++) {
1705                         printf("      ");
1706                         prtxt(g->symtab[gt.syms[i]]->name);
1707                         printf(" -> %d\n", gt.data[i]);
1708                 }
1709         }
1710
1711 Now we can report all the item sets complete with items, LA sets, and GO TO.
1712
1713         static void report_itemsets(struct grammar *g)
1714         {
1715                 int s;
1716                 printf("ITEM SETS(%d)\n", g->states);
1717                 for (s = 0; s < g->states; s++) {
1718                         int j;
1719                         struct itemset *is = g->statetab[s];
1720                         printf("  Itemset %d:", s);
1721                         if (is->precedence)
1722                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1723                         printf("\n");
1724                         for (j = 0; j < is->items.cnt; j++) {
1725                                 report_item(g, is->items.syms[j]);
1726                                 if (is->items.data != NO_DATA)
1727                                         report_la(g, is->items.data[j]);
1728                         }
1729                         report_goto(g, is->go_to);
1730                 }
1731         }
1732
1733 ### Reporting conflicts
1734
1735 Conflict detection varies a lot among different analysis levels.  However
1736 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1737 LR1 are also similar as they have FOLLOW or LA sets.
1738
1739 ###### functions
1740
1741         ## conflict functions
1742
1743         static int report_conflicts(struct grammar *g, enum grammar_type type)
1744         {
1745                 int cnt = 0;
1746                 printf("Conflicts:\n");
1747                 if (type < SLR)
1748                         cnt = conflicts_lr0(g, type);
1749                 else
1750                         cnt = conflicts_slr(g, type);
1751                 if (cnt == 0)
1752                         printf(" - no conflicts\n");
1753                 return cnt;
1754         }
1755
1756 LR0 conflicts are any state which have both a reducible item and
1757 a shiftable item, or two reducible items.
1758
1759 LR05 conflicts only occur if two possible reductions exist,
1760 as shifts always over-ride reductions.
1761
1762 ###### conflict functions
1763         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1764         {
1765                 int i;
1766                 int cnt = 0;
1767                 for (i = 0; i < g->states; i++) {
1768                         struct itemset *is = g->statetab[i];
1769                         int last_reduce = -1;
1770                         int prev_reduce = -1;
1771                         int last_shift = -1;
1772                         int j;
1773                         if (!is)
1774                                 continue;
1775                         for (j = 0; j < is->items.cnt; j++) {
1776                                 int itm = is->items.syms[j];
1777                                 int p = item_prod(itm);
1778                                 int bp = item_dot(itm);
1779                                 struct production *pr = g->productions[p];
1780
1781                                 if (bp == pr->body_size) {
1782                                         prev_reduce = last_reduce;
1783                                         last_reduce = j;
1784                                         continue;
1785                                 }
1786                                 if (pr->body[bp]->type == Terminal)
1787                                         last_shift = j;
1788                         }
1789                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1790                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1791                                 report_item(g, is->items.syms[last_shift]);
1792                                 report_item(g, is->items.syms[last_reduce]);
1793                                 cnt++;
1794                         }
1795                         if (prev_reduce >= 0) {
1796                                 printf("  State %d has 2 (or more) reducible items\n", i);
1797                                 report_item(g, is->items.syms[prev_reduce]);
1798                                 report_item(g, is->items.syms[last_reduce]);
1799                                 cnt++;
1800                         }
1801                 }
1802                 return cnt;
1803         }
1804
1805 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1806 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1807 in the source of the look ahead set.
1808
1809 We build two datasets to reflect the "action" table: one which maps
1810 terminals to items where that terminal could be shifted and another
1811 which maps terminals to items that could be reduced when the terminal
1812 is in look-ahead.  We report when we get conflicts between the two.
1813
1814         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1815         {
1816                 int i;
1817                 int cnt = 0;
1818
1819                 for (i = 0; i < g->states; i++) {
1820                         struct itemset *is = g->statetab[i];
1821                         struct symset shifts = INIT_DATASET;
1822                         struct symset reduce = INIT_DATASET;
1823                         int j;
1824                         if (!is)
1825                                 continue;
1826                         /* First collect the shifts */
1827                         for (j = 0; j < is->items.cnt; j++) {
1828                                 unsigned short itm = is->items.syms[j];
1829                                 int p = item_prod(itm);
1830                                 int bp = item_dot(itm);
1831                                 struct production *pr = g->productions[p];
1832                                 struct symbol *s;
1833
1834                                 if (bp >= pr->body_size ||
1835                                     pr->body[bp]->type != Terminal)
1836                                         /* not shiftable */
1837                                         continue;
1838
1839                                 s = pr->body[bp];
1840                                 if (s->precedence && is->precedence)
1841                                         /* Precedence resolves this, so no conflict */
1842                                         continue;
1843
1844                                 if (symset_find(&shifts, s->num) < 0)
1845                                         symset_add(&shifts, s->num, itm);
1846                         }
1847                         /* Now look for reductions and conflicts */
1848                         for (j = 0; j < is->items.cnt; j++) {
1849                                 unsigned short itm = is->items.syms[j];
1850                                 int p = item_prod(itm);
1851                                 int bp = item_dot(itm);
1852                                 struct production *pr = g->productions[p];
1853
1854                                 if (bp < pr->body_size)
1855                                         continue;
1856                                 /* reducible */
1857                                 struct symset la;
1858                                 if (type == SLR)
1859                                         la = g->follow[pr->head->num];
1860                                 else
1861                                         la = set_find(g, is->items.data[j]);
1862                                 int k;
1863                                 for (k = 0; k < la.cnt; k++) {
1864                                         int pos = symset_find(&shifts, la.syms[k]);
1865                                         if (pos >= 0) {
1866                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1867                                                 cnt++;
1868                                                         prtxt(g->symtab[la.syms[k]]->name);
1869                                                 printf(":\n");
1870                                                 report_item(g, shifts.data[pos]);
1871                                                 report_item(g, itm);
1872                                         }
1873                                         pos = symset_find(&reduce, la.syms[k]);
1874                                         if (pos < 0) {
1875                                                 symset_add(&reduce, la.syms[k], itm);
1876                                                 continue;
1877                                         }
1878                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1879                                         prtxt(g->symtab[la.syms[k]]->name);
1880                                         printf(":\n");
1881                                         report_item(g, itm);
1882                                         report_item(g, reduce.data[pos]);
1883                                         cnt++;
1884                                 }
1885                         }
1886                         symset_free(shifts);
1887                         symset_free(reduce);
1888                 }
1889                 return cnt;
1890         }
1891
1892
1893 ## Generating the parser
1894
1895 The exported part of the parser is the `parse_XX` function, where the name
1896 `XX` is based on the name of the parser files.
1897
1898 This takes a `code_node`, a partially initialized `token_config`, and an
1899 optional `FILE` to send tracing to.  The `token_config` gets the list of
1900 known words added and then is used with the `code_node` to initialize the
1901 scanner.
1902
1903 `parse_XX` then calls the library function `parser_run` to actually
1904 complete the parse.  This needs the `states` table, the `reductions`
1905 table and functions to call the various pieces of code provided in the
1906 grammar file, so they are generated first.
1907
1908 ###### parser_generate
1909
1910         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1911                                struct code_node *pre_reduce)
1912         {
1913                 gen_known(f, g);
1914                 gen_non_term(f, g);
1915                 gen_action(f, g);
1916                 gen_goto(f, g);
1917                 gen_states(f, g);
1918                 gen_reductions(f, g);
1919                 gen_reduce(f, g, file, pre_reduce);
1920                 gen_free(f, g);
1921
1922                 fprintf(f, "#line 0 \"gen_parser\"\n");
1923                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1924                         name);
1925                 fprintf(f, "{\n");
1926                 fprintf(f, "\tstruct token_state *tokens;\n");
1927                 fprintf(f, "\tconfig->words_marks = known;\n");
1928                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1929                 fprintf(f, "\ttokens = token_open(code, config);\n");
1930                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n");
1931                 fprintf(f, "\ttoken_close(tokens);\n");
1932                 fprintf(f, "\treturn rv;\n");
1933                 fprintf(f, "}\n\n");
1934         }
1935
1936 ### Known words table
1937
1938 The known words table is simply an array of terminal symbols.
1939 The table of nonterminals used for tracing is a similar array.
1940
1941 ###### functions
1942
1943         static void gen_known(FILE *f, struct grammar *g)
1944         {
1945                 int i;
1946                 fprintf(f, "#line 0 \"gen_known\"\n");
1947                 fprintf(f, "static const char *known[] = {\n");
1948                 for (i = TK_reserved;
1949                      i < g->num_syms && g->symtab[i]->type == Terminal;
1950                      i++)
1951                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1952                                 g->symtab[i]->name.txt);
1953                 fprintf(f, "};\n\n");
1954         }
1955
1956         static void gen_non_term(FILE *f, struct grammar *g)
1957         {
1958                 int i;
1959                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1960                 fprintf(f, "static const char *non_term[] = {\n");
1961                 for (i = TK_reserved;
1962                      i < g->num_syms;
1963                      i++)
1964                         if (g->symtab[i]->type == Nonterminal ||
1965                             g->symtab[i]->isspecial)
1966                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1967                                         g->symtab[i]->name.txt);
1968                 fprintf(f, "};\n\n");
1969         }
1970
1971 ### The Action table
1972
1973 For this parser we do not have a separate action table.  The action
1974 table normally maps terminals to one of the actions SHIFT, REDUCE(p), or
1975 ACCEPT.  We can find which terminals to SHIFT via a lookup in the go to
1976 table, and most states have at most one reducible production and in that
1977 case to effect that reduction for any non-shiftable terminal in the
1978 look-ahead.
1979
1980 However, with SLR, LALR, and canonical-LR grammars, there may
1981 occasionally be multiple possible reductions which we choose between
1982 based on the look-ahead.  To handle this possibility we allow reductions
1983 to be stored in the go to table.  An extra flag indicates if an entry is
1984 a state to go to, or a production to reduce.  We add the entries with
1985 that flag here, if necessary.
1986
1987 ###### functions
1988         #define IS_REDUCTION (0x8000)
1989
1990         static void gen_action(FILE *f, struct grammar *g)
1991         {
1992                 int i, j, k;
1993                 for (i = 0; i < g->states; i++) {
1994                         struct itemset *is = g->statetab[i];
1995                         struct symset *gt = &is->go_to;
1996
1997                         if (is->reducible_prod != MANY_REDUCIBLE)
1998                                 continue;
1999                         for (j = 0; j < is->items.cnt; j++) {
2000                                 int itm = is->items.syms[j];
2001                                 int p = item_prod(itm);
2002                                 int bp = item_dot(itm);
2003                                 struct production *pr = g->productions[p];
2004                                 struct symset la;
2005                                 if (bp != pr->body_size)
2006                                         continue;
2007                                 /* This is a reducible item */
2008                                 if (g->follow)
2009                                         la = g->follow[pr->head->num];
2010                                 else
2011                                         la = set_find(g, is->items.data[j]);
2012                                 for (k = 0 ; k < la.cnt; k++)
2013                                         symset_add(gt, la.syms[k], p | IS_REDUCTION);
2014                         }
2015                 }
2016         }
2017
2018 ### States, reductions, and the go to tables.
2019
2020 For each state we record the go to table and the reducible production if
2021 there is one, the details of which are in a separate table of
2022 reductions.  Some of the details of the reducible production are stored
2023 in the `do_reduce` function to come later.  In the go to table we store
2024 the production number and in the reductions table: the body size (useful
2025 for stack management), the resulting symbol (useful for knowing how to
2026 free data later), and the size of the resulting asn object (useful for
2027 preallocation space.
2028
2029 The go to table is stored in a simple array of `sym` and corresponding
2030 `state`.
2031
2032 ###### exported types
2033
2034         struct lookup {
2035                 short sym;
2036                 short state_or_reduction:15;
2037                 unsigned short is_reduction:1;
2038         };
2039         struct reduction {
2040                 short size;
2041                 short sym;
2042                 short result_size;
2043         };
2044         struct state {
2045                 short reduce_prod;      // or NONE_REDUCIBLE or MANY_REDUCIBLE
2046                 short go_to_cnt;
2047                 const struct lookup * go_to; // includes reductions
2048         };
2049
2050 ###### functions
2051
2052         static void gen_goto(FILE *f, struct grammar *g)
2053         {
2054                 int i;
2055                 fprintf(f, "#line 0 \"gen_goto\"\n");
2056                 for (i = 0; i < g->states; i++) {
2057                         struct symset gt = g->statetab[i]->go_to;
2058                         int j;
2059
2060                         if (gt.cnt == 0)
2061                                 continue;
2062                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2063                                 i);
2064                         for (j = 0; j < gt.cnt; j++) {
2065                                 if (gt.data[j] & IS_REDUCTION)
2066                                         fprintf(f, "\t{ %d, %d, 1 },\n",
2067                                                 gt.syms[j], gt.data[j]&~IS_REDUCTION);
2068                                 else
2069                                         fprintf(f, "\t{ %d, %d, 0 },\n",
2070                                                 gt.syms[j], gt.data[j]);
2071                         }
2072                         fprintf(f, "};\n");
2073                 }
2074         }
2075
2076         static void gen_reductions(FILE *f, struct grammar *g)
2077         {
2078                 int i;
2079                 fprintf(f, "#line 0 \"gen_reductions\"\n");
2080                 fprintf(f, "static const struct reduction reductions[] = {\n");
2081                 for (i = 0; i < g->production_count; i++) {
2082                         struct production *pr = g->productions[i];
2083                         struct symbol *hd = pr->head;
2084                         fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num);
2085                         if (hd->struct_name.txt == NULL)
2086                                 fprintf(f, "0 },\n");
2087                         else
2088                                 fprintf(f, "sizeof(struct %.*s%s) },\n",
2089                                         hd->struct_name.len,
2090                                         hd->struct_name.txt,
2091                                         hd->isref ? "*" : "");
2092                 }
2093                 fprintf(f, "};\n\n");
2094         }
2095
2096         static void gen_states(FILE *f, struct grammar *g)
2097         {
2098                 int i;
2099                 fprintf(f, "#line 0 \"gen_states\"\n");
2100                 fprintf(f, "static const struct state states[] = {\n");
2101                 for (i = 0; i < g->states; i++) {
2102                         struct itemset *is = g->statetab[i];
2103                         int prod = is->reducible_prod;
2104
2105                         if (is->go_to.cnt)
2106                                 fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
2107                                         i, prod, is->go_to.cnt, i);
2108                         else
2109                                 fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
2110                 }
2111                 fprintf(f, "};\n\n");
2112         }
2113
2114 ### The `do_reduce` function and the code
2115
2116 When the parser engine decides to reduce a production, it calls
2117 `do_reduce` which runs the code that was included with the production,
2118 if any.
2119
2120 This code needs to be able to store data somewhere so we record the size
2121 of the data expected with each state so it can be allocated before
2122 `do_reduce` is called.
2123
2124 In order for the code to access "global" context, we pass in the
2125 "config" pointer that was passed to the parser function.  If the `struct
2126 token_config` is embedded in some larger structure, the reducing code
2127 can access the larger structure using pointer manipulation.  Performing
2128 that pointer manipulation, and any other common code or variables for
2129 all reduce actions, can be provided in code node called "reduce" which
2130 is passed around in `pre_reduce` which you might have already noticed.
2131
2132 The code fragments require translation when written out.  Any `$N` needs
2133 to be converted to a reference either to that buffer (if `$0`) or to the
2134 structure returned by a previous reduction.  These pointers need to be
2135 cast to the appropriate type for each access.  All this is handled in
2136 `gen_code`.
2137
2138 `gen_code` also allows symbol references to contain a '`<`' as in
2139 '`$<2`'.  This is particularly useful for references (or pointers), but
2140 can be used with structures too.  The `<` implies that the value is
2141 being moved out, so the object will not be automatically freed.  It is
2142 equivalent to assigning `NULL` to the pointer or filling a structure
2143 with zeros.
2144
2145 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2146 and possibly a number.  A number by itself (other than zero) selects a
2147 symbol from the body of the production.  A sequence of letters selects
2148 the shortest symbol in the body which contains those letters in the
2149 given order.  If a number follows the letters, then a later occurrence
2150 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2151 attached to the second occurrence of the shortest symbol which contains
2152 an `A` followed by a `B`.  If there is no unique shortest system, or if
2153 the number given is too large, then the symbol reference is not
2154 transformed, and will cause an error when the code is compiled.
2155
2156 ###### functions
2157
2158         static int textchr(struct text t, char c, int s)
2159         {
2160                 int i;
2161                 for (i = s; i < t.len; i++)
2162                         if (t.txt[i] == c)
2163                                 return i;
2164                 return -1;
2165         }
2166
2167         static int subseq_match(char *seq, int slen, struct text name)
2168         {
2169                 int st = 0;
2170                 while (slen > 0) {
2171                         st = textchr(name, *seq, st);
2172                         if (st < 0)
2173                                 return 0;
2174                         slen -= 1;
2175                         seq += 1;
2176                         st += 1;
2177                 }
2178                 return 1;
2179         }
2180
2181         static int choose_sym(char **namep, int len, struct production *p)
2182         {
2183                 char *name = *namep;
2184                 char *nam = name;
2185                 int namlen;
2186                 int n = 0;
2187                 int i, s, slen;
2188                 char c;
2189
2190                 c = *name;
2191                 while (len > 0 &&
2192                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2193                         name += 1;
2194                         len -= 1;
2195                         c = *name;
2196                 }
2197                 namlen = name-nam;
2198                 while (len > 0 && (c >= '0' && c <= '9')) {
2199                         name += 1;
2200                         len -= 1;
2201                         n = n * 10 + (c - '0');
2202                         c = *name;
2203                 }
2204                 if (namlen == 0) {
2205                         if (name == *namep || n > p->body_size)
2206                                 return -1;
2207                         *namep = name;
2208                         return n;
2209                 }
2210                 slen = 0; s = -1;
2211                 for (i = 0; i < p->body_size; i++) {
2212                         if (!subseq_match(nam, namlen, p->body[i]->name))
2213                                 continue;
2214                         if (slen == 0 || p->body[i]->name.len < slen) {
2215                                 s = i;
2216                                 slen = p->body[i]->name.len;
2217                         }
2218                         if (s >= 0 && p->body[i] != p->body[s] &&
2219                             p->body[i]->name.len == p->body[s]->name.len)
2220                                 /* not unique, so s cannot be used */
2221                                 s = -1;
2222                 }
2223                 if (s < 0)
2224                         return -1;
2225                 if (n == 0)
2226                         n = 1;
2227                 for (i = 0; i < p->body_size; i++)
2228                         if (p->body[i] == p->body[s]) {
2229                                 n -= 1;
2230                                 if (n == 0)
2231                                         break;
2232                         }
2233                 if (n > 0 || i == p->body_size)
2234                         return -1;
2235                 *namep = name;
2236                 return i + 1;
2237         }
2238
2239         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2240         {
2241                 char *c;
2242                 char *used = calloc(1, p->body_size);
2243                 int i;
2244
2245                 fprintf(f, "\t\t\t");
2246                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2247                         int n;
2248                         int use = 0;
2249                         if (*c != '$') {
2250                                 fputc(*c, f);
2251                                 if (*c == '\n')
2252                                         fputs("\t\t\t", f);
2253                                 continue;
2254                         }
2255                         c++;
2256                         if (*c == '<') {
2257                                 use = 1;
2258                                 c++;
2259                         }
2260                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2261                         if (n < 0) {
2262                                 fputc('$', f);
2263                                 if (use)
2264                                         fputc('<', f);
2265                                 fputc(*c, f);
2266                                 continue;
2267                         }
2268                         if (n == 0)
2269                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2270                                         p->head->struct_name.len,
2271                                         p->head->struct_name.txt,
2272                                         p->head->isref ? "*":"");
2273                         else if (p->body[n-1]->type == Terminal)
2274                                 fprintf(f, "(*(struct token *)body[%d])",
2275                                         n-1);
2276                         else if (p->body[n-1]->struct_name.txt == NULL)
2277                                 fprintf(f, "$%d", n);
2278                         else {
2279                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2280                                         p->body[n-1]->struct_name.len,
2281                                         p->body[n-1]->struct_name.txt,
2282                                         p->body[n-1]->isref ? "*":"", n-1);
2283                                 used[n-1] = use;
2284                         }
2285                         c -= 1;
2286                 }
2287                 fputs("\n", f);
2288                 for (i = 0; i < p->body_size; i++) {
2289                         if (p->body[i]->struct_name.txt &&
2290                             used[i]) {
2291                                 // assume this has been copied out
2292                                 if (p->body[i]->isref)
2293                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2294                                 else
2295                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2296                         }
2297                 }
2298                 free(used);
2299         }
2300
2301 ###### functions
2302
2303         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2304                                struct code_node *pre_reduce)
2305         {
2306                 int i;
2307                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2308                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2309                 fprintf(f, "{\n");
2310                 fprintf(f, "\tint ret_size = 0;\n");
2311                 if (pre_reduce)
2312                         code_node_print(f, pre_reduce, file);
2313
2314                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2315                 fprintf(f, "\tswitch(prod) {\n");
2316                 for (i = 0; i < g->production_count; i++) {
2317                         struct production *p = g->productions[i];
2318                         fprintf(f, "\tcase %d:\n", i);
2319
2320                         if (p->code.txt) {
2321                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2322                                 gen_code(p, f, g);
2323                         }
2324
2325                         if (p->head->struct_name.txt)
2326                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2327                                         p->head->struct_name.len,
2328                                         p->head->struct_name.txt,
2329                                         p->head->isref ? "*":"");
2330
2331                         fprintf(f, "\t\tbreak;\n");
2332                 }
2333                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2334         }
2335
2336 ### `do_free`
2337
2338 As each non-terminal can potentially cause a different type of data
2339 structure to be allocated and filled in, we need to be able to free it when
2340 done.
2341
2342 It is particularly important to have fine control over freeing during error
2343 recovery where individual stack frames might need to be freed.
2344
2345 For this, the grammar author is required to defined a `free_XX` function for
2346 each structure that is used by a non-terminal.  `do_free` will call whichever
2347 is appropriate given a symbol number, and will call `free` (as is
2348 appropriate for tokens) on any terminal symbol.
2349
2350 ###### functions
2351
2352         static void gen_free(FILE *f, struct grammar *g)
2353         {
2354                 int i;
2355
2356                 fprintf(f, "#line 0 \"gen_free\"\n");
2357                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2358                 fprintf(f, "{\n");
2359                 fprintf(f, "\tif (!asn) return;\n");
2360                 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2361                 /* Need to handle special terminals too */
2362                 for (i = 0; i < g->num_syms; i++) {
2363                         struct symbol *s = g->symtab[i];
2364                         if (i >= g->first_nonterm && s->type == Terminal &&
2365                             s->isspecial)
2366                                 fprintf(f, " || sym == %d", s->num);
2367                 }
2368                 fprintf(f, ") {\n");
2369                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2370                 fprintf(f, "\tswitch(sym) {\n");
2371
2372                 for (i = 0; i < g->num_syms; i++) {
2373                         struct symbol *s = g->symtab[i];
2374                         if (!s ||
2375                             s->type != Nonterminal ||
2376                             s->struct_name.txt == NULL)
2377                                 continue;
2378
2379                         fprintf(f, "\tcase %d:\n", s->num);
2380                         if (s->isref) {
2381                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2382                                         s->struct_name.len,
2383                                         s->struct_name.txt);
2384                                 fprintf(f, "\t\tfree(asn);\n");
2385                         } else
2386                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2387                                         s->struct_name.len,
2388                                         s->struct_name.txt);
2389                         fprintf(f, "\t\tbreak;\n");
2390                 }
2391                 fprintf(f, "\t}\n}\n\n");
2392         }
2393
2394 ## The main routine.
2395
2396 There are three key parts to the "main" function of parsergen: processing
2397 the arguments, loading the grammar file, and dealing with the grammar.
2398
2399 ### Argument processing.
2400
2401 Command line options allow the selection of analysis mode, name of output
2402 file, and whether or not a report should be generated.  By default we create
2403 a report only if no code output was requested.
2404
2405 The `parse_XX` function name uses the basename of the output file which
2406 should not have a suffix (`.c`).  `.c` is added to the given name for the
2407 code, and `.h` is added for the header (if header text is specifed in the
2408 grammar file).
2409
2410 ###### includes
2411         #include <getopt.h>
2412
2413 ###### declarations
2414         static const struct option long_options[] = {
2415                 { "LR0",        0, NULL, '0' },
2416                 { "LR05",       0, NULL, '5' },
2417                 { "SLR",        0, NULL, 'S' },
2418                 { "LALR",       0, NULL, 'L' },
2419                 { "LR1",        0, NULL, '1' },
2420                 { "tag",        1, NULL, 't' },
2421                 { "report",     0, NULL, 'R' },
2422                 { "output",     1, NULL, 'o' },
2423                 { NULL,         0, NULL, 0   }
2424         };
2425         const char *options = "05SL1t:Ro:";
2426
2427 ###### process arguments
2428         int opt;
2429         char *outfile = NULL;
2430         char *infile;
2431         char *name;
2432         char *tag = NULL;
2433         int report = 1;
2434         enum grammar_type type = LR05;
2435         while ((opt = getopt_long(argc, argv, options,
2436                                   long_options, NULL)) != -1) {
2437                 switch(opt) {
2438                 case '0':
2439                         type = LR0; break;
2440                 case '5':
2441                         type = LR05; break;
2442                 case 'S':
2443                         type = SLR; break;
2444                 case 'L':
2445                         type = LALR; break;
2446                 case '1':
2447                         type = LR1; break;
2448                 case 'R':
2449                         report = 2; break;
2450                 case 'o':
2451                         outfile = optarg; break;
2452                 case 't':
2453                         tag = optarg; break;
2454                 default:
2455                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2456                         exit(1);
2457                 }
2458         }
2459         if (optind < argc)
2460                 infile = argv[optind++];
2461         else {
2462                 fprintf(stderr, "No input file given\n");
2463                 exit(1);
2464         }
2465         if (outfile && report == 1)
2466                 report = 0;
2467         name = outfile;
2468         if (name && strchr(name, '/'))
2469                 name = strrchr(name, '/')+1;
2470
2471         if (optind < argc) {
2472                 fprintf(stderr, "Excess command line arguments\n");
2473                 exit(1);
2474         }
2475
2476 ### Loading the grammar file
2477
2478 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2479 map it.
2480
2481 Once we have extracted the code (with `mdcode`) we expect to find three
2482 or four sections: header, code, grammar, reduce.
2483 Anything else that is not excluded by the `--tag` option is an error.
2484
2485 "header", "code", and "reduce" are optional, though it is hard to build
2486 a working parser without the first two.  "grammar" must be provided.
2487
2488 ###### includes
2489         #include <fcntl.h>
2490         #include <sys/mman.h>
2491         #include <errno.h>
2492
2493 ###### functions
2494         static int errs;
2495         static void pr_err(char *msg)
2496         {
2497                 errs++;
2498                 fprintf(stderr, "%s\n", msg);
2499         }
2500
2501 ###### load file
2502         struct section *table;
2503         int fd;
2504         int len;
2505         char *file;
2506         fd = open(infile, O_RDONLY);
2507         if (fd < 0) {
2508                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2509                         infile, strerror(errno));
2510                 exit(1);
2511         }
2512         len = lseek(fd, 0, 2);
2513         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2514         table = code_extract(file, file+len, pr_err);
2515
2516         struct code_node *hdr = NULL;
2517         struct code_node *code = NULL;
2518         struct code_node *gram = NULL;
2519         struct code_node *pre_reduce = NULL;
2520         for (s = table; s; s = s->next) {
2521                 struct text sec = s->section;
2522                 if (tag && !strip_tag(&sec, tag))
2523                         continue;
2524                 if (text_is(sec, "header"))
2525                         hdr = s->code;
2526                 else if (text_is(sec, "code"))
2527                         code = s->code;
2528                 else if (text_is(sec, "grammar"))
2529                         gram = s->code;
2530                 else if (text_is(sec, "reduce"))
2531                         pre_reduce = s->code;
2532                 else {
2533                         fprintf(stderr, "Unknown content section: %.*s\n",
2534                                 s->section.len, s->section.txt);
2535                         rv |= 2;
2536                 }
2537         }
2538
2539 ### Processing the input
2540
2541 As we need to append an extention to a filename and then open it for
2542 writing, and we need to do this twice, it helps to have a separate function.
2543
2544 ###### functions
2545
2546         static FILE *open_ext(char *base, char *ext)
2547         {
2548                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2549                 FILE *f;
2550                 strcat(strcpy(fn, base), ext);
2551                 f = fopen(fn, "w");
2552                 free(fn);
2553                 return f;
2554         }
2555
2556 If we can read the grammar, then we analyse and optionally report on it.  If
2557 the report finds conflicts we will exit with an error status.
2558
2559 ###### process input
2560         struct grammar *g = NULL;
2561         if (gram == NULL) {
2562                 fprintf(stderr, "No grammar section provided\n");
2563                 rv |= 2;
2564         } else {
2565                 g = grammar_read(gram);
2566                 if (!g) {
2567                         fprintf(stderr, "Failure to parse grammar\n");
2568                         rv |= 2;
2569                 }
2570         }
2571         if (g) {
2572                 grammar_analyse(g, type);
2573                 if (report)
2574                         if (grammar_report(g, type))
2575                                 rv |= 1;
2576         }
2577
2578 If a "headers" section is defined, we write it out and include a declaration
2579 for the `parse_XX` function so it can be used from separate code.
2580
2581         if (rv == 0 && hdr && outfile) {
2582                 FILE *f = open_ext(outfile, ".h");
2583                 if (f) {
2584                         code_node_print(f, hdr, infile);
2585                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2586                                 name);
2587                         fclose(f);
2588                 } else {
2589                         fprintf(stderr, "Cannot create %s.h\n",
2590                                 outfile);
2591                         rv |= 4;
2592                 }
2593         }
2594
2595 And if all goes well and an output file was provided, we create the `.c`
2596 file with the code section (if any) and the parser tables and function.
2597
2598         if (rv == 0 && outfile) {
2599                 FILE *f = open_ext(outfile, ".c");
2600                 if (f) {
2601                         if (code)
2602                                 code_node_print(f, code, infile);
2603                         gen_parser(f, g, infile, name, pre_reduce);
2604                         fclose(f);
2605                 } else {
2606                         fprintf(stderr, "Cannot create %s.c\n",
2607                                 outfile);
2608                         rv |= 4;
2609                 }
2610         }
2611
2612 And that about wraps it up.  We need to set the locale so that UTF-8 is
2613 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2614
2615 ###### File: parsergen.mk
2616         parsergen : parsergen.o libscanner.o libmdcode.o
2617                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2618
2619 ###### includes
2620         #include <locale.h>
2621
2622 ###### main
2623
2624         int main(int argc, char *argv[])
2625         {
2626                 struct section *s;
2627                 int rv = 0;
2628
2629                 setlocale(LC_ALL,"");
2630
2631                 ## process arguments
2632                 ## load file
2633                 ## process input
2634
2635                 return rv;
2636         }
2637
2638 ## The SHIFT/REDUCE parser
2639
2640 Having analysed the grammar and generated all the tables, we only need
2641 the shift/reduce engine to bring it all together.
2642
2643 ### Go to table lookup
2644
2645 The parser generator has nicely provided us with go to tables sorted by
2646 symbol number.  We need a binary search function to find a symbol in the
2647 table.
2648
2649 ###### parser functions
2650
2651         static int search(const struct state *l, int sym, int reduction)
2652         {
2653                 int lo = 0;
2654                 int hi = l->go_to_cnt;
2655
2656                 if (hi == 0)
2657                         return -1;
2658                 while (lo + 1 < hi) {
2659                         int mid = (lo + hi) / 2;
2660                         if (l->go_to[mid].sym <= sym)
2661                                 lo = mid;
2662                         else
2663                                 hi = mid;
2664                 }
2665                 if (l->go_to[lo].sym != sym)
2666                         return -1;
2667                 if (!reduction == !l->go_to[lo].is_reduction)
2668                         return l->go_to[lo].state_or_reduction;
2669                 return -1;
2670         }
2671
2672 ### Memory allocation
2673
2674 The `scanner` returns tokens in a local variable - we want them in allocated
2675 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2676 make an allocated copy as required.
2677
2678 ###### parser includes
2679         #include <memory.h>
2680
2681 ###### parser functions
2682
2683         static struct token *tok_copy(struct token tk)
2684         {
2685                 struct token *new = malloc(sizeof(*new));
2686                 *new = tk;
2687                 return new;
2688         }
2689
2690 ### The state stack.
2691
2692 The core data structure for the parser is the stack.  This tracks all
2693 the symbols that have been recognised or partially recognised.
2694
2695 The stack usually won't grow very large - maybe a few tens of entries.
2696 So we dynamically grow an array as required but never bother to
2697 shrink it down again.
2698
2699 We keep the stack as two separate allocations.  One, `asn_stack`, stores
2700 the "abstract syntax nodes" which are created by each reduction.  When
2701 we call `do_reduce` we need to pass an array of the `asn`s of the body
2702 of the production, and by keeping a separate `asn` stack, we can just
2703 pass a pointer into this stack.
2704
2705 The other allocation stores all other stack fields of which there are
2706 two.  The `state` is the most important one and guides the parsing
2707 process.  The `sym` is nearly unnecessary as it is implicit in the
2708 state.  However when we want to free entries from the `asn_stack`, it
2709 helps to know what type they are so we can call the right freeing
2710 function.  The symbol leads us to the right free function through
2711 `do_free`.
2712
2713 The stack is most properly seen as alternating states and symbols -
2714 states, like the 'DOT' in items, are between symbols.  Each frame in
2715 our stack holds a state and the symbol that was before it.  The
2716 bottom of stack holds the start state but no symbol, as nothing came
2717 before the beginning.  As we need to store some value, `TK_eof` is used
2718 to mark the beginning of the file as well as the end.
2719
2720 ###### parser functions
2721
2722         struct parser {
2723                 struct frame {
2724                         short state;
2725                         short sym;
2726                 } *stack;
2727                 void **asn_stack;
2728                 int stack_size;
2729                 int tos;
2730
2731                 ## parser state
2732         };
2733
2734 #### Shift and pop
2735
2736 Two operations are needed on the stack - shift (which is like push) and pop.
2737
2738 Shift applies not only to terminals but also to non-terminals.  When we
2739 reduce a production we will pop off frames corresponding to the body
2740 symbols, then push on a frame for the head of the production.  This last
2741 is exactly the same process as shifting in a terminal so we use the same
2742 function for both.  In both cases we provide the symbol.  The state is
2743 deduced from the current top-of-stack state and the new symbol.
2744
2745 To simplify other code we arrange for `shift` to fail if there is no `go to`
2746 state for the symbol.  This is useful in basic parsing due to our design
2747 that we shift when we can, and reduce when we cannot.  So the `shift`
2748 function reports if it could.
2749
2750 `shift` is also used to push state zero onto the stack, so if the
2751 stack is empty, it always chooses zero as the next state.
2752
2753 So `shift` finds the next state.  If that succeeds it extends the
2754 allocations if needed and pushes all the information onto the stacks.
2755
2756 ###### parser functions
2757
2758         static int shift(struct parser *p,
2759                          short sym, void *asn,
2760                          const struct state states[])
2761         {
2762                 struct frame next = {0};
2763                 int newstate = p->tos
2764                         ? search(&states[p->stack[p->tos-1].state],
2765                                  sym, 0)
2766                         : 0;
2767                 if (newstate < 0)
2768                         return 0;
2769
2770                 if (p->tos >= p->stack_size) {
2771                         p->stack_size += 10;
2772                         p->stack = realloc(p->stack, p->stack_size
2773                                            * sizeof(p->stack[0]));
2774                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2775                                            * sizeof(p->asn_stack[0]));
2776                 }
2777                 next.sym = sym;
2778                 next.state = newstate;
2779
2780                 p->stack[p->tos] = next;
2781                 p->asn_stack[p->tos] = asn;
2782                 p->tos++;
2783                 return 1;
2784         }
2785
2786 `pop` primarily moves the top of stack (`tos`) back down the required
2787 amount and frees any `asn` entries that need to be freed.  It is called
2788 _after_ we reduce a production, just before we `shift` the nonterminal
2789 in.
2790
2791 ###### parser functions
2792
2793         static void pop(struct parser *p, int num,
2794                         void(*do_free)(short sym, void *asn))
2795         {
2796                 int i;
2797
2798                 p->tos -= num;
2799                 for (i = 0; i < num; i++)
2800                         do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2801         }
2802
2803 ### The heart of the parser.
2804
2805 Now we have the parser.  For each token we might shift it, trigger a
2806 reduction, or start error handling.  2D tokens (IN, OUT, NEWLINE, EOL)
2807 might also be ignored.  Ignoring tokens is combined with shifting.
2808
2809 ###### parser vars
2810
2811         struct parser p = { 0 };
2812         struct token *tk = NULL;
2813         int accepted = 0;
2814
2815 ###### heart of parser
2816
2817         shift(&p, TK_eof, NULL, states);
2818         while (!accepted && p.tos > 0) {
2819                 struct frame *tos = &p.stack[p.tos-1];
2820                 if (!tk)
2821                         tk = tok_copy(token_next(tokens));
2822                 parser_trace(trace, &p,
2823                              tk, states, non_term, config->known_count);
2824
2825                 ## try shift or ignore
2826                 ## try reduce
2827                 ## handle error
2828         }
2829
2830 Indents are ignored unless they can be shifted onto the stack
2831 immediately or nothing can be shifted (in which case we reduce, and try
2832 again).  The end of an indented section - the OUT token - is ignored
2833 precisely when the indent was ignored.  To keep track of this we need a
2834 small stack of flags, which is easily stored as bits in an `unsigned
2835 long`.  This will never overflow and the scanner only allows 20 levels
2836 of indentation.
2837
2838 ###### parser state
2839         unsigned long ignored_indents;
2840
2841 NEWLINE is ignored when in an indented section of text which was not
2842 explicitly expected by the grammar.  So if the most recent indent is
2843 ignored, so is any NEWLINE token.
2844
2845 If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL
2846 token instead.  If that succeeds, we make a new copy of the NEWLINE
2847 token and continue.  This allows a NEWLINE to appear to be preceded by
2848 an indefinite number of EOL tokens.
2849
2850 The token number for `EOL` cannot be statically declared, so when the
2851 parser starts we need to look through the array of non-terminals to find
2852 the EOL.
2853
2854 ###### parser state
2855         int tk_eol;
2856
2857 ###### find eol
2858         p.tk_eol = 0;
2859         while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2860                 p.tk_eol += 1;
2861         p.tk_eol += TK_reserved + config->known_count;
2862
2863 For other tokens, we shift the next token if that is possible, otherwise
2864 we try to reduce a production.
2865
2866 ###### try shift or ignore
2867
2868         if ((tk->num == TK_newline || tk->num == TK_out) &&
2869             (p.ignored_indents & 1)) {
2870                 /* indented, so ignore OUT and NEWLINE */
2871                 if (tk->num == TK_out)
2872                         p.ignored_indents >>= 1;
2873                 free(tk);
2874                 tk = NULL;
2875                 parser_trace_action(trace, "Ignore");
2876                 continue;
2877         }
2878
2879         if (shift(&p, tk->num, tk, states)) {
2880                 if (tk->num == TK_out)
2881                         p.ignored_indents >>= 1;
2882                 if (tk->num == TK_in)
2883                         p.ignored_indents <<= 1;
2884
2885                 parser_trace_action(trace, "Shift");
2886                 tk = NULL;
2887                 ## did shift
2888                 continue;
2889         } else if (tk->num == TK_newline &&
2890                    shift(&p, p.tk_eol, tk, states)) {
2891                 tk = tok_copy(*tk);
2892                 parser_trace_action(trace, "ShiftEOL");
2893                 continue;
2894         }
2895
2896         if (tk->num == TK_in) {
2897                 int should_reduce;
2898                 if (states[tos->state].reduce_prod == MANY_REDUCIBLE)
2899                         /* Only reduce if TK_in in lookahead */
2900                         should_reduce = (search(&states[tos->state], TK_in, 1) >= 0);
2901                 else
2902                         /* Only reduce if we cannot shift anything */
2903                         should_reduce = (states[tos->state].go_to_cnt == 0);
2904                 if (!should_reduce) {
2905                         /* No indent expected here and reduce is not indicated,
2906                          * so ignore IN
2907                          */
2908                         free(tk);
2909                         tk = NULL;
2910                         p.ignored_indents <<= 1;
2911                         p.ignored_indents |= 1;
2912                         parser_trace_action(trace, "Ignore");
2913                         continue;
2914                 }
2915         }
2916
2917 We have already discussed the bulk of the handling of a "reduce" action,
2918 with the `pop()` and `shift()` functions doing much of the work.  There
2919 is a little more complexity needed to manage storage for the asn (Abstract
2920 Syntax Node), and also a test of whether the reduction is permitted.
2921
2922 When we try to shift the result of reducing production-zero, it will
2923 fail because there is no next state.  In this case the asn will not have
2924 been stored on the stack, so it get stored in the `ret` variable, and we
2925 report that that input has been accepted.
2926
2927 ###### parser vars
2928
2929         void *ret = NULL;
2930         int reduction;
2931
2932 ###### try reduce
2933
2934         reduction = states[tos->state].reduce_prod;
2935         if (reduction == MANY_REDUCIBLE)
2936                 reduction = search(&states[tos->state], tk->num, 1);
2937         if (reduction >= 0) {
2938                 void **body;
2939                 void *res;
2940                 int size = reductions[reduction].size;
2941                 int res_size = reductions[reduction].result_size;
2942
2943                 body = p.asn_stack + (p.tos - size);
2944                 res = res_size ? calloc(1, res_size) : NULL;
2945                 res_size = do_reduce(reduction, body, config, res);
2946                 if (res_size != reductions[reduction].result_size)
2947                         abort();
2948                 pop(&p, size, do_free);
2949                 if (!shift(&p, reductions[reduction].sym, res, states)) {
2950                         accepted = 1;
2951                         ret = res;
2952                         parser_trace_action(trace, "Accept");
2953                 } else
2954                         parser_trace_action(trace, "Reduce");
2955                 continue;
2956         }
2957
2958 If we can neither shift nor reduce we have an error to handle.  There
2959 are two possible responses to an error: we can pop single frames off the
2960 stack until we find a frame that can shift `TK_error`, or we can discard
2961 the current look-ahead symbol.
2962
2963 When we first see an error we do the first of these and set a flag to
2964 record that we are processing an error.  If the normal shift/reduce
2965 tests fail to find that anything can be done from that state, we start
2966 dropping tokens until either we manage to shift one, or reach end-of-file.
2967
2968 ###### parser vars
2969
2970         int in_err = 0;
2971
2972 ###### did shift
2973
2974         in_err = 0;
2975
2976 ###### handle error
2977
2978         if (in_err) {
2979                 parser_trace_action(trace, "DISCARD");
2980                 if (tk->num == TK_eof)
2981                         break;
2982                 free(tk);
2983                 tk = NULL;
2984         } else {
2985                 struct token *err_tk;
2986
2987                 parser_trace_action(trace, "ERROR");
2988
2989                 err_tk = tok_copy(*tk);
2990                 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2991                         // discard this state
2992                         pop(&p, 1, do_free);
2993                 if (p.tos == 0) {
2994                         free(err_tk);
2995                         // no state accepted TK_error
2996                         break;
2997                 }
2998                 in_err = 1;
2999         }
3000
3001 ###### parser includes
3002         #include "parser.h"
3003
3004 ###### parser_run
3005
3006         void *parser_run(struct token_state *tokens,
3007                          const struct state states[],
3008                          const struct reduction reductions[],
3009                          int (*do_reduce)(int, void**, struct token_config*, void*),
3010                          void (*do_free)(short, void*),
3011                          FILE *trace, const char *non_term[],
3012                          struct token_config *config)
3013         {
3014                 ## parser vars
3015
3016                 ## find eol
3017
3018                 ## heart of parser
3019
3020                 free(tk);
3021                 pop(&p, p.tos, do_free);
3022                 free(p.asn_stack);
3023                 free(p.stack);
3024                 return ret;
3025         }
3026
3027 ###### exported functions
3028         void *parser_run(struct token_state *tokens,
3029                          const struct state states[],
3030                          const struct reduction reductions[],
3031                          int (*do_reduce)(int, void**, struct token_config*, void*),
3032                          void (*do_free)(short, void*),
3033                          FILE *trace, const char *non_term[],
3034                          struct token_config *config);
3035
3036 ### Tracing
3037
3038 Being able to visualize the parser in action can be invaluable when
3039 debugging the parser code, or trying to understand how the parse of a
3040 particular grammar progresses.  The stack contains all the important
3041 state, so just printing out the stack every time around the parse loop
3042 can make it possible to see exactly what is happening.
3043
3044 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3045 are easily deduced from the change between consecutive lines, and the
3046 details of each state can be found by cross referencing the states list
3047 in the stack with the "report" that parsergen can generate.
3048
3049 For terminal symbols, we just dump the token.  For non-terminals we
3050 print the name of the symbol.  The look ahead token is reported at the
3051 end inside square brackets.
3052
3053 ###### parser functions
3054
3055         static char *reserved_words[] = {
3056                 [TK_error]        = "ERROR",
3057                 [TK_in]           = "IN",
3058                 [TK_out]          = "OUT",
3059                 [TK_newline]      = "NEWLINE",
3060                 [TK_eof]          = "$eof",
3061         };
3062
3063         void parser_trace(FILE *trace, struct parser *p,
3064                           struct token *tk, const struct state states[],
3065                           const char *non_term[], int knowns)
3066         {
3067                 int i;
3068                 if (!trace)
3069                         return;
3070                 for (i = 0; i < p->tos; i++) {
3071                         struct frame *f = &p->stack[i];
3072                         if (i) {
3073                                 int sym = f->sym;
3074                                 if (sym < TK_reserved &&
3075                                     reserved_words[sym] != NULL)
3076                                         fputs(reserved_words[sym], trace);
3077                                 else if (sym < TK_reserved + knowns) {
3078                                         struct token *t = p->asn_stack[i];
3079                                         text_dump(trace, t->txt, 20);
3080                                 } else
3081                                         fputs(non_term[sym - TK_reserved - knowns],
3082                                               trace);
3083                                 fputs(" ", trace);
3084                         }
3085                         fprintf(trace, "(%d) ", f->state);
3086                 }
3087                 fprintf(trace, "[");
3088                 if (tk->num < TK_reserved &&
3089                     reserved_words[tk->num] != NULL)
3090                         fputs(reserved_words[tk->num], trace);
3091                 else
3092                         text_dump(trace, tk->txt, 20);
3093                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3094         }
3095
3096         void parser_trace_action(FILE *trace, char *action)
3097         {
3098                 if (trace)
3099                         fprintf(trace, " - %s\n", action);
3100         }
3101
3102 # A Worked Example
3103
3104 The obvious example for a parser is a calculator.
3105
3106 As `scanner` provides number parsing function using `libgmp` it is not much
3107 work to perform arbitrary rational number calculations.
3108
3109 This calculator takes one expression, or an equality test, per line.
3110 The results are printed and if any equality test fails, the program
3111 exits with an error.
3112
3113 ###### File: parsergen.mk
3114         calc.c calc.h : parsergen parsergen.mdc
3115                 ./parsergen --tag calc -o calc parsergen.mdc
3116         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3117                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3118         calctest : calc
3119                 ./calc parsergen.mdc
3120         demos :: calctest
3121
3122 # calc: header
3123
3124         #include "parse_number.h"
3125         // what do we use for a demo-grammar?  A calculator of course.
3126         struct number {
3127                 mpq_t val;
3128                 char tail[2];
3129                 int err;
3130         };
3131
3132 # calc: code
3133
3134         #include <stdlib.h>
3135         #include <unistd.h>
3136         #include <fcntl.h>
3137         #include <sys/mman.h>
3138         #include <stdio.h>
3139         #include <malloc.h>
3140         #include <gmp.h>
3141         #include <string.h>
3142         #include "mdcode.h"
3143         #include "scanner.h"
3144         #include "parser.h"
3145
3146         #include "calc.h"
3147
3148         static void free_number(struct number *n)
3149         {
3150                 mpq_clear(n->val);
3151                 free(n);
3152         }
3153
3154         static int text_is(struct text t, char *s)
3155         {
3156                 return (strlen(s) == t.len &&
3157                         strncmp(s, t.txt, t.len) == 0);
3158         }
3159
3160         int main(int argc, char *argv[])
3161         {
3162                 int fd = open(argv[1], O_RDONLY);
3163                 int len = lseek(fd, 0, 2);
3164                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3165                 struct section *table = code_extract(file, file+len, NULL);
3166                 struct section *s;
3167                 struct token_config config = {
3168                         .ignored = (1 << TK_line_comment)
3169                                  | (1 << TK_in)
3170                                  | (1 << TK_out),
3171                         .number_chars = ".,_+-",
3172                         .word_start = "",
3173                         .word_cont = "",
3174                 };
3175                 for (s = table; s; s = s->next)
3176                         if (text_is(s->section, "example: input"))
3177                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3178                 while (table) {
3179                         struct section *t = table->next;
3180                         code_free(table->code);
3181                         free(table);
3182                         table = t;
3183                 }
3184                 exit(0);
3185         }
3186
3187 # calc: grammar
3188
3189         $LEFT + -
3190         $LEFT * / //
3191
3192         Session -> Session Line
3193                 | Line
3194
3195         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3196                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3197                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3198                                         mpf_clear(fl);
3199                                         }
3200                                      }$
3201                 | Expression = Expression NEWLINE ${
3202                         if (mpq_equal($1.val, $3.val))
3203                                 gmp_printf("Both equal %Qd\n", $1.val);
3204                         else {
3205                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3206                                         $1.val, $3.val);
3207                                 exit(1);
3208                         }
3209                 }$
3210                 | NEWLINE ${ printf("Blank line\n"); }$
3211                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3212
3213         $number
3214         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3215                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3216                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3217                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3218                 | Expression // Expression ${ {
3219                         mpz_t z0, z1, z2;
3220                         mpq_init($0.val);
3221                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3222                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3223                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3224                         mpz_tdiv_q(z0, z1, z2);
3225                         mpq_set_z($0.val, z0);
3226                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3227                 } }$
3228                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3229                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3230
3231 # example: input
3232
3233         355/113
3234         3.1415926535 - 355/113
3235         2 + 4 * 5
3236         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3237         10 * 9 / 2
3238         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3239
3240         355//113
3241
3242         error
3243
3244 ## A second example
3245
3246 I had originally thought that the LR05 style grammar would suit me there
3247 should never be a sequence of token that means either of two different
3248 things depending on what comes follows it.  Without that ambiguity, the
3249 extra strength of LALR, or even SLR, would not be needed.  However I
3250 discovered this was too simplistic.  There are times when it is quite
3251 reasonable for the empty string, or maybe a NEWLINE, to have different
3252 meanings depending on the next token.  For this, LR05 is not sufficient,
3253 so I added full action-table support.
3254
3255 This example tests the selection of REDUCE actions based on the
3256 look-ahead symbol, using a trivial grammar where the empty string can
3257 reasonably mean different things depending on what it precedes.
3258
3259
3260 ####### File: parsergen.mk
3261         demos :: do_lalr_demo
3262         lalr_demo.c lalr_demo.h : parsergen parsergen.mdc
3263                 ./parsergen --LALR --tag demo -o lalr_demo parsergen.mdc
3264         lalr_demo: lalr_demo.o libparser.o libscanner.o libmdcode.o
3265                 $(CC) -o lalr_demo lalr_demo.c libparser.o libscanner.o libmdcode.o -licuuc
3266         do_lalr_demo: lalr_demo
3267                 NOTRACE=1 ./lalr_demo
3268                 @echo 'Should produce "start of line, empty sign, empty sigl"'
3269
3270 ####### demo: grammar
3271
3272         $TERM $ @ + -
3273         line -> ${ printf("start of line"); }$
3274                 | line term
3275         term -> sigl IDENTIFIER
3276                 | sign NUMBER
3277         sigl ->   $
3278                 | @
3279                 | ${ printf(", empty sigl"); }$
3280         sign ->   +
3281                 | -
3282                 | ${ printf(", empty sign"); }$
3283
3284 ####### demo: header
3285
3286         //
3287
3288 ####### demo: code
3289
3290         #include <stdlib.h>
3291         #include <stdio.h>
3292         #include "mdcode.h"
3293         #include "scanner.h"
3294         #include "parser.h"
3295         #include "lalr_demo.h"
3296         int main(int argc, char *argv[])
3297         {
3298                 char test[] = "````\n$a -1 5 b\n";
3299                 struct section *table = code_extract(test, test+sizeof(test)-1, NULL);
3300                 struct token_config config = {
3301                         .ignored = (1 << TK_line_comment)
3302                                  | (1 << TK_newline)
3303                                  | (1 << TK_in)
3304                                  | (1 << TK_out),
3305                         .number_chars = "",
3306                         .word_start = "",
3307                         .word_cont = "",
3308                 };
3309                 parse_lalr_demo(table->code, &config, getenv("NOTRACE") ? NULL : stderr);
3310                 printf("\n");
3311                 exit(0);
3312         }