ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8
   9 There are several distinct sections.
  10
  11 1. `grammar_read` will read a grammar from a literate-code file and
  12    store the productions, symbols, and code fragments.
  13 2. `grammar_analyse` will take that grammar and build LR parsing
  14    states and look-ahead tables.
  15 3. `grammar_report` will present the details of the analysis
  16    in a readable format and will report any conflicts.
  17 4. `parser_generate` will write out C code files with various
  18    tables and with the code fragments arranged in useful places.
  19 5. `parser_run` is a library function intended to be linked together
  20    with the generated parser tables to complete the implementation of
  21    a parser.
  22 6. Finally `calc` is a test grammar for a simple calculator.  The
  23    `parsergen` program built from the C code in this file can extract
  24    that grammar directly from this file and process it.
  25
  26 ###### File: parsergen.c
  27         #include <unistd.h>
  28         #include <stdlib.h>
  29         #include <stdio.h>
  30         ## includes
  31         ## forward declarations
  32         ## declarations
  33         ## functions
  34         ## grammar_read
  35         ## grammar_analyse
  36         ## grammar_report
  37         ## parser_generate
  38         ## main
  39 ###### File: parser.h
  40         ## exported types
  41         ## exported functions
  42 ###### File: libparser.c
  43         #include <unistd.h>
  44         #include <stdlib.h>
  45         #include <stdio.h>
  46         ## parser includes
  47         ## parser functions
  48         ## parser_run
  49 ###### File: parsergen.mk
  50         CFLAGS += -Wall -g
  51         all :: parsergen calc
  52         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  53                 ./md2c parsergen.mdc
  54
  55 ## Reading the grammar
  56
  57 The grammar must be stored in a markdown literate code file as parsed by
  58 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  59 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  60 present, will be literally copied into the generated `.c` and `.h`
  61 files.  The third is required and contains the grammar which is
  62 tokenised with "[scanner][]".  `reduce` can provide variable
  63 declarations and common intialization code for all the reduction code
  64 fragments in the grammar.
  65
  66 If the `--tag` option is given, then any top level header that doesn't
  67 start with the tag is ignored, and the tag is striped from the rest.  So
  68 `--tag Foo` means that the four sections expected are `Foo: header`,
  69 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  70 to extract the sample calculator from this file.
  71
  72 [mdcode]: mdcode.html
  73 [scanner]: scanner.html
  74
  75 ###### includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 ###### parser includes
  80         #include "mdcode.h"
  81         #include "scanner.h"
  82
  83 The grammar contains production sets, precedence/associativity
  84 declarations, general terminal declarations, and data type declarations.
  85 These are all parsed with _ad hoc_ parsing as we don't have a parser
  86 generator yet.
  87
  88 The precedence and associativity can be set for each production, and
  89 can be inherited from symbols.  The data type (either structure or a
  90 reference to a structure) is potentially defined for each non-terminal
  91 and describes what C structure is used to store information about each
  92 symbol.
  93
  94 ###### declarations
  95         enum assoc {Left, Right, Non};
  96         char *assoc_names[] = {"Left","Right","Non"};
  97
  98         struct symbol {
  99                 struct text struct_name;
 100                 int isref;
 101                 enum assoc assoc;
 102                 unsigned short precedence;
 103                 ## symbol fields
 104         };
 105         struct production {
 106                 unsigned short precedence;
 107                 enum assoc assoc;
 108                 char line_like;
 109                 ## production fields
 110         };
 111         struct grammar {
 112                 ## grammar fields
 113         };
 114
 115 The strings reported by `mdcode` and `scanner` are `struct text` which
 116 have length rather than being nul terminated.  To help with printing and
 117 comparing we define `text_is` and `prtxt`, which should possibly go in
 118 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 119 strings which might contain control characters.
 120
 121 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 122 because that is what we need to detect tags.
 123
 124 ###### functions
 125         static int text_is(struct text t, char *s)
 126         {
 127                 return (strlen(s) == t.len &&
 128                         strncmp(s, t.txt, t.len) == 0);
 129         }
 130         static void prtxt(struct text t)
 131         {
 132                 printf("%.*s", t.len, t.txt);
 133         }
 134
 135         static int strip_tag(struct text *t, char *tag)
 136         {
 137                 int skip = strlen(tag) + 1;
 138                 if (skip >= t->len ||
 139                     strncmp(t->txt, tag, skip-1) != 0 ||
 140                     t->txt[skip-1] != ':')
 141                         return 0;
 142                 while (skip < t->len && t->txt[skip] == ' ')
 143                         skip++;
 144                 t->len -= skip;
 145                 t->txt += skip;
 146                 return 1;
 147         }
 148
 149 ### Symbols
 150
 151 Productions are comprised primarily of symbols - terminal and
 152 non-terminal.  We do not make a syntactic distinction between the two,
 153 though non-terminals must be identifiers.  Non-terminal symbols are
 154 those which appear in the head of a production, terminal symbols are
 155 those which don't.  There are also "virtual" symbols used for precedence
 156 marking discussed later, and sometimes we won't know what type a symbol
 157 is yet.
 158
 159 To help with code safety it is possible to declare the terminal symbols.
 160 If this is done, then any symbol used in a production that does not
 161 appear in the head of any production and is not declared as a terminal
 162 is treated as an error.
 163
 164 ###### forward declarations
 165         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 166         char *symtypes = "UVTN";
 167 ###### symbol fields
 168         enum symtype type;
 169 ###### grammar fields
 170         int terminals_declared;
 171
 172 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 173 table of known symbols and the resulting parser will report them as
 174 `TK_reserved + N`.  A small set of identifiers are reserved for the
 175 different token types that `scanner` can report.
 176
 177 ###### declarations
 178
 179         static struct { int num; char *name; } reserved_words[] = {
 180                 { TK_error,        "ERROR" },
 181                 { TK_number,       "NUMBER" },
 182                 { TK_ident,        "IDENTIFIER" },
 183                 { TK_mark,         "MARK" },
 184                 { TK_string,       "STRING" },
 185                 { TK_multi_string, "MULTI_STRING" },
 186                 { TK_in,           "IN" },
 187                 { TK_out,          "OUT" },
 188                 { TK_newline,      "NEWLINE" },
 189                 { TK_eof,          "$eof" },
 190         };
 191 ###### symbol fields
 192         short num;
 193
 194 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 195 recognised.  The former is automatically expected at the end of the text
 196 being parsed. The latter are always ignored by the parser.
 197
 198 All of these symbols are stored in a simple symbol table.  We use the
 199 `struct text` from `mdcode` to store the name and link them together
 200 into a sorted list using an insertion sort.
 201
 202 We don't have separate `find` and `insert` functions as any symbol we
 203 find needs to be remembered.  We simply expect `find` to always return a
 204 symbol, but its type might be `Unknown`.
 205
 206 ###### includes
 207         #include <string.h>
 208
 209 ###### symbol fields
 210         struct text name;
 211         struct symbol *next;
 212
 213 ###### grammar fields
 214         struct symbol *syms;
 215         int num_syms;
 216
 217 ###### functions
 218         static struct symbol *sym_find(struct grammar *g, struct text s)
 219         {
 220                 struct symbol **l = &g->syms;
 221                 struct symbol *n;
 222                 int cmp = 1;
 223
 224                 while (*l &&
 225                         (cmp = text_cmp((*l)->name, s)) < 0)
 226                                 l = & (*l)->next;
 227                 if (cmp == 0)
 228                         return *l;
 229                 n = calloc(1, sizeof(*n));
 230                 n->name = s;
 231                 n->type = Unknown;
 232                 n->next = *l;
 233                 n->num = -1;
 234                 *l = n;
 235                 return n;
 236         }
 237
 238         static void symbols_init(struct grammar *g)
 239         {
 240                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 241                 int i;
 242                 for (i = 0; i < entries; i++) {
 243                         struct text t;
 244                         struct symbol *s;
 245                         t.txt = reserved_words[i].name;
 246                         t.len = strlen(t.txt);
 247                         s = sym_find(g, t);
 248                         s->type = Terminal;
 249                         s->num = reserved_words[i].num;
 250                 }
 251         }
 252
 253 ### Data types and precedence.
 254
 255 Data type specification, precedence specification, and declaration of
 256 terminals are all introduced by a dollar sign at the start of the line.
 257 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 258 precedence, if it is `TERM` then the line declares terminals without
 259 precedence, otherwise it specifies a data type.
 260
 261 The data type name is simply stored and applied to the head of all
 262 subsequent productions.  It must be the name of a structure optionally
 263 preceded by an asterisk which means a reference or "pointer".  So
 264 `$expression` maps to `struct expression` and `$*statement` maps to
 265 `struct statement *`.
 266
 267 Any productions given before the first data type declaration will have
 268 no data type associated with them and can carry no information.  In
 269 order to allow other non-terminals to have no type, the data type
 270 `$void` can be given.  This does *not* mean that `struct void` will be
 271 used, but rather than no type will be associated with subsequent
 272 non-terminals.
 273
 274 The precedence line must contain a list of symbols, either terminal
 275 symbols or virtual symbols.  It can only contain symbols that have not
 276 been seen yet, so precedence declaration must precede the productions
 277 that they affect.
 278
 279 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 280 carry precedence information but are not included in the grammar.  A
 281 production can declare that it inherits the precedence of a given
 282 virtual symbol.
 283
 284 This use for `$$` precludes it from being used as a symbol in the
 285 described language.  Two other symbols: `${` and `}$` are also
 286 unavailable.
 287
 288 Each new precedence line introduces a new precedence level and
 289 declares how it associates.  This level is stored in each symbol
 290 listed and may be inherited by any production which uses the symbol.  A
 291 production inherits from the last symbol which has a precedence.
 292
 293 The symbols on the first precedence line have the lowest precedence.
 294 Subsequent lines introduce symbols with higher precedence and so bind
 295 more tightly.
 296
 297 Note that a declaration line (or "dollar line") can start with either of
 298 two different marks: "$" or "$*".  The `dollar_line()` function defined
 299 here is told which was found in the `isref` argument.
 300
 301 ###### grammar fields
 302         struct text current_type;
 303         int type_isref;
 304         int prec_levels;
 305
 306 ###### declarations
 307         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 308         static const char *known[] = { "$$", "${", "}$" };
 309
 310 ###### functions
 311         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 312         {
 313                 struct token t = token_next(ts);
 314                 char *err;
 315                 enum assoc assoc;
 316                 int term = 0;
 317                 int found;
 318
 319                 if (t.num != TK_ident) {
 320                         err = "type or assoc expected after '$'";
 321                         goto abort;
 322                 }
 323                 if (text_is(t.txt, "LEFT"))
 324                         assoc = Left;
 325                 else if (text_is(t.txt, "RIGHT"))
 326                         assoc = Right;
 327                 else if (text_is(t.txt, "NON"))
 328                         assoc = Non;
 329                 else if (text_is(t.txt, "TERM")) {
 330                         term = 1;
 331                         g->terminals_declared = 1;
 332                 } else {
 333                         g->current_type = t.txt;
 334                         g->type_isref = isref;
 335                         if (text_is(t.txt, "void"))
 336                                 g->current_type.txt = NULL;
 337                         t = token_next(ts);
 338                         if (t.num != TK_newline) {
 339                                 err = "Extra tokens after type name";
 340                                 goto abort;
 341                         }
 342                         return NULL;
 343                 }
 344
 345                 if (isref) {
 346                         err = "$* cannot be followed by a precedence";
 347                         goto abort;
 348                 }
 349
 350                 // This is a precedence or TERM line, need some symbols.
 351                 found = 0;
 352                 g->prec_levels += 1;
 353                 t = token_next(ts);
 354                 while (t.num != TK_newline) {
 355                         enum symtype type = Terminal;
 356                         struct symbol *s;
 357                         if (t.num == TK_virtual) {
 358                                 type = Virtual;
 359                                 t = token_next(ts);
 360                                 if (t.num != TK_ident) {
 361                                         err = "$$ must be followed by a word";
 362                                         goto abort;
 363                                 }
 364                                 if (term) {
 365                                         err = "Virtual symbols not permitted on $TERM line";
 366                                         goto abort;
 367                                 }
 368                         } else if (t.num != TK_ident &&
 369                                    t.num != TK_mark) {
 370                                 err = "Illegal token in precedence line";
 371                                 goto abort;
 372                         }
 373                         s = sym_find(g, t.txt);
 374                         if (s->type != Unknown) {
 375                                 err = "Symbols in precedence/TERM line must not already be known.";
 376                                 goto abort;
 377                         }
 378                         s->type = type;
 379                         if (!term) {
 380                                 s->precedence = g->prec_levels;
 381                                 s->assoc = assoc;
 382                         }
 383                         found += 1;
 384                         t = token_next(ts);
 385                 }
 386                 if (found == 0)
 387                         err = "No symbols given on precedence/TERM line";
 388                         goto abort;
 389                 return NULL;
 390         abort:
 391                 while (t.num != TK_newline && t.num != TK_eof)
 392                         t = token_next(ts);
 393                 return err;
 394         }
 395
 396 ### Productions
 397
 398 A production either starts with an identifier which is the head
 399 non-terminal, or a vertical bar (`|`) in which case this production
 400 uses the same head as the previous one.  The identifier must be
 401 followed by a `->` mark.  All productions for a given non-terminal must
 402 be grouped together with the `nonterminal ->` given only once.
 403
 404 After this a (possibly empty) sequence of identifiers and marks form
 405 the body of the production.  A virtual symbol may be given after the
 406 body by preceding it with `$$`.  If a virtual symbol is given, the
 407 precedence of the production is that for the virtual symbol.  If none
 408 is given, the precedence is inherited from the last symbol in the
 409 production which has a precedence specified.
 410
 411 After the optional precedence may come the `${` mark.  This indicates
 412 the start of a code fragment.  If present, this must be on the same
 413 line as the start of the production.
 414
 415 All of the text from the `${` through to the matching `}$` is
 416 collected and forms the code-fragment for the production.  It must all
 417 be in one `code_node` of the literate code.  The `}$` must be
 418 at the end of a line.
 419
 420 Text in the code fragment will undergo substitutions where `$N` or
 421 `$<N`,for some numeric `N` (or non-numeric indicator as described
 422 later), will be replaced with a variable holding the parse information
 423 for the particular symbol in the production.  `$0` is the head of the
 424 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 425 for a terminal symbol is `struct token`.  For a non-terminal, it is
 426 whatever has been declared for that symbol.  The `<` may be included and
 427 means that the value (usually a reference) is being moved out, so it
 428 will not automatically be freed.  The effect of using '<' is that the
 429 variable is cleared to all-zeros.
 430
 431 While building productions we will need to add to an array which needs to
 432 grow dynamically.
 433
 434 ###### functions
 435         static void array_add(void *varray, int *cnt, void *new)
 436         {
 437                 void ***array = varray;
 438                 int current = 0;
 439                 const int step = 8;
 440                 current = ((*cnt-1) | (step-1))+1;
 441                 if (*cnt == current) {
 442                         /* must grow */
 443                         current += step;
 444                         *array = realloc(*array, current * sizeof(void*));
 445                 }
 446                 (*array)[*cnt] = new;
 447                 (*cnt) += 1;
 448         }
 449
 450 Collecting the code fragment simply involves reading tokens until we
 451 hit the end token `}$`, and noting the character position of the start and
 452 the end.
 453
 454 ###### functions
 455         static struct text collect_code(struct token_state *state,
 456                                         struct token start)
 457         {
 458                 struct text code;
 459                 struct token t;
 460                 code.txt = start.txt.txt + start.txt.len;
 461                 do
 462                         t = token_next(state);
 463                 while (t.node == start.node &&
 464                        t.num != TK_close && t.num != TK_error &&
 465                        t.num != TK_eof);
 466                 if (t.num == TK_close && t.node == start.node)
 467                         code.len = t.txt.txt - code.txt;
 468                 else
 469                         code.txt = NULL;
 470                 return code;
 471         }
 472
 473 Now we have all the bits we need to parse a full production.
 474
 475 ###### includes
 476         #include <memory.h>
 477
 478 ###### grammar fields
 479         struct production **productions;
 480         int production_count;
 481
 482 ###### production fields
 483         struct symbol  *head;
 484         struct symbol **body;
 485         int             body_size;
 486         struct text     code;
 487         int             code_line;
 488
 489 ###### symbol fields
 490         int first_production;
 491
 492 ###### functions
 493         static char *parse_production(struct grammar *g,
 494                                       struct symbol *head,
 495                                       struct token_state *state)
 496         {
 497                 /* Head has already been parsed. */
 498                 struct token tk;
 499                 char *err;
 500                 struct production p, *pp;
 501
 502                 memset(&p, 0, sizeof(p));
 503                 p.head = head;
 504                 tk = token_next(state);
 505                 while (tk.num == TK_ident || tk.num == TK_mark) {
 506                         struct symbol *bs = sym_find(g, tk.txt);
 507                         if (bs->type == Unknown) {
 508                                 if (!g->terminals_declared)
 509                                         bs->type = Terminal;
 510                         }
 511                         if (bs->type == Virtual) {
 512                                 err = "Virtual symbol not permitted in production";
 513                                 goto abort;
 514                         }
 515                         if (bs->precedence) {
 516                                 p.precedence = bs->precedence;
 517                                 p.assoc = bs->assoc;
 518                         }
 519                         array_add(&p.body, &p.body_size, bs);
 520                         tk = token_next(state);
 521                 }
 522                 if (tk.num == TK_virtual) {
 523                         struct symbol *vs;
 524                         tk = token_next(state);
 525                         if (tk.num != TK_ident) {
 526                                 err = "word required after $$";
 527                                 goto abort;
 528                         }
 529                         vs = sym_find(g, tk.txt);
 530                         if (vs->num == TK_newline)
 531                                 p.line_like = 1;
 532                         else if (vs->num == TK_out)
 533                                 p.line_like = 2;
 534                         else if (vs->precedence == 0) {
 535                                 err = "symbol after $$ must have precedence";
 536                                 goto abort;
 537                         } else {
 538                                 p.precedence = vs->precedence;
 539                                 p.assoc = vs->assoc;
 540                         }
 541                         tk = token_next(state);
 542                 }
 543                 if (tk.num == TK_open) {
 544                         p.code_line = tk.line;
 545                         p.code = collect_code(state, tk);
 546                         if (p.code.txt == NULL) {
 547                                 err = "code fragment not closed properly";
 548                                 goto abort;
 549                         }
 550                         tk = token_next(state);
 551                 }
 552
 553                 if (tk.num != TK_newline && tk.num != TK_eof) {
 554                         err = "stray tokens at end of line";
 555                         goto abort;
 556                 }
 557                 pp = malloc(sizeof(*pp));
 558                 *pp = p;
 559                 array_add(&g->productions, &g->production_count, pp);
 560                 return NULL;
 561         abort:
 562                 while (tk.num != TK_newline && tk.num != TK_eof)
 563                         tk = token_next(state);
 564                 free(p.body);
 565                 return err;
 566         }
 567
 568 With the ability to parse productions and dollar-lines, we have nearly all
 569 that we need to parse a grammar from a `code_node`.
 570
 571 The head of the first production will effectively be the `start` symbol
 572 of the grammar.  However it won't _actually_ be so.  Processing the
 573 grammar is greatly simplified if the real start symbol only has a single
 574 production, and expects `$eof` as the final terminal.  So when we find
 575 the first explicit production we insert an extra implicit production as
 576 production zero which looks like
 577
 578 ###### Example: production 0
 579         $start -> START $eof
 580
 581 where `START` is the first non-terminal given.
 582
 583 ###### create production zero
 584         struct production *p = calloc(1,sizeof(*p));
 585         struct text start = {"$start",6};
 586         struct text eof = {"$eof",4};
 587         struct text code = {"$0 = $<1;", 9};
 588         p->head = sym_find(g, start);
 589         p->head->type = Nonterminal;
 590         p->head->struct_name = g->current_type;
 591         p->head->isref = g->type_isref;
 592         if (g->current_type.txt)
 593                 p->code = code;
 594         array_add(&p->body, &p->body_size, head);
 595         array_add(&p->body, &p->body_size, sym_find(g, eof));
 596         p->head->first_production = g->production_count;
 597         array_add(&g->productions, &g->production_count, p);
 598
 599 Now we are ready to read in the grammar.  We ignore comments
 600 and strings so that the marks which introduce them can be
 601 used as terminals in the grammar.  We don't ignore numbers
 602 even though we don't allow them as that causes the scanner
 603 to produce errors that the parser is better positioned to handle.
 604 We also ignore indentation, but we expect and use newlines.
 605
 606 To allow comments in the grammar, we explicitly check for "//" as the
 607 first token on the line and those lines are skipped.  "//" can still be
 608 used as a terminal anywhere that a terminal is expected.
 609
 610 ###### grammar_read
 611         static struct grammar *grammar_read(struct code_node *code)
 612         {
 613                 struct token_config conf = {
 614                         .word_start = "",
 615                         .word_cont = "",
 616                         .words_marks = known,
 617                         .known_count = sizeof(known)/sizeof(known[0]),
 618                         .number_chars = "",
 619                         .ignored = (1 << TK_line_comment)
 620                                  | (1 << TK_block_comment)
 621                                  | (0 << TK_number)
 622                                  | (1 << TK_string)
 623                                  | (1 << TK_multi_string)
 624                                  | (1 << TK_in)
 625                                  | (1 << TK_out),
 626                 };
 627
 628                 struct token_state *state = token_open(code, &conf);
 629                 struct token tk;
 630                 struct symbol *head = NULL;
 631                 struct grammar *g;
 632                 char *err = NULL;
 633
 634                 g = calloc(1, sizeof(*g));
 635                 symbols_init(g);
 636
 637                 for (tk = token_next(state); tk.num != TK_eof;
 638                      tk = token_next(state)) {
 639                         if (tk.num == TK_newline)
 640                                 continue;
 641                         if (tk.num == TK_ident) {
 642                                 // new non-terminal
 643                                 head = sym_find(g, tk.txt);
 644                                 if (head->type == Nonterminal)
 645                                         err = "This non-terminal has already be used.";
 646                                 else if (head->type == Virtual)
 647                                         err = "Virtual symbol not permitted in head of production";
 648                                 else {
 649                                         head->type = Nonterminal;
 650                                         head->struct_name = g->current_type;
 651                                         head->isref = g->type_isref;
 652                                         if (g->production_count == 0) {
 653                                                 ## create production zero
 654                                         }
 655                                         head->first_production = g->production_count;
 656                                         tk = token_next(state);
 657                                         if (tk.num == TK_mark &&
 658                                             text_is(tk.txt, "->"))
 659                                                 err = parse_production(g, head, state);
 660                                         else
 661                                                 err = "'->' missing in production";
 662                                 }
 663                         } else if (tk.num == TK_mark
 664                                    && text_is(tk.txt, "|")) {
 665                                 // another production for same non-term
 666                                 if (head)
 667                                         err = parse_production(g, head, state);
 668                                 else
 669                                         err = "First production must have a head";
 670                         } else if (tk.num == TK_mark
 671                                    && text_is(tk.txt, "$")) {
 672                                 err = dollar_line(state, g, 0);
 673                         } else if (tk.num == TK_mark
 674                                    && text_is(tk.txt, "$*")) {
 675                                 err = dollar_line(state, g, 1);
 676                         } else if (tk.num == TK_mark
 677                                    && text_is(tk.txt, "//")) {
 678                                 while (tk.num != TK_newline &&
 679                                        tk.num != TK_eof)
 680                                         tk = token_next(state);
 681                         } else {
 682                                 err = "Unrecognised token at start of line.";
 683                         }
 684                         if (err)
 685                                 goto abort;
 686                 }
 687                 token_close(state);
 688                 if (g->terminals_declared) {
 689                         struct symbol *s;
 690                         int errs = 0;
 691                         for (s = g->syms; s; s = s->next) {
 692                                 if (s->type != Unknown)
 693                                         continue;
 694                                 errs += 1;
 695                                 fprintf(stderr, "Token %.*s not declared\n",
 696                                         s->name.len, s->name.txt);
 697                         }
 698                         if (errs) {
 699                                 free(g); // FIXME free content
 700                                 g = NULL;
 701                         }
 702                 }
 703                 return g;
 704         abort:
 705                 fprintf(stderr, "Error at line %d: %s\n",
 706                         tk.line, err);
 707                 token_close(state);
 708                 free(g); // FIXME free content
 709                 return NULL;
 710         }
 711
 712 ## Analysing the grammar
 713
 714 The central task in analysing the grammar is to determine a set of
 715 states to drive the parsing process.  This will require finding various
 716 sets of symbols and of "items".  Some of these sets will need to attach
 717 information to each element in the set, so they are more like maps.
 718
 719 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 720 Many of these will be the same as each other.  To avoid memory wastage,
 721 and to simplify some comparisons of sets, these sets will be stored in a
 722 list of unique sets, each assigned a number.
 723
 724 Once we have the data structures in hand to manage these sets and lists,
 725 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
 726 then create the item sets which define the various states.
 727
 728 ### Symbol sets.
 729
 730 Though we don't only store symbols in these sets, they are the main
 731 things we store, so they are called symbol sets or "symsets".
 732
 733 A symset has a size, an array of shorts, and an optional array of data
 734 values, which are also shorts.  If the array of data is not present, we
 735 store an impossible pointer, as `NULL` is used to indicate that no
 736 memory has been allocated yet;
 737
 738 ###### declarations
 739         struct symset {
 740                 short cnt;
 741                 unsigned short *syms, *data;
 742         };
 743         #define NO_DATA ((unsigned short *)1)
 744         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 745         const struct symset INIT_DATASET = { 0, NULL, NULL };
 746
 747 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 748 using an insertion sort.  We don't explicitly record the amount of
 749 allocated space as it can be derived directly from the current `cnt`
 750 using `((cnt - 1) | 7) + 1`.
 751
 752 ###### functions
 753         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 754         {
 755                 int i;
 756                 int current = ((s->cnt-1) | 7) + 1;
 757                 if (current == s->cnt) {
 758                         current += 8;
 759                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 760                         if (s->data != NO_DATA)
 761                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 762                 }
 763                 i = s->cnt;
 764                 while (i > 0 && s->syms[i-1] > key) {
 765                         s->syms[i] = s->syms[i-1];
 766                         if (s->data != NO_DATA)
 767                                 s->data[i] = s->data[i-1];
 768                         i--;
 769                 }
 770                 s->syms[i] = key;
 771                 if (s->data != NO_DATA)
 772                         s->data[i] = val;
 773                 s->cnt += 1;
 774         }
 775
 776 Finding a symbol (or item) in a `symset` uses a simple binary search.
 777 We return the index where the value was found (so data can be accessed),
 778 or `-1` to indicate failure.
 779
 780         static int symset_find(struct symset *ss, unsigned short key)
 781         {
 782                 int lo = 0;
 783                 int hi = ss->cnt;
 784
 785                 if (hi == 0)
 786                         return -1;
 787                 while (lo + 1 < hi) {
 788                         int mid = (lo + hi) / 2;
 789                         if (ss->syms[mid] <= key)
 790                                 lo = mid;
 791                         else
 792                                 hi = mid;
 793                 }
 794                 if (ss->syms[lo] == key)
 795                         return lo;
 796                 return -1;
 797         }
 798
 799 We will often want to form the union of two symsets.  When we do, we
 800 will often want to know if anything changed (as that might mean there is
 801 more work to do).  So `symset_union` reports whether anything was added
 802 to the first set.  We use a slow quadratic approach as these sets are
 803 rarely large.  If profiling shows this to be a problem it can be
 804 optimised later.
 805
 806         static int symset_union(struct symset *a, struct symset *b)
 807         {
 808                 int i;
 809                 int added = 0;
 810                 for (i = 0; i < b->cnt; i++)
 811                         if (symset_find(a, b->syms[i]) < 0) {
 812                                 unsigned short data = 0;
 813                                 if (b->data != NO_DATA)
 814                                         data = b->data[i];
 815                                 symset_add(a, b->syms[i], data);
 816                                 added++;
 817                         }
 818                 return added;
 819         }
 820
 821 And of course we must be able to free a symset.
 822
 823         static void symset_free(struct symset ss)
 824         {
 825                 free(ss.syms);
 826                 if (ss.data != NO_DATA)
 827                         free(ss.data);
 828         }
 829
 830 ### Symset Storage
 831
 832 Some symsets are simply stored somewhere appropriate in the `grammar`
 833 data structure, others needs a bit of indirection.  This applies
 834 particularly to "LA" sets.  These will be paired with "items" in an
 835 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 836 to be stored in the `data` field, so we need a mapping from a small
 837 (short) number to an LA `symset`.
 838
 839 As mentioned earlier this involves creating a list of unique symsets.
 840
 841 For now, we just use a linear list sorted by insertion.  A skiplist
 842 would be more efficient and may be added later.
 843
 844 ###### declarations
 845
 846         struct setlist {
 847                 struct symset ss;
 848                 int num;
 849                 struct setlist *next;
 850         };
 851
 852 ###### grammar fields
 853         struct setlist *sets;
 854         int nextset;
 855
 856 ###### functions
 857
 858         static int ss_cmp(struct symset a, struct symset b)
 859         {
 860                 int i;
 861                 int diff = a.cnt - b.cnt;
 862                 if (diff)
 863                         return diff;
 864                 for (i = 0; i < a.cnt; i++) {
 865                         diff = (int)a.syms[i] - (int)b.syms[i];
 866                         if (diff)
 867                                 return diff;
 868                 }
 869                 return 0;
 870         }
 871
 872         static int save_set(struct grammar *g, struct symset ss)
 873         {
 874                 struct setlist **sl = &g->sets;
 875                 int cmp = 1;
 876                 struct setlist *s;
 877
 878                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 879                         sl = & (*sl)->next;
 880                 if (cmp == 0) {
 881                         symset_free(ss);
 882                         return (*sl)->num;
 883                 }
 884
 885                 s = malloc(sizeof(*s));
 886                 s->ss = ss;
 887                 s->num = g->nextset;
 888                 g->nextset += 1;
 889                 s->next = *sl;
 890                 *sl = s;
 891                 return s->num;
 892         }
 893
 894 Finding a set by number is currently performed by a simple linear
 895 search.  If this turns out to hurt performance, we can store the sets in
 896 a dynamic array like the productions.
 897
 898         static struct symset set_find(struct grammar *g, int num)
 899         {
 900                 struct setlist *sl = g->sets;
 901                 while (sl && sl->num != num)
 902                         sl = sl->next;
 903                 return sl->ss;
 904         }
 905
 906 ### Setting `nullable`
 907
 908 We set `nullable` on the head symbol for any production for which all
 909 the body symbols (if any) are nullable.  As this is a recursive
 910 definition, any change in the `nullable` setting means that we need to
 911 re-evaluate where it needs to be set.
 912
 913 We simply loop around performing the same calculations until no more
 914 changes happen.
 915
 916 ###### symbol fields
 917         int nullable;
 918
 919 ###### functions
 920         static void set_nullable(struct grammar *g)
 921         {
 922                 int check_again = 1;
 923                 while (check_again) {
 924                         int p;
 925                         check_again = 0;
 926                         for (p = 0; p < g->production_count; p++) {
 927                                 struct production *pr = g->productions[p];
 928                                 int s;
 929
 930                                 if (pr->head->nullable)
 931                                         continue;
 932                                 for (s = 0; s < pr->body_size; s++)
 933                                         if (! pr->body[s]->nullable)
 934                                                 break;
 935                                 if (s == pr->body_size) {
 936                                         pr->head->nullable = 1;
 937                                         check_again = 1;
 938                                 }
 939                         }
 940                 }
 941         }
 942
 943 ### Setting `line_like`
 944
 945 In order to be able to ignore newline tokens when not relevant, but
 946 still include them in the parse when needed, we will need to know
 947 which states can start a "line-like" section of code.  We ignore
 948 newlines when there is an indent since the most recent start of a
 949 line-like symbol.
 950
 951 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
 952 If a symbol cannot derive a NEWLINE, then it is only part of a line -
 953 so is word-like.  If it can derive a NEWLINE, then we consider it to
 954 be like a line.
 955
 956 Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
 957 is the head of a production that contains a line_like symbol is also a
 958 line-like symbol.  We use a new field `line_like` to record this
 959 attribute of symbols, and compute it in a repetitive manner similar to
 960 `set_nullable`.
 961
 962 ###### symbol fields
 963         int line_like;
 964
 965 ###### functions
 966         static void set_line_like(struct grammar *g)
 967         {
 968                 int check_again = 1;
 969                 g->symtab[TK_newline]->line_like = 1;
 970                 while (check_again) {
 971                         int p;
 972                         check_again = 0;
 973                         for (p = 0; p < g->production_count; p++) {
 974                                 struct production *pr = g->productions[p];
 975                                 int s;
 976
 977                                 if (pr->head->line_like)
 978                                         continue;
 979
 980                                 for (s = 0 ; s < pr->body_size; s++) {
 981                                         if (pr->body[s]->line_like) {
 982                                                 pr->head->line_like = 1;
 983                                                 check_again = 1;
 984                                                 break;
 985                                         }
 986                                 }
 987                         }
 988                 }
 989         }
 990
 991 ### Building the `first` sets
 992
 993 When calculating what can follow a particular non-terminal, we will need
 994 to know what the "first" terminal in any subsequent non-terminal might
 995 be.  So we calculate the `first` set for every non-terminal and store
 996 them in an array.  We don't bother recording the "first" set for
 997 terminals as they are trivial.
 998
 999 As the "first" for one symbol might depend on the "first" of another, we
1000 repeat the calculations until no changes happen, just like with
1001 `set_nullable`.  This makes use of the fact that `symset_union` reports
1002 if any change happens.
1003
1004 The core of this, which finds the "first" of part of a production body,
1005 will be reused for computing the "follow" sets, so we split that out
1006 into a separate function, `add_first`, which also reports if it got all
1007 the way to the end of the production without finding a non-nullable
1008 symbol.
1009
1010 ###### grammar fields
1011         struct symset *first;
1012
1013 ###### functions
1014
1015         static int add_first(struct production *pr, int start,
1016                              struct symset *target, struct grammar *g,
1017                              int *to_end)
1018         {
1019                 int s;
1020                 int changed = 0;
1021                 for (s = start; s < pr->body_size; s++) {
1022                         struct symbol *bs = pr->body[s];
1023                         if (bs->type == Terminal) {
1024                                 if (symset_find(target, bs->num) < 0) {
1025                                         symset_add(target, bs->num, 0);
1026                                         changed = 1;
1027                                 }
1028                                 break;
1029                         } else if (symset_union(target, &g->first[bs->num]))
1030                                 changed = 1;
1031                         if (!bs->nullable)
1032                                 break;
1033                 }
1034                 if (to_end)
1035                         *to_end = (s == pr->body_size);
1036                 return changed;
1037         }
1038
1039         static void build_first(struct grammar *g)
1040         {
1041                 int check_again = 1;
1042                 int s;
1043                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1044                 for (s = 0; s < g->num_syms; s++)
1045                         g->first[s] = INIT_SYMSET;
1046
1047                 while (check_again) {
1048                         int p;
1049                         check_again = 0;
1050                         for (p = 0; p < g->production_count; p++) {
1051                                 struct production *pr = g->productions[p];
1052                                 struct symset *head = &g->first[pr->head->num];
1053
1054                                 if (add_first(pr, 0, head, g, NULL))
1055                                         check_again = 1;
1056                         }
1057                 }
1058         }
1059
1060 ### Building the `follow` sets.
1061
1062 There are two different situations when we will want to generate
1063 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1064 single "follow" set for each non-terminal in the grammar.  That is what
1065 is happening here.  If we are doing an LALR or LR analysis we will want
1066 to generate a separate "LA" set for each item.  We do that later in
1067 state generation.
1068
1069 There are two parts to generating a "follow" set.  Firstly we look at
1070 every place that any non-terminal appears in the body of any production,
1071 and we find the set of possible "first" symbols after there.  This is
1072 done using `add_first` above and only needs to be done once as the
1073 "first" sets are now stable and will not change.
1074
1075 ###### grammar fields
1076         struct symset *follow;
1077
1078 ###### follow code
1079
1080         for (p = 0; p < g->production_count; p++) {
1081                 struct production *pr = g->productions[p];
1082                 int b;
1083
1084                 for (b = 0; b < pr->body_size - 1; b++) {
1085                         struct symbol *bs = pr->body[b];
1086                         if (bs->type == Terminal)
1087                                 continue;
1088                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1089                 }
1090         }
1091
1092 The second part is to add the "follow" set of the head of a production
1093 to the "follow" sets of the final symbol in the production, and any
1094 other symbol which is followed only by `nullable` symbols.  As this
1095 depends on "follow" itself we need to repeatedly perform the process
1096 until no further changes happen.
1097
1098 Rather than 2 nested loops, one that repeats the whole process until
1099 there is no change, and one that iterates of the set of productions, we
1100 combine these two functions into a single loop.
1101
1102 ###### follow code
1103
1104         for (check_again = 0, p = 0;
1105              p < g->production_count;
1106              p < g->production_count-1
1107                 ? p++ : check_again ? (check_again = 0, p = 0)
1108                               : p++) {
1109                 struct production *pr = g->productions[p];
1110                 int b;
1111
1112                 for (b = pr->body_size - 1; b >= 0; b--) {
1113                         struct symbol *bs = pr->body[b];
1114                         if (bs->type == Terminal)
1115                                 break;
1116                         if (symset_union(&g->follow[bs->num],
1117                                          &g->follow[pr->head->num]))
1118                                 check_again = 1;
1119                         if (!bs->nullable)
1120                                 break;
1121                 }
1122         }
1123
1124 We now just need to create and initialise the `follow` list to get a
1125 complete function.
1126
1127 ###### functions
1128         static void build_follow(struct grammar *g)
1129         {
1130                 int s, check_again, p;
1131                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1132                 for (s = 0; s < g->num_syms; s++)
1133                         g->follow[s] = INIT_SYMSET;
1134                 ## follow code
1135         }
1136
1137 ### Building itemsets and states
1138
1139 There are three different levels of detail that can be chosen for
1140 building the itemsets and states for the LR grammar.  They are:
1141
1142 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1143    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1144    already computed,
1145 2. LALR(1) where we build look-ahead sets with each item and merge
1146    the LA sets when we find two paths to the same "kernel" set of items.
1147 3. LR(1) where different look-ahead for any item in the set means
1148    a different item set must be created.
1149
1150 ###### forward declarations
1151         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1152
1153 We need to be able to look through existing states to see if a newly
1154 generated state already exists.  For now we use a simple sorted linked
1155 list.
1156
1157 An item is a pair of numbers: the production number and the position of
1158 "DOT", which is an index into the body of the production.  As the
1159 numbers are not enormous we can combine them into a single "short" and
1160 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1161 production number (so 2000 productions with maximum of 31 symbols in the
1162 body).
1163
1164 Comparing the itemsets will be a little different to comparing symsets
1165 as we want to do the lookup after generating the "kernel" of an
1166 itemset, so we need to ignore the offset=zero items which are added during
1167 completion.
1168
1169 To facilitate this, we modify the "DOT" number so that "0" sorts to
1170 the end of the list in the symset, and then only compare items before
1171 the first "0".
1172
1173 ###### declarations
1174         static inline unsigned short item_num(int production, int dot)
1175         {
1176                 return production | ((31-dot) << 11);
1177         }
1178         static inline int item_prod(unsigned short item)
1179         {
1180                 return item & 0x7ff;
1181         }
1182         static inline int item_dot(unsigned short item)
1183         {
1184                 return (31-(item >> 11)) & 0x1f;
1185         }
1186
1187 For LR(1) analysis we need to compare not just the itemset in a state
1188 but also the LA sets.  As we assign each unique LA set a number, we
1189 can just compare the symset and the data values together.
1190
1191 ###### functions
1192         static int itemset_cmp(struct symset a, struct symset b,
1193                                enum grammar_type type)
1194         {
1195                 int i;
1196                 int av, bv;
1197
1198                 for (i = 0;
1199                      i < a.cnt && i < b.cnt &&
1200                      item_dot(a.syms[i]) > 0 &&
1201                      item_dot(b.syms[i]) > 0;
1202                      i++) {
1203                         int diff = a.syms[i] - b.syms[i];
1204                         if (diff)
1205                                 return diff;
1206                         if (type == LR1) {
1207                                 diff = a.data[i] - b.data[i];
1208                                 if (diff)
1209                                         return diff;
1210                         }
1211                 }
1212                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1213                         av = -1;
1214                 else
1215                         av = a.syms[i];
1216                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1217                         bv = -1;
1218                 else
1219                         bv = b.syms[i];
1220                 if (av - bv)
1221                         return av - bv;
1222                 if (type < LR1 || av == -1)
1223                         return 0;
1224                 return
1225                         a.data[i] - b.data[i];
1226         }
1227
1228 It will be helpful to know if an itemset has been "completed" or not,
1229 particularly for LALR where itemsets get merged, at which point they
1230 need to be consider for completion again.  So a `completed` flag is
1231 needed.
1232
1233 For correct handling of `TK_newline` when parsing, we will need to
1234 know which states (itemsets) can occur at the start of a line, so we
1235 will record a `starts_line` flag too whenever DOT is at the start of a
1236 `line_like` symbol.
1237
1238 Finally, for handling `TK_out` we need to know whether productions in the
1239 current state started *before* the most recent indent.  A state
1240 doesn't usually keep details of individual productions, so we need to
1241 add one extra detail. `min_prefix` is the smallest non-zero number of
1242 symbols *before* DOT in any production in an itemset.  This will allow
1243 us to determine if the the most recent indent is sufficiently recent
1244 to cancel it against a `TK_out`.  If it was seen longer ago than the
1245 `min_prefix`, and if the current state cannot be reduced, then the
1246 indented section must have ended in the middle of a syntactic unit, so
1247 an error must be signaled.
1248
1249 And now we can build the list of itemsets.  The lookup routine returns
1250 both a success flag and a pointer to where in the list an insert should
1251 happen, so we don't need to search a second time.
1252
1253 ###### declarations
1254         struct itemset {
1255                 struct itemset *next;
1256                 short state;
1257                 struct symset items;
1258                 struct symset go_to;
1259                 enum assoc assoc;
1260                 unsigned short precedence;
1261                 char completed;
1262                 char starts_line;
1263                 int min_prefix;
1264         };
1265
1266 ###### grammar fields
1267         struct itemset *items;
1268         int states;
1269
1270 ###### functions
1271         static int itemset_find(struct grammar *g, struct itemset ***where,
1272                                 struct symset kernel, enum grammar_type type)
1273         {
1274                 struct itemset **ip;
1275
1276                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1277                         struct itemset *i = *ip;
1278                         int diff;
1279                         diff = itemset_cmp(i->items, kernel, type);
1280                         if (diff < 0)
1281                                 continue;
1282                         if (diff > 0)
1283                                 break;
1284                         /* found */
1285                         *where = ip;
1286                         return 1;
1287                 }
1288                 *where = ip;
1289                 return 0;
1290         }
1291
1292 Adding an itemset may require merging the LA sets if LALR analysis is
1293 happening.  If any new LA set adds any symbols that weren't in the old
1294 LA set, we clear the `completed` flag so that the dependants of this
1295 itemset will be recalculated and their LA sets updated.
1296
1297 `add_itemset` must consume the symsets it is passed, either by adding
1298 them to a data structure, of freeing them.
1299
1300         static int add_itemset(struct grammar *g, struct symset ss,
1301                                enum assoc assoc, unsigned short precedence,
1302                                enum grammar_type type)
1303         {
1304                 struct itemset **where, *is;
1305                 int i;
1306                 int found = itemset_find(g, &where, ss, type);
1307                 if (!found) {
1308                         is = calloc(1, sizeof(*is));
1309                         is->state = g->states;
1310                         g->states += 1;
1311                         is->items = ss;
1312                         is->assoc = assoc;
1313                         is->precedence = precedence;
1314                         is->next = *where;
1315                         is->go_to = INIT_DATASET;
1316                         *where = is;
1317                         return is->state;
1318                 }
1319                 is = *where;
1320                 if (type != LALR) {
1321                         symset_free(ss);
1322                         return is->state;
1323                 }
1324                 for (i = 0; i < ss.cnt; i++) {
1325                         struct symset temp = INIT_SYMSET, s;
1326                         if (ss.data[i] == is->items.data[i])
1327                                 continue;
1328                         s = set_find(g, is->items.data[i]);
1329                         symset_union(&temp, &s);
1330                         s = set_find(g, ss.data[i]);
1331                         if (symset_union(&temp, &s)) {
1332                                 is->items.data[i] = save_set(g, temp);
1333                                 is->completed = 0;
1334                         } else
1335                                 symset_free(temp);
1336                 }
1337                 symset_free(ss);
1338                 return is->state;
1339         }
1340
1341 #### The build
1342
1343 To build all the itemsets, we first insert the initial itemset made from
1344 production zero, complete each itemset, and then generate new itemsets
1345 from old until no new ones can be made.
1346
1347 Completing an itemset means finding all the items where "DOT" is
1348 followed by a nonterminal and adding "DOT=0" items for every production
1349 from that non-terminal - providing each item hasn't already been added.
1350 When we add such an item it might get inserted before the current
1351 one, so to make sure we process it we reset the loop counter to the
1352 start.
1353
1354 If LA sets are needed, the LA set for each new item is found using
1355 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1356 may be supplemented by the LA set for the item which produced the new
1357 item.
1358
1359 We also collect a set of all symbols which follow "DOT" (in `done`) as
1360 this is used in the next stage.  If any of these symbols are flagged as
1361 `line_like`, then this state must be a `starts_line` state so now is a
1362 good time to record that.
1363
1364 When itemsets are created we assign a precedence to the itemset from the
1365 complete item, if there is one.  We ignore the possibility of there
1366 being two and don't (currently) handle precedence in such grammars.
1367 When completing a grammar we ignore any item where DOT is followed by a
1368 terminal with a precedence lower than that for the itemset.  Unless the
1369 terminal has right associativity, we also ignore items where the
1370 terminal has the same precedence.  The result is that unwanted items are
1371 still in the itemset, but the terminal doesn't get into the go to set,
1372 so the item is ineffective.
1373
1374 ###### complete itemset
1375         for (i = 0; i < is->items.cnt; i++) {
1376                 int p = item_prod(is->items.syms[i]);
1377                 int bs = item_dot(is->items.syms[i]);
1378                 struct production *pr = g->productions[p];
1379                 int p2;
1380                 struct symbol *s;
1381                 struct symset LA = INIT_SYMSET;
1382                 unsigned short sn = 0;
1383                 struct symset LAnl = INIT_SYMSET;
1384                 unsigned short snnl = 0;
1385
1386                 if (is->min_prefix == 0 ||
1387                     (bs > 0 && bs < is->min_prefix))
1388                         is->min_prefix = bs;
1389                 if (bs == pr->body_size)
1390                         continue;
1391                 s = pr->body[bs];
1392                 if (s->precedence && is->precedence &&
1393                     is->precedence > s->precedence)
1394                         /* This terminal has a low precedence and
1395                          * shouldn't be shifted
1396                          */
1397                         continue;
1398                 if (s->precedence && is->precedence &&
1399                     is->precedence == s->precedence && s->assoc != Right)
1400                         /* This terminal has a matching precedence and is
1401                          * not Right-associative, so we mustn't shift it.
1402                          */
1403                         continue;
1404                 if (symset_find(&done, s->num) < 0) {
1405                         symset_add(&done, s->num, 0);
1406                 }
1407                 if (s->type != Nonterminal)
1408                         continue;
1409                 if (s->line_like)
1410                         is->starts_line = 1;
1411                 check_again = 1;
1412                 if (type >= LALR) {
1413                         // Need the LA set.
1414                         int to_end;
1415                         add_first(pr, bs+1, &LA, g, &to_end);
1416                         if (to_end) {
1417                                 struct symset ss = set_find(g, is->items.data[i]);
1418                                 symset_union(&LA, &ss);
1419                         }
1420                         sn = save_set(g, LA);
1421                         LA = set_find(g, sn);
1422                         if (symset_find(&LA, TK_newline))
1423                                 symset_add(&LAnl, TK_newline, 0);
1424                         snnl = save_set(g, LAnl);
1425                         LAnl = set_find(g, snnl);
1426                 }
1427
1428                 /* Add productions for this symbol */
1429                 for (p2 = s->first_production;
1430                      p2 < g->production_count &&
1431                       g->productions[p2]->head == s;
1432                      p2++) {
1433                         int itm = item_num(p2, 0);
1434                         int pos = symset_find(&is->items, itm);
1435                         if (pos < 0) {
1436                                 if (g->productions[p2]->line_like)
1437                                         symset_add(&is->items, itm, snnl);
1438                                 else
1439                                         symset_add(&is->items, itm, sn);
1440                                 /* Will have re-ordered, so start
1441                                  * from beginning again */
1442                                 i = -1;
1443                         } else if (type >= LALR) {
1444                                 struct symset ss = set_find(g, is->items.data[pos]);
1445                                 struct symset tmp = INIT_SYMSET;
1446                                 struct symset *la = &LA;
1447
1448                                 if (g->productions[p2]->line_like)
1449                                         la = &LAnl;
1450                                 symset_union(&tmp, &ss);
1451                                 if (symset_union(&tmp, la)) {
1452                                         is->items.data[pos] = save_set(g, tmp);
1453                                         i = -1;
1454                                 } else
1455                                         symset_free(tmp);
1456                         }
1457                 }
1458         }
1459
1460 For each symbol we found (and placed in `done`) we collect all the items
1461 for which this symbol is next, and create a new itemset with "DOT"
1462 advanced over the symbol.  This is then added to the collection of
1463 itemsets (or merged with a pre-existing itemset).
1464
1465 ###### derive itemsets
1466         // Now we have a completed itemset, so we need to
1467         // compute all the 'next' itemsets and create them
1468         // if they don't exist.
1469         for (i = 0; i < done.cnt; i++) {
1470                 int j;
1471                 unsigned short state;
1472                 struct symbol *sym = g->symtab[done.syms[i]];
1473                 enum assoc assoc = Non;
1474                 unsigned short precedence = 0;
1475                 struct symset newitemset = INIT_SYMSET;
1476                 if (type >= LALR)
1477                         newitemset = INIT_DATASET;
1478
1479                 for (j = 0; j < is->items.cnt; j++) {
1480                         int itm = is->items.syms[j];
1481                         int p = item_prod(itm);
1482                         int bp = item_dot(itm);
1483                         struct production *pr = g->productions[p];
1484                         unsigned short la = 0;
1485                         int pos;
1486
1487                         if (bp == pr->body_size)
1488                                 continue;
1489                         if (pr->body[bp] != sym)
1490                                 continue;
1491
1492                         bp += 1;
1493                         if (type >= LALR)
1494                                 la = is->items.data[j];
1495                         if (bp == pr->body_size &&
1496                             pr->precedence > 0 &&
1497                             pr->precedence > precedence) {
1498                                 // new itemset is reducible and has a precedence.
1499                                 precedence = pr->precedence;
1500                                 assoc = pr->assoc;
1501                         }
1502                         pos = symset_find(&newitemset, item_num(p, bp));
1503
1504                         if (pos < 0)
1505                                 symset_add(&newitemset, item_num(p, bp), la);
1506                         else if (type >= LALR) {
1507                                 // Need to merge la set.
1508                                 int la2 = newitemset.data[pos];
1509                                 if (la != la2) {
1510                                         struct symset ss = set_find(g, la2);
1511                                         struct symset LA = INIT_SYMSET;
1512                                         symset_union(&LA, &ss);
1513                                         ss = set_find(g, la);
1514                                         if (symset_union(&LA, &ss))
1515                                                 newitemset.data[pos] = save_set(g, LA);
1516                                         else
1517                                                 symset_free(LA);
1518                                 }
1519                         }
1520                 }
1521                 state = add_itemset(g, newitemset, assoc, precedence, type);
1522                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1523                         symset_add(&is->go_to, done.syms[i], state);
1524         }
1525
1526 All that is left is to create the initial itemset from production zero, and
1527 with `TK_eof` as the LA set.
1528
1529 ###### functions
1530         static void build_itemsets(struct grammar *g, enum grammar_type type)
1531         {
1532                 struct symset first = INIT_SYMSET;
1533                 struct itemset *is;
1534                 int check_again;
1535                 unsigned short la = 0;
1536                 if (type >= LALR) {
1537                         // LA set just has eof
1538                         struct symset eof = INIT_SYMSET;
1539                         symset_add(&eof, TK_eof, 0);
1540                         la = save_set(g, eof);
1541                         first = INIT_DATASET;
1542                 }
1543                 // production 0, offset 0 (with no data)
1544                 symset_add(&first, item_num(0, 0), la);
1545                 add_itemset(g, first, Non, 0, type);
1546                 for (check_again = 0, is = g->items;
1547                      is;
1548                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1549                         int i;
1550                         struct symset done = INIT_SYMSET;
1551                         if (is->completed)
1552                                 continue;
1553                         is->completed = 1;
1554                         check_again = 1;
1555                         ## complete itemset
1556                         ## derive itemsets
1557                         symset_free(done);
1558                 }
1559         }
1560
1561 ### Completing the analysis.
1562
1563 The exact process of analysis depends on which level was requested.  For
1564 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1565 `LR1` we need first, but not follow.  For `SLR` we need both.
1566
1567 We don't build the "action" tables that you might expect as the parser
1568 doesn't use them.  They will be calculated to some extent if needed for
1569 a report.
1570
1571 Once we have built everything we allocate arrays for the two lists:
1572 symbols and itemsets.  This allows more efficient access during
1573 reporting.  The symbols are grouped as terminals and then non-terminals,
1574 and we record the changeover point in `first_nonterm`.
1575
1576 ###### grammar fields
1577         struct symbol **symtab;
1578         struct itemset **statetab;
1579         int first_nonterm;
1580
1581 ###### grammar_analyse
1582
1583         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1584         {
1585                 struct symbol *s;
1586                 struct itemset *is;
1587                 int snum = TK_reserved;
1588                 for (s = g->syms; s; s = s->next)
1589                         if (s->num < 0 && s->type == Terminal) {
1590                                 s->num = snum;
1591                                 snum++;
1592                         }
1593                 g->first_nonterm = snum;
1594                 for (s = g->syms; s; s = s->next)
1595                         if (s->num < 0 && s->type != Virtual) {
1596                                 s->num = snum;
1597                                 snum++;
1598                         }
1599                 for (s = g->syms; s; s = s->next)
1600                         if (s->num < 0) {
1601                                 s->num = snum;
1602                                 snum++;
1603                         }
1604                 g->num_syms = snum;
1605                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1606                 for (s = g->syms; s; s = s->next)
1607                         g->symtab[s->num] = s;
1608
1609                 set_nullable(g);
1610                 set_line_like(g);
1611                 if (type >= SLR)
1612                         build_first(g);
1613
1614                 if (type == SLR)
1615                         build_follow(g);
1616
1617                 build_itemsets(g, type);
1618
1619                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1620                 for (is = g->items; is ; is = is->next)
1621                         g->statetab[is->state] = is;
1622         }
1623
1624 ## Reporting on the Grammar
1625
1626 The purpose of the report is to give the grammar developer insight into
1627 how the grammar parser will work.  It is basically a structured dump of
1628 all the tables that have been generated, plus a description of any conflicts.
1629
1630 ###### grammar_report
1631         static int grammar_report(struct grammar *g, enum grammar_type type)
1632         {
1633                 report_symbols(g);
1634                 if (g->follow)
1635                         report_follow(g);
1636                 report_itemsets(g);
1637                 return report_conflicts(g, type);
1638         }
1639
1640 Firstly we have the complete list of symbols, together with the
1641 "FIRST" set if that was generated.  We add a mark to each symbol to
1642 show if it is considered to be "line-like" (`<`), or if it is nullable (`.`).
1643
1644 ###### functions
1645
1646         static void report_symbols(struct grammar *g)
1647         {
1648                 int n;
1649                 if (g->first)
1650                         printf("SYMBOLS + FIRST:\n");
1651                 else
1652                         printf("SYMBOLS:\n");
1653
1654                 for (n = 0; n < g->num_syms; n++) {
1655                         struct symbol *s = g->symtab[n];
1656                         if (!s)
1657                                 continue;
1658
1659                         printf(" %c%c%3d%c: ",
1660                                s->nullable ? '.':' ',
1661                                s->line_like ? '<':' ',
1662                                s->num, symtypes[s->type]);
1663                         prtxt(s->name);
1664                         if (s->precedence)
1665                                 printf(" (%d%s)", s->precedence,
1666                                        assoc_names[s->assoc]);
1667
1668                         if (g->first && s->type == Nonterminal) {
1669                                 int j;
1670                                 char c = ':';
1671                                 for (j = 0; j < g->first[n].cnt; j++) {
1672                                         printf("%c ", c);
1673                                         c = ',';
1674                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1675                                 }
1676                         }
1677                         printf("\n");
1678                 }
1679         }
1680
1681 Then we have the follow sets if they were computed.
1682
1683         static void report_follow(struct grammar *g)
1684         {
1685                 int n;
1686                 printf("FOLLOW:\n");
1687                 for (n = 0; n < g->num_syms; n++)
1688                         if (g->follow[n].cnt) {
1689                                 int j;
1690                                 char c = ':';
1691                                 printf("  ");
1692                                 prtxt(g->symtab[n]->name);
1693                                 for (j = 0; j < g->follow[n].cnt; j++) {
1694                                         printf("%c ", c);
1695                                         c = ',';
1696                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1697                                 }
1698                                 printf("\n");
1699                         }
1700         }
1701
1702 And finally the item sets.  These include the GO TO tables and, for
1703 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1704 it up a bit.  First the items, with production number and associativity.
1705
1706         static void report_item(struct grammar *g, int itm)
1707         {
1708                 int p = item_prod(itm);
1709                 int dot = item_dot(itm);
1710                 struct production *pr = g->productions[p];
1711                 int i;
1712
1713                 printf("    ");
1714                 prtxt(pr->head->name);
1715                 printf(" ->");
1716                 for (i = 0; i < pr->body_size; i++) {
1717                         printf(" %s", (dot == i ? ". ": ""));
1718                         prtxt(pr->body[i]->name);
1719                 }
1720                 if (dot == pr->body_size)
1721                         printf(" .");
1722                 printf(" [%d]", p);
1723                 if (pr->precedence && dot == pr->body_size)
1724                         printf(" (%d%s)", pr->precedence,
1725                                assoc_names[pr->assoc]);
1726                 if (dot < pr->body_size &&
1727                     pr->body[dot]->precedence) {
1728                         struct symbol *s = pr->body[dot];
1729                         printf(" [%d%s]", s->precedence,
1730                                assoc_names[s->assoc]);
1731                 }
1732                 if (pr->line_like == 1)
1733                         printf(" $$NEWLINE");
1734                 else if (pr->line_like)
1735                         printf(" $$OUT");
1736                 printf("\n");
1737         }
1738
1739 The LA sets which are (possibly) reported with each item:
1740
1741         static void report_la(struct grammar *g, int lanum)
1742         {
1743                 struct symset la = set_find(g, lanum);
1744                 int i;
1745                 char c = ':';
1746
1747                 printf("        LOOK AHEAD(%d)", lanum);
1748                 for (i = 0; i < la.cnt; i++) {
1749                         printf("%c ", c);
1750                         c = ',';
1751                         prtxt(g->symtab[la.syms[i]]->name);
1752                 }
1753                 printf("\n");
1754         }
1755
1756 Then the go to sets:
1757
1758         static void report_goto(struct grammar *g, struct symset gt)
1759         {
1760                 int i;
1761                 printf("    GOTO:\n");
1762
1763                 for (i = 0; i < gt.cnt; i++) {
1764                         printf("      ");
1765                         prtxt(g->symtab[gt.syms[i]]->name);
1766                         printf(" -> %d\n", gt.data[i]);
1767                 }
1768         }
1769
1770 Now we can report all the item sets complete with items, LA sets, and GO TO.
1771
1772         static void report_itemsets(struct grammar *g)
1773         {
1774                 int s;
1775                 printf("ITEM SETS(%d)\n", g->states);
1776                 for (s = 0; s < g->states; s++) {
1777                         int j;
1778                         struct itemset *is = g->statetab[s];
1779                         printf("  Itemset %d:%s min prefix=%d",
1780                                s, is->starts_line?" (startsline)":"", is->min_prefix);
1781                         if (is->precedence)
1782                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1783                         printf("\n");
1784                         for (j = 0; j < is->items.cnt; j++) {
1785                                 report_item(g, is->items.syms[j]);
1786                                 if (is->items.data != NO_DATA)
1787                                         report_la(g, is->items.data[j]);
1788                         }
1789                         report_goto(g, is->go_to);
1790                 }
1791         }
1792
1793 ### Reporting conflicts
1794
1795 Conflict detection varies a lot among different analysis levels.  However
1796 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1797 LR1 are also similar as they have FOLLOW or LA sets.
1798
1799 ###### functions
1800
1801         ## conflict functions
1802
1803         static int report_conflicts(struct grammar *g, enum grammar_type type)
1804         {
1805                 int cnt = 0;
1806                 printf("Conflicts:\n");
1807                 if (type < SLR)
1808                         cnt = conflicts_lr0(g, type);
1809                 else
1810                         cnt = conflicts_slr(g, type);
1811                 if (cnt == 0)
1812                         printf(" - no conflicts\n");
1813                 return cnt;
1814         }
1815
1816 LR0 conflicts are any state which have both a reducible item and
1817 a shiftable item, or two reducible items.
1818
1819 LR05 conflicts only occur if two possible reductions exist,
1820 as shifts always over-ride reductions.
1821
1822 ###### conflict functions
1823         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1824         {
1825                 int i;
1826                 int cnt = 0;
1827                 for (i = 0; i < g->states; i++) {
1828                         struct itemset *is = g->statetab[i];
1829                         int last_reduce = -1;
1830                         int prev_reduce = -1;
1831                         int last_shift = -1;
1832                         int j;
1833                         if (!is)
1834                                 continue;
1835                         for (j = 0; j < is->items.cnt; j++) {
1836                                 int itm = is->items.syms[j];
1837                                 int p = item_prod(itm);
1838                                 int bp = item_dot(itm);
1839                                 struct production *pr = g->productions[p];
1840
1841                                 if (bp == pr->body_size) {
1842                                         prev_reduce = last_reduce;
1843                                         last_reduce = j;
1844                                         continue;
1845                                 }
1846                                 if (pr->body[bp]->type == Terminal)
1847                                         last_shift = j;
1848                         }
1849                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1850                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1851                                 report_item(g, is->items.syms[last_shift]);
1852                                 report_item(g, is->items.syms[last_reduce]);
1853                                 cnt++;
1854                         }
1855                         if (prev_reduce >= 0) {
1856                                 printf("  State %d has 2 (or more) reducible items\n", i);
1857                                 report_item(g, is->items.syms[prev_reduce]);
1858                                 report_item(g, is->items.syms[last_reduce]);
1859                                 cnt++;
1860                         }
1861                 }
1862                 return cnt;
1863         }
1864
1865 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1866 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1867 in the source of the look ahead set.
1868
1869 We build two datasets to reflect the "action" table: one which maps
1870 terminals to items where that terminal could be shifted and another
1871 which maps terminals to items that could be reduced when the terminal
1872 is in look-ahead.  We report when we get conflicts between the two.
1873
1874 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1875 terminal, we ignore it.  NEWLINES are handled specially with its own
1876 rules for when to shift and when to reduce.  Conflicts are expected,
1877 but handled internally.
1878
1879         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1880         {
1881                 int i;
1882                 int cnt = 0;
1883
1884                 for (i = 0; i < g->states; i++) {
1885                         struct itemset *is = g->statetab[i];
1886                         struct symset shifts = INIT_DATASET;
1887                         struct symset reduce = INIT_DATASET;
1888                         int j;
1889                         if (!is)
1890                                 continue;
1891                         /* First collect the shifts */
1892                         for (j = 0; j < is->items.cnt; j++) {
1893                                 unsigned short itm = is->items.syms[j];
1894                                 int p = item_prod(itm);
1895                                 int bp = item_dot(itm);
1896                                 struct production *pr = g->productions[p];
1897                                 struct symbol *s;
1898
1899                                 if (bp >= pr->body_size ||
1900                                     pr->body[bp]->type != Terminal)
1901                                         /* not shiftable */
1902                                         continue;
1903
1904                                 s = pr->body[bp];
1905                                 if (s->precedence && is->precedence)
1906                                         /* Precedence resolves this, so no conflict */
1907                                         continue;
1908
1909                                 if (symset_find(&shifts, s->num) < 0)
1910                                         symset_add(&shifts, s->num, itm);
1911                         }
1912                         /* Now look for reductions and conflicts */
1913                         for (j = 0; j < is->items.cnt; j++) {
1914                                 unsigned short itm = is->items.syms[j];
1915                                 int p = item_prod(itm);
1916                                 int bp = item_dot(itm);
1917                                 struct production *pr = g->productions[p];
1918
1919                                 if (bp < pr->body_size)
1920                                         continue;
1921                                 /* reducible */
1922                                 struct symset la;
1923                                 if (type == SLR)
1924                                         la = g->follow[pr->head->num];
1925                                 else
1926                                         la = set_find(g, is->items.data[j]);
1927                                 int k;
1928                                 for (k = 0; k < la.cnt; k++) {
1929                                         int pos = symset_find(&shifts, la.syms[k]);
1930                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1931                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1932                                                 cnt++;
1933                                                         prtxt(g->symtab[la.syms[k]]->name);
1934                                                 printf(":\n");
1935                                                 report_item(g, shifts.data[pos]);
1936                                                 report_item(g, itm);
1937                                         }
1938                                         pos = symset_find(&reduce, la.syms[k]);
1939                                         if (pos < 0) {
1940                                                 symset_add(&reduce, la.syms[k], itm);
1941                                                 continue;
1942                                         }
1943                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1944                                         prtxt(g->symtab[la.syms[k]]->name);
1945                                         printf(":\n");
1946                                         report_item(g, itm);
1947                                         report_item(g, reduce.data[pos]);
1948                                         cnt++;
1949                                 }
1950                         }
1951                         symset_free(shifts);
1952                         symset_free(reduce);
1953                 }
1954                 return cnt;
1955         }
1956
1957
1958 ## Generating the parser
1959
1960 The exported part of the parser is the `parse_XX` function, where the name
1961 `XX` is based on the name of the parser files.
1962
1963 This takes a `code_node`, a partially initialized `token_config`, and an
1964 optional `FILE` to send tracing to.  The `token_config` gets the list of
1965 known words added and then is used with the `code_node` to initialize the
1966 scanner.
1967
1968 `parse_XX` then calls the library function `parser_run` to actually complete
1969 the parse.  This needs the `states` table and functions to call the various
1970 pieces of code provided in the grammar file, so they are generated first.
1971
1972 ###### parser_generate
1973
1974         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1975                                struct code_node *pre_reduce)
1976         {
1977                 gen_known(f, g);
1978                 gen_non_term(f, g);
1979                 gen_goto(f, g);
1980                 gen_states(f, g);
1981                 gen_reduce(f, g, file, pre_reduce);
1982                 gen_free(f, g);
1983
1984                 fprintf(f, "#line 0 \"gen_parser\"\n");
1985                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1986                         name);
1987                 fprintf(f, "{\n");
1988                 fprintf(f, "\tstruct token_state *tokens;\n");
1989                 fprintf(f, "\tconfig->words_marks = known;\n");
1990                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1991                 fprintf(f, "\ttokens = token_open(code, config);\n");
1992                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1993                 fprintf(f, "\ttoken_close(tokens);\n");
1994                 fprintf(f, "\treturn rv;\n");
1995                 fprintf(f, "}\n\n");
1996         }
1997
1998 ### Known words table
1999
2000 The known words table is simply an array of terminal symbols.
2001 The table of nonterminals used for tracing is a similar array.
2002
2003 ###### functions
2004
2005         static void gen_known(FILE *f, struct grammar *g)
2006         {
2007                 int i;
2008                 fprintf(f, "#line 0 \"gen_known\"\n");
2009                 fprintf(f, "static const char *known[] = {\n");
2010                 for (i = TK_reserved;
2011                      i < g->num_syms && g->symtab[i]->type == Terminal;
2012                      i++)
2013                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2014                                 g->symtab[i]->name.txt);
2015                 fprintf(f, "};\n\n");
2016         }
2017
2018         static void gen_non_term(FILE *f, struct grammar *g)
2019         {
2020                 int i;
2021                 fprintf(f, "#line 0 \"gen_non_term\"\n");
2022                 fprintf(f, "static const char *non_term[] = {\n");
2023                 for (i = TK_reserved;
2024                      i < g->num_syms;
2025                      i++)
2026                         if (g->symtab[i]->type == Nonterminal)
2027                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2028                                         g->symtab[i]->name.txt);
2029                 fprintf(f, "};\n\n");
2030         }
2031
2032 ### States and the goto tables.
2033
2034 For each state we record the goto table and details of the reducible
2035 production if there is one.
2036 Some of the details of the reducible production are stored in the
2037 `do_reduce` function to come later.  Here we store the production
2038 number, the body size (useful for stack management), and the resulting
2039 symbol (useful for knowing how to free data later).
2040
2041 The go to table is stored in a simple array of `sym` and corresponding
2042 `state`.
2043
2044 ###### exported types
2045
2046         struct lookup {
2047                 short sym;
2048                 short state;
2049         };
2050         struct state {
2051                 short go_to_cnt;
2052                 const struct lookup * go_to;
2053                 short reduce_prod;
2054                 short reduce_size;
2055                 short reduce_sym;
2056                 char starts_line;
2057                 char newline_only;
2058                 short min_prefix;
2059                 short result_size;
2060         };
2061
2062 ###### functions
2063
2064         static void gen_goto(FILE *f, struct grammar *g)
2065         {
2066                 int i;
2067                 fprintf(f, "#line 0 \"gen_goto\"\n");
2068                 for (i = 0; i < g->states; i++) {
2069                         struct symset gt = g->statetab[i]->go_to;
2070                         int j;
2071
2072                         if (gt.cnt == 0)
2073                                 continue;
2074                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2075                                 i);
2076                         for (j = 0; j < gt.cnt; j++)
2077                                 fprintf(f, "\t{ %d, %d },\n",
2078                                         gt.syms[j], gt.data[j]);
2079                         fprintf(f, "};\n");
2080                 }
2081         }
2082
2083         static void gen_states(FILE *f, struct grammar *g)
2084         {
2085                 int i;
2086                 fprintf(f, "#line 0 \"gen_states\"\n");
2087                 fprintf(f, "static const struct state states[] = {\n");
2088                 for (i = 0; i < g->states; i++) {
2089                         struct itemset *is = g->statetab[i];
2090                         int j, prod = -1, prod_len;
2091
2092                         for (j = 0; j < is->items.cnt; j++) {
2093                                 int itm = is->items.syms[j];
2094                                 int p = item_prod(itm);
2095                                 int bp = item_dot(itm);
2096                                 struct production *pr = g->productions[p];
2097
2098                                 if (bp < pr->body_size)
2099                                         continue;
2100                                 /* This is what we reduce - choose longest */
2101                                 if (prod < 0 || prod_len < pr->body_size) {
2102                                         prod = p;
2103                                         prod_len = pr->body_size;
2104                                 }
2105                         }
2106                         if (is->go_to.cnt)
2107                                 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2108                                         i, is->go_to.cnt, i);
2109                         else
2110                                 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2111                         if (prod >= 0) {
2112                                 struct production *pr = g->productions[prod];
2113                                 struct symbol *hd = pr->head;
2114                                 fprintf(f, "%d, %d, %d, %d, %d, %d, ", prod,
2115                                         pr->body_size,
2116                                         pr->head->num,
2117                                         is->starts_line,
2118                                         pr->line_like,
2119                                         is->min_prefix);
2120                                 if (hd->struct_name.txt == NULL)
2121                                         fprintf(f, "0 },\n");
2122                                 else
2123                                         fprintf(f, "sizeof(struct %.*s%s) },\n",
2124                                                 hd->struct_name.len,
2125                                                 hd->struct_name.txt,
2126                                                 hd->isref ? "*" : "");
2127                         } else
2128                                 fprintf(f, "-1, -1, -1, %d, 0, %d, -1 },\n",
2129                                         is->starts_line, is->min_prefix);
2130                 }
2131                 fprintf(f, "};\n\n");
2132         }
2133
2134 ### The `do_reduce` function and the code
2135
2136 When the parser engine decides to reduce a production, it calls
2137 `do_reduce` which runs the code that was included with the production,
2138 if any.
2139
2140 This code needs to be able to store data somewhere.  Rather than
2141 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2142 buffer and have `do_reduce` return the size to be saved.
2143
2144 In order for the code to access "global" context, we pass in the
2145 "config" pointer that was passed to the parser function.  If the `struct
2146 token_config` is embedded in some larger structure, the reducing code
2147 can access the larger structure using pointer manipulation.  Performing
2148 that pointer manipulation, and any other common code or variables for
2149 all reduce actions, can be provided in code node called "reduce" which
2150 is passed around in `pre_reduce` which you might have already noticed.
2151
2152 The code fragments require translation when written out.  Any `$N` needs
2153 to be converted to a reference either to that buffer (if `$0`) or to the
2154 structure returned by a previous reduction.  These pointers need to be
2155 cast to the appropriate type for each access.  All this is handled in
2156 `gen_code`.
2157
2158 `gen_code` also allows symbol references to contain a '`<`' as in
2159 '`$<2`'.  This is particularly useful for references (or pointers), but
2160 can be used with structures too.  The `<` implies that the value is
2161 being moved out, so the object will not be automatically freed.  It is
2162 equivalent to assigning `NULL` to the pointer or filling a structure
2163 with zeros.
2164
2165 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2166 and possibly a number.  A number by itself (other than zero) selects a
2167 symbol from the body of the production.  A sequence of letters selects
2168 the shortest symbol in the body which contains those letters in the
2169 given order.  If a number follows the letters, then a later occurrence
2170 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2171 attached to the second occurrence of the shortest symbol which contains
2172 an `A` followed by a `B`.  If there is no unique shortest system, or if
2173 the number given is too large, then the symbol reference is not
2174 transformed, and will cause an error when the code is compiled.
2175
2176 ###### functions
2177
2178         static int textchr(struct text t, char c, int s)
2179         {
2180                 int i;
2181                 for (i = s; i < t.len; i++)
2182                         if (t.txt[i] == c)
2183                                 return i;
2184                 return -1;
2185         }
2186
2187         static int subseq_match(char *seq, int slen, struct text name)
2188         {
2189                 int st = 0;
2190                 while (slen > 0) {
2191                         st = textchr(name, *seq, st);
2192                         if (st < 0)
2193                                 return 0;
2194                         slen -= 1;
2195                         seq += 1;
2196                         st += 1;
2197                 }
2198                 return 1;
2199         }
2200
2201         static int choose_sym(char **namep, int len, struct production *p)
2202         {
2203                 char *name = *namep;
2204                 char *nam = name;
2205                 int namlen;
2206                 int n = 0;
2207                 int i, s, slen;
2208                 char c;
2209
2210                 c = *name;
2211                 while (len > 0 &&
2212                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2213                         name += 1;
2214                         len -= 1;
2215                         c = *name;
2216                 }
2217                 namlen = name-nam;
2218                 while (len > 0 && (c >= '0' && c <= '9')) {
2219                         name += 1;
2220                         len -= 1;
2221                         n = n * 10 + (c - '0');
2222                         c = *name;
2223                 }
2224                 if (namlen == 0) {
2225                         if (name == *namep || n > p->body_size)
2226                                 return -1;
2227                         *namep = name;
2228                         return n;
2229                 }
2230                 slen = 0; s = -1;
2231                 for (i = 0; i < p->body_size; i++) {
2232                         if (!subseq_match(nam, namlen, p->body[i]->name))
2233                                 continue;
2234                         if (slen == 0 || p->body[i]->name.len < slen) {
2235                                 s = i;
2236                                 slen = p->body[i]->name.len;
2237                         }
2238                         if (s >= 0 && p->body[i] != p->body[s] &&
2239                             p->body[i]->name.len == p->body[s]->name.len)
2240                                 /* not unique, so s cannot be used */
2241                                 s = -1;
2242                 }
2243                 if (s < 0)
2244                         return -1;
2245                 if (n == 0)
2246                         n = 1;
2247                 for (i = 0; i < p->body_size; i++)
2248                         if (p->body[i] == p->body[s]) {
2249                                 n -= 1;
2250                                 if (n == 0)
2251                                         break;
2252                         }
2253                 if (n > 0 || i == p->body_size)
2254                         return -1;
2255                 *namep = name;
2256                 return i + 1;
2257         }
2258
2259         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2260         {
2261                 char *c;
2262                 char *used = calloc(1, p->body_size);
2263                 int i;
2264
2265                 fprintf(f, "\t\t\t");
2266                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2267                         int n;
2268                         int use = 0;
2269                         if (*c != '$') {
2270                                 fputc(*c, f);
2271                                 if (*c == '\n')
2272                                         fputs("\t\t\t", f);
2273                                 continue;
2274                         }
2275                         c++;
2276                         if (*c == '<') {
2277                                 use = 1;
2278                                 c++;
2279                         }
2280                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2281                         if (n < 0) {
2282                                 fputc('$', f);
2283                                 if (use)
2284                                         fputc('<', f);
2285                                 fputc(*c, f);
2286                                 continue;
2287                         }
2288                         if (n == 0)
2289                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2290                                         p->head->struct_name.len,
2291                                         p->head->struct_name.txt,
2292                                         p->head->isref ? "*":"");
2293                         else if (p->body[n-1]->type == Terminal)
2294                                 fprintf(f, "(*(struct token *)body[%d])",
2295                                         n-1);
2296                         else if (p->body[n-1]->struct_name.txt == NULL)
2297                                 fprintf(f, "$%d", n);
2298                         else {
2299                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2300                                         p->body[n-1]->struct_name.len,
2301                                         p->body[n-1]->struct_name.txt,
2302                                         p->body[n-1]->isref ? "*":"", n-1);
2303                                 used[n-1] = use;
2304                         }
2305                         c -= 1;
2306                 }
2307                 fputs("\n", f);
2308                 for (i = 0; i < p->body_size; i++) {
2309                         if (p->body[i]->struct_name.txt &&
2310                             used[i]) {
2311                                 // assume this has been copied out
2312                                 if (p->body[i]->isref)
2313                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2314                                 else
2315                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2316                         }
2317                 }
2318                 free(used);
2319         }
2320
2321 ###### functions
2322
2323         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2324                                struct code_node *pre_reduce)
2325         {
2326                 int i;
2327                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2328                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2329                 fprintf(f, "{\n");
2330                 fprintf(f, "\tint ret_size = 0;\n");
2331                 if (pre_reduce)
2332                         code_node_print(f, pre_reduce, file);
2333
2334                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2335                 fprintf(f, "\tswitch(prod) {\n");
2336                 for (i = 0; i < g->production_count; i++) {
2337                         struct production *p = g->productions[i];
2338                         fprintf(f, "\tcase %d:\n", i);
2339
2340                         if (p->code.txt) {
2341                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2342                                 gen_code(p, f, g);
2343                         }
2344
2345                         if (p->head->struct_name.txt)
2346                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2347                                         p->head->struct_name.len,
2348                                         p->head->struct_name.txt,
2349                                         p->head->isref ? "*":"");
2350
2351                         fprintf(f, "\t\tbreak;\n");
2352                 }
2353                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2354         }
2355
2356 ### `do_free`
2357
2358 As each non-terminal can potentially cause a different type of data
2359 structure to be allocated and filled in, we need to be able to free it when
2360 done.
2361
2362 It is particularly important to have fine control over freeing during error
2363 recovery where individual stack frames might need to be freed.
2364
2365 For this, the grammar author is required to defined a `free_XX` function for
2366 each structure that is used by a non-terminal.  `do_free` will call whichever
2367 is appropriate given a symbol number, and will call `free` (as is
2368 appropriate for tokens) on any terminal symbol.
2369
2370 ###### functions
2371
2372         static void gen_free(FILE *f, struct grammar *g)
2373         {
2374                 int i;
2375
2376                 fprintf(f, "#line 0 \"gen_free\"\n");
2377                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2378                 fprintf(f, "{\n");
2379                 fprintf(f, "\tif (!asn) return;\n");
2380                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2381                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2382                 fprintf(f, "\tswitch(sym) {\n");
2383
2384                 for (i = 0; i < g->num_syms; i++) {
2385                         struct symbol *s = g->symtab[i];
2386                         if (!s ||
2387                             s->type != Nonterminal ||
2388                             s->struct_name.txt == NULL)
2389                                 continue;
2390
2391                         fprintf(f, "\tcase %d:\n", s->num);
2392                         if (s->isref) {
2393                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2394                                         s->struct_name.len,
2395                                         s->struct_name.txt);
2396                                 fprintf(f, "\t\tfree(asn);\n");
2397                         } else
2398                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2399                                         s->struct_name.len,
2400                                         s->struct_name.txt);
2401                         fprintf(f, "\t\tbreak;\n");
2402                 }
2403                 fprintf(f, "\t}\n}\n\n");
2404         }
2405
2406 ## The main routine.
2407
2408 There are three key parts to the "main" function of parsergen: processing
2409 the arguments, loading the grammar file, and dealing with the grammar.
2410
2411 ### Argument processing.
2412
2413 Command line options allow the selection of analysis mode, name of output
2414 file, and whether or not a report should be generated.  By default we create
2415 a report only if no code output was requested.
2416
2417 The `parse_XX` function name uses the basename of the output file which
2418 should not have a suffix (`.c`).  `.c` is added to the given name for the
2419 code, and `.h` is added for the header (if header text is specifed in the
2420 grammar file).
2421
2422 ###### includes
2423         #include <getopt.h>
2424
2425 ###### declarations
2426         static const struct option long_options[] = {
2427                 { "LR0",        0, NULL, '0' },
2428                 { "LR05",       0, NULL, '5' },
2429                 { "SLR",        0, NULL, 'S' },
2430                 { "LALR",       0, NULL, 'L' },
2431                 { "LR1",        0, NULL, '1' },
2432                 { "tag",        1, NULL, 't' },
2433                 { "report",     0, NULL, 'R' },
2434                 { "output",     1, NULL, 'o' },
2435                 { NULL,         0, NULL, 0   }
2436         };
2437         const char *options = "05SL1t:Ro:";
2438
2439 ###### process arguments
2440         int opt;
2441         char *outfile = NULL;
2442         char *infile;
2443         char *name;
2444         char *tag = NULL;
2445         int report = 1;
2446         enum grammar_type type = LR05;
2447         while ((opt = getopt_long(argc, argv, options,
2448                                   long_options, NULL)) != -1) {
2449                 switch(opt) {
2450                 case '0':
2451                         type = LR0; break;
2452                 case '5':
2453                         type = LR05; break;
2454                 case 'S':
2455                         type = SLR; break;
2456                 case 'L':
2457                         type = LALR; break;
2458                 case '1':
2459                         type = LR1; break;
2460                 case 'R':
2461                         report = 2; break;
2462                 case 'o':
2463                         outfile = optarg; break;
2464                 case 't':
2465                         tag = optarg; break;
2466                 default:
2467                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2468                         exit(1);
2469                 }
2470         }
2471         if (optind < argc)
2472                 infile = argv[optind++];
2473         else {
2474                 fprintf(stderr, "No input file given\n");
2475                 exit(1);
2476         }
2477         if (outfile && report == 1)
2478                 report = 0;
2479         name = outfile;
2480         if (name && strchr(name, '/'))
2481                 name = strrchr(name, '/')+1;
2482
2483         if (optind < argc) {
2484                 fprintf(stderr, "Excess command line arguments\n");
2485                 exit(1);
2486         }
2487
2488 ### Loading the grammar file
2489
2490 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2491 map it.
2492
2493 Once we have extracted the code (with `mdcode`) we expect to find three
2494 or four sections: header, code, grammar, reduce.
2495 Anything else that is not excluded by the `--tag` option is an error.
2496
2497 "header", "code", and "reduce" are optional, though it is hard to build
2498 a working parser without the first two.  "grammar" must be provided.
2499
2500 ###### includes
2501         #include <fcntl.h>
2502         #include <sys/mman.h>
2503         #include <errno.h>
2504
2505 ###### functions
2506         static int errs;
2507         static void pr_err(char *msg)
2508         {
2509                 errs++;
2510                 fprintf(stderr, "%s\n", msg);
2511         }
2512
2513 ###### load file
2514         struct section *table;
2515         int fd;
2516         int len;
2517         char *file;
2518         fd = open(infile, O_RDONLY);
2519         if (fd < 0) {
2520                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2521                         infile, strerror(errno));
2522                 exit(1);
2523         }
2524         len = lseek(fd, 0, 2);
2525         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2526         table = code_extract(file, file+len, pr_err);
2527
2528         struct code_node *hdr = NULL;
2529         struct code_node *code = NULL;
2530         struct code_node *gram = NULL;
2531         struct code_node *pre_reduce = NULL;
2532         for (s = table; s; s = s->next) {
2533                 struct text sec = s->section;
2534                 if (tag && !strip_tag(&sec, tag))
2535                         continue;
2536                 if (text_is(sec, "header"))
2537                         hdr = s->code;
2538                 else if (text_is(sec, "code"))
2539                         code = s->code;
2540                 else if (text_is(sec, "grammar"))
2541                         gram = s->code;
2542                 else if (text_is(sec, "reduce"))
2543                         pre_reduce = s->code;
2544                 else {
2545                         fprintf(stderr, "Unknown content section: %.*s\n",
2546                                 s->section.len, s->section.txt);
2547                         rv |= 2;
2548                 }
2549         }
2550
2551 ### Processing the input
2552
2553 As we need to append an extention to a filename and then open it for
2554 writing, and we need to do this twice, it helps to have a separate function.
2555
2556 ###### functions
2557
2558         static FILE *open_ext(char *base, char *ext)
2559         {
2560                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2561                 FILE *f;
2562                 strcat(strcpy(fn, base), ext);
2563                 f = fopen(fn, "w");
2564                 free(fn);
2565                 return f;
2566         }
2567
2568 If we can read the grammar, then we analyse and optionally report on it.  If
2569 the report finds conflicts we will exit with an error status.
2570
2571 ###### process input
2572         struct grammar *g = NULL;
2573         if (gram == NULL) {
2574                 fprintf(stderr, "No grammar section provided\n");
2575                 rv |= 2;
2576         } else {
2577                 g = grammar_read(gram);
2578                 if (!g) {
2579                         fprintf(stderr, "Failure to parse grammar\n");
2580                         rv |= 2;
2581                 }
2582         }
2583         if (g) {
2584                 grammar_analyse(g, type);
2585                 if (report)
2586                         if (grammar_report(g, type))
2587                                 rv |= 1;
2588         }
2589
2590 If a "headers" section is defined, we write it out and include a declaration
2591 for the `parse_XX` function so it can be used from separate code.
2592
2593         if (rv == 0 && hdr && outfile) {
2594                 FILE *f = open_ext(outfile, ".h");
2595                 if (f) {
2596                         code_node_print(f, hdr, infile);
2597                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2598                                 name);
2599                         fclose(f);
2600                 } else {
2601                         fprintf(stderr, "Cannot create %s.h\n",
2602                                 outfile);
2603                         rv |= 4;
2604                 }
2605         }
2606
2607 And if all goes well and an output file was provided, we create the `.c`
2608 file with the code section (if any) and the parser tables and function.
2609
2610         if (rv == 0 && outfile) {
2611                 FILE *f = open_ext(outfile, ".c");
2612                 if (f) {
2613                         if (code)
2614                                 code_node_print(f, code, infile);
2615                         gen_parser(f, g, infile, name, pre_reduce);
2616                         fclose(f);
2617                 } else {
2618                         fprintf(stderr, "Cannot create %s.c\n",
2619                                 outfile);
2620                         rv |= 4;
2621                 }
2622         }
2623
2624 And that about wraps it up.  We need to set the locale so that UTF-8 is
2625 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2626
2627 ###### File: parsergen.mk
2628         parsergen : parsergen.o libscanner.o libmdcode.o
2629                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2630
2631 ###### includes
2632         #include <locale.h>
2633
2634 ###### main
2635
2636         int main(int argc, char *argv[])
2637         {
2638                 struct section *s;
2639                 int rv = 0;
2640
2641                 setlocale(LC_ALL,"");
2642
2643                 ## process arguments
2644                 ## load file
2645                 ## process input
2646
2647                 return rv;
2648         }
2649
2650 ## The SHIFT/REDUCE parser
2651
2652 Having analysed the grammar and generated all the tables, we only need
2653 the shift/reduce engine to bring it all together.
2654
2655 ### Goto table lookup
2656
2657 The parser generator has nicely provided us with goto tables sorted by
2658 symbol number.  We need a binary search function to find a symbol in the
2659 table.
2660
2661 ###### parser functions
2662
2663         static int search(const struct state *l, int sym)
2664         {
2665                 int lo = 0;
2666                 int hi = l->go_to_cnt;
2667
2668                 if (hi == 0)
2669                         return -1;
2670                 while (lo + 1 < hi) {
2671                         int mid = (lo + hi) / 2;
2672                         if (l->go_to[mid].sym <= sym)
2673                                 lo = mid;
2674                         else
2675                                 hi = mid;
2676                 }
2677                 if (l->go_to[lo].sym == sym)
2678                         return l->go_to[lo].state;
2679                 else
2680                         return -1;
2681         }
2682
2683 ### Memory allocation
2684
2685 The `scanner` returns tokens in a local variable - we want them in allocated
2686 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2687 make an allocated copy as required.
2688
2689 ###### parser includes
2690         #include <memory.h>
2691
2692 ###### parser functions
2693
2694         static struct token *tok_copy(struct token tk)
2695         {
2696                 struct token *new = malloc(sizeof(*new));
2697                 *new = tk;
2698                 return new;
2699         }
2700
2701 ### The state stack.
2702
2703 The core data structure for the parser is the stack.  This tracks all
2704 the symbols that have been recognised or partially recognised.
2705
2706 The stack usually won't grow very large - maybe a few tens of entries.
2707 So we dynamically grow an array as required but never bother to
2708 shrink it down again.
2709
2710 We keep the stack as two separate allocations.  One, `asn_stack` stores
2711 the "abstract syntax nodes" which are created by each reduction.  When
2712 we call `do_reduce` we need to pass an array of the `asn`s of the body
2713 of the production, and by keeping a separate `asn` stack, we can just
2714 pass a pointer into this stack.
2715
2716 The other allocation stores all other stack fields of which there are
2717 several.  The `state` is the most important one and guides the parsing
2718 process.  The `sym` is nearly unnecessary as it is implicit in the
2719 state.  However when we want to free entries from the `asn_stack`, it
2720 helps to know what type they are so we can call the right freeing
2721 function.  The symbol leads us to the right free function through
2722 `do_free`.
2723
2724 The `indents` count tracks the line indents with-in the symbol or
2725 immediately follow it.  These are used to allow indent information to
2726 guide parsing and error recovery.
2727
2728 `since_newline` tracks how many stack frames since the last
2729 start-of-line (whether indented or not).  So if `since_newline` is
2730 zero, then this symbol is at the start of a line.  Similarly
2731 `since_indent` counts the number of states since an indent, it is zero
2732 precisely when `indents` is not zero.
2733
2734 `newline_permitted` keeps track of whether newlines should be ignored
2735 or not.
2736
2737 The stack is most properly seen as alternating states and symbols -
2738 states, like the 'DOT' in items, are between symbols.  Each frame in
2739 our stack holds a state and the symbol that was before it.  The
2740 bottom of stack holds the start state but no symbol, as nothing came
2741 before the beginning.  As we need to store some value, `TK_eof` is used
2742 to mark the beginning of the file as well as the end.
2743
2744 ###### parser functions
2745
2746         struct parser {
2747                 struct frame {
2748                         short state;
2749                         short newline_permitted;
2750
2751                         short sym;
2752                         short indents;
2753                         short since_newline;
2754                         short since_indent;
2755                 } *stack;
2756                 void **asn_stack;
2757                 int stack_size;
2758                 int tos;
2759         };
2760
2761 #### Shift and pop
2762
2763 Two operations are needed on the stack - shift (which is like push) and pop.
2764
2765 Shift applies not only to terminals but also to non-terminals.  When we
2766 reduce a production we will pop off frames corresponding to the body
2767 symbols, then push on a frame for the head of the production.  This last
2768 is exactly the same process as shifting in a terminal so we use the same
2769 function for both.  In both cases we provide the symbol, the number of
2770 indents the symbol contains (which will be zero for a terminal symbol)
2771 and a flag indicating the the symbol was at (or was reduced from a
2772 symbol which was at) the start of a line.  The state is deduced from the
2773 current top-of-stack state and the new symbol.
2774
2775 To simplify other code we arrange for `shift` to fail if there is no `goto`
2776 state for the symbol.  This is useful in basic parsing due to our design
2777 that we shift when we can, and reduce when we cannot.  So the `shift`
2778 function reports if it could.
2779
2780 `shift` is also used to push state zero onto the stack, so if the
2781 stack is empty, it always chooses zero as the next state.
2782
2783 So `shift` finds the next state.  If that succeeds it extends the
2784 allocations if needed and pushes all the information onto the stacks.
2785
2786 Newlines are permitted after a `starts_line` state until an internal
2787 indent.  If the new frame has neither a `starts_line` state nor an
2788 indent, newlines are permitted if the previous stack frame permitted
2789 them.
2790
2791 ###### parser functions
2792
2793         static int shift(struct parser *p,
2794                          short sym, short indents, short start_of_line,
2795                          void *asn,
2796                          const struct state states[])
2797         {
2798                 // Push an entry onto the stack
2799                 struct frame next = {0};
2800                 int newstate = p->tos
2801                         ? search(&states[p->stack[p->tos-1].state],
2802                                  sym)
2803                         : 0;
2804                 if (newstate < 0)
2805                         return 0;
2806                 if (p->tos >= p->stack_size) {
2807                         p->stack_size += 10;
2808                         p->stack = realloc(p->stack, p->stack_size
2809                                            * sizeof(p->stack[0]));
2810                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2811                                            * sizeof(p->asn_stack[0]));
2812                 }
2813                 next.sym = sym;
2814                 next.indents = indents;
2815                 next.state = newstate;
2816                 if (states[newstate].starts_line)
2817                         next.newline_permitted = 1;
2818                 else if (indents)
2819                         next.newline_permitted = 0;
2820                 else if (p->tos)
2821                         next.newline_permitted =
2822                                 p->stack[p->tos-1].newline_permitted;
2823                 else
2824                         next.newline_permitted = 0;
2825
2826                 if (!start_of_line) {
2827                         if (p->tos)
2828                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2829                         else
2830                                 next.since_newline = 1;
2831                 }
2832                 if (indents)
2833                         next.since_indent = 0;
2834                 else if (p->tos)
2835                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2836                 else
2837                         next.since_indent = 1;
2838
2839                 p->stack[p->tos] = next;
2840                 p->asn_stack[p->tos] = asn;
2841                 p->tos++;
2842                 return 1;
2843         }
2844
2845 `pop` primarily moves the top of stack (`tos`) back down the required
2846 amount and frees any `asn` entries that need to be freed.  It also
2847 collects a summary of the indents and line starts in the symbols that
2848 are being removed. It is called _after_ we reduce a production, just
2849 before we `shift` the nonterminal in.
2850
2851 ###### parser functions
2852
2853         static int pop(struct parser *p, int num,
2854                        short *start_of_line,
2855                        void(*do_free)(short sym, void *asn))
2856         {
2857                 int i;
2858                 short indents = 0;
2859                 int sol = 0;
2860
2861                 p->tos -= num;
2862                 for (i = 0; i < num; i++) {
2863                         sol |= !p->stack[p->tos+i].since_newline;
2864                         indents += p->stack[p->tos+i].indents;
2865                         do_free(p->stack[p->tos+i].sym,
2866                                 p->asn_stack[p->tos+i]);
2867                 }
2868                 if (start_of_line)
2869                         *start_of_line = sol;
2870                 return indents;
2871         }
2872
2873 ### The heart of the parser.
2874
2875 Now we have the parser.  For each token we might shift it, trigger a
2876 reduction, or start error handling.  2D tokens (IN, OUT, EOL) also need
2877 to be handled.
2878
2879 We return whatever `asn` was returned by reducing production zero.
2880
2881 If we can neither shift nor reduce we have an error to handle.  We pop
2882 single entries off the stack until we can shift the `TK_error` symbol,
2883 then drop input tokens until we find one we can shift into the new error
2884 state.  We need to ensure that something is dropped or shifted after an
2885 error, or we could get into an infinite loop, only shifting in
2886 `TK_error`, then reducing.  So we track if there has been a shift since
2887 the last error, and if not the next error always discards one token.
2888
2889 When we find `TK_in` and `TK_out` tokens which report indents we need
2890 to handle them directly as the grammar cannot express what we want to
2891 do with them.
2892
2893 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2894 record how many indents there are following the previous token.
2895
2896 `TK_out` tokens must be canceled against an indent count
2897 within the stack.  If we can reduce some symbols that are all since
2898 the most recent indent, then we do that first.  If the minimum prefix
2899 of the current state then extends back before the most recent indent,
2900 that indent can be cancelled.  If the minimum prefix is shorter then
2901 the indent had ended prematurely and we must start error handling, which
2902 is still a work-in-progress.
2903
2904 `TK_newline` tokens are ignored unless the top stack frame records
2905 that they are permitted.  In that case they will not be considered for
2906 shifting if it is possible to reduce some symbols that are all since
2907 the most recent start of line.  This is how a newline forcibly
2908 terminates any line-like structure - we try to reduce down to at most
2909 one symbol for each line where newlines are allowed.
2910 A consequence of this is that a rule like
2911
2912 ###### Example: newlines - broken
2913
2914         Newlines ->
2915                 | NEWLINE Newlines
2916         IfStatement -> Newlines if ....
2917
2918 cannot work, as the NEWLINE will never be shifted as the empty string
2919 will be reduced first.  Optional sets of newlines need to be include
2920 in the thing that preceed:
2921
2922 ###### Example: newlines - works
2923
2924         If -> if
2925                 | NEWLINE If
2926         IfStatement -> If ....
2927
2928 Here the NEWLINE will be shifted because nothing can be reduced until
2929 the `if` is seen.
2930
2931 When during error handling we discard tokens read in, we want to keep
2932 discarding until we see one that is recognised.  If we had a full set
2933 of LR(1) grammar states, this would mean looking in the look-ahead set,
2934 but we don't keep a full look-ahead set.  We only record the subset
2935 that leads to SHIFT.  We can, however, deduce the look-ahead set by
2936 looking at the SHIFT subsets for all states that we can get to by
2937 reducing zero or more times.  So we need a little function which
2938 checks if a given token is in any of these look-ahead sets.
2939
2940 ###### parser includes
2941         #include "parser.h"
2942
2943 ###### parser_run
2944
2945         static int in_lookahead(struct token *tk, const struct state *states, int state)
2946         {
2947                 while (state >= 0) {
2948                         if (search(&states[state], tk->num) >= 0)
2949                                 return 1;
2950                         if (states[state].reduce_prod < 0)
2951                                 return 0;
2952                         state = search(&states[state], states[state].reduce_sym);
2953                 }
2954                 return 0;
2955         }
2956
2957         void *parser_run(struct token_state *tokens,
2958                          const struct state states[],
2959                          int (*do_reduce)(int, void**, struct token_config*, void*),
2960                          void (*do_free)(short, void*),
2961                          FILE *trace, const char *non_term[],
2962                          struct token_config *config)
2963         {
2964                 struct parser p = { 0 };
2965                 struct token *tk = NULL;
2966                 int accepted = 0;
2967                 int shift_since_err = 1;
2968                 void *ret = NULL;
2969
2970                 shift(&p, TK_eof, 0, 1, NULL, states);
2971                 while (!accepted && p.tos > 0) {
2972                         struct token *err_tk;
2973                         struct frame *tos = &p.stack[p.tos-1];
2974                         if (!tk)
2975                                 tk = tok_copy(token_next(tokens));
2976                         parser_trace(trace, &p,
2977                                      tk, states, non_term, config->known_count);
2978
2979                         if (tk->num == TK_in) {
2980                                 tos->indents += 1;
2981                                 tos->since_newline = 0;
2982                                 tos->since_indent = 0;
2983                                 if (!states[tos->state].starts_line)
2984                                         tos->newline_permitted = 0;
2985                                 free(tk);
2986                                 tk = NULL;
2987                                 parser_trace_action(trace, "Record");
2988                                 continue;
2989                         }
2990                         if (tk->num == TK_out) {
2991                                 if (states[tos->state].reduce_size >= 0 &&
2992                                     states[tos->state].reduce_size <= tos->since_indent)
2993                                         goto force_reduce;
2994                                 if (states[tos->state].min_prefix >= tos->since_indent) {
2995                                         // OK to cancel
2996                                         struct frame *in = tos - tos->since_indent;
2997                                         in->indents -= 1;
2998                                         if (in->indents == 0) {
2999                                                 /* Reassess since_indent and newline_permitted */
3000                                                 if (in > p.stack) {
3001                                                         in->since_indent = in[-1].since_indent + 1;
3002                                                         in->newline_permitted = in[-1].newline_permitted;
3003                                                 } else {
3004                                                         in->since_indent = 0;
3005                                                         in->newline_permitted = 0;
3006                                                 }
3007                                                 if (states[in->state].starts_line)
3008                                                         in->newline_permitted = 1;
3009                                                 while (in < tos) {
3010                                                         in += 1;
3011                                                         in->since_indent = in[-1].since_indent + 1;
3012                                                         if (states[in->state].starts_line)
3013                                                                 in->newline_permitted = 1;
3014                                                         else
3015                                                                 in->newline_permitted = in[-1].newline_permitted;
3016                                                 }
3017                                         }
3018                                         free(tk);
3019                                         tk = NULL;
3020                                         parser_trace_action(trace, "Cancel");
3021                                         continue;
3022                                 }
3023                                 // fall through to error handling as both SHIFT and REDUCE
3024                                 // will fail.
3025                         }
3026                         if (tk->num == TK_newline) {
3027                                 if (!tos->newline_permitted) {
3028                                         free(tk);
3029                                         tk = NULL;
3030                                         parser_trace_action(trace, "Discard");
3031                                         continue;
3032                                 }
3033                                 if (tos->since_newline > 1 &&
3034                                     states[tos->state].reduce_size >= 0 &&
3035                                     states[tos->state].reduce_size <= tos->since_newline)
3036                                         goto force_reduce;
3037                         }
3038                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3039                                 shift_since_err = 1;
3040                                 tk = NULL;
3041                                 parser_trace_action(trace, "Shift");
3042                                 continue;
3043                         }
3044                 force_reduce:
3045                         if (states[tos->state].reduce_prod >= 0 &&
3046                             states[tos->state].newline_only &&
3047                             !(tk->num == TK_newline ||
3048                               tk->num == TK_eof ||
3049                               tk->num == TK_out ||
3050                               (tos->indents == 0 && tos->since_newline == 0))) {
3051                                 /* Anything other than newline or out or eof
3052                                  * in an error unless we are already at start
3053                                  * of line, as this production must end at EOL.
3054                                  */
3055                         } else if (states[tos->state].reduce_prod >= 0) {
3056                                 void **body;
3057                                 void *res;
3058                                 const struct state *nextstate = &states[tos->state];
3059                                 int prod = nextstate->reduce_prod;
3060                                 int size = nextstate->reduce_size;
3061                                 int res_size = nextstate->result_size;
3062                                 short indents, start_of_line;
3063
3064                                 body = p.asn_stack + (p.tos - size);
3065                                 res = res_size ? calloc(1, res_size) : NULL;
3066                                 res_size = do_reduce(prod, body, config, res);
3067                                 if (res_size != nextstate->result_size)
3068                                         abort();
3069
3070                                 indents = pop(&p, size, &start_of_line,
3071                                               do_free);
3072
3073                                 if (!shift(&p, nextstate->reduce_sym,
3074                                            indents, start_of_line,
3075                                            res, states)) {
3076                                         if (prod != 0) abort();
3077                                         accepted = 1;
3078                                         ret = res;
3079                                 }
3080                                 parser_trace_action(trace, "Reduce");
3081                                 continue;
3082                         }
3083                         /* Error. We walk up the stack until we
3084                          * find a state which will accept TK_error.
3085                          * We then shift in TK_error and see what state
3086                          * that takes us too.
3087                          * Then we discard input tokens until
3088                          * we find one that is acceptable.
3089                          */
3090                         parser_trace_action(trace, "ERROR");
3091                         short indents = 0, start_of_line;
3092
3093                         err_tk = tok_copy(*tk);
3094                         while (p.tos > 0 &&
3095                                shift(&p, TK_error, 0, 0,
3096                                      err_tk, states) == 0)
3097                                 // discard this state
3098                                 indents += pop(&p, 1, &start_of_line, do_free);
3099                         if (p.tos == 0) {
3100                                 free(err_tk);
3101                                 // no state accepted TK_error
3102                                 break;
3103                         }
3104                         if (!shift_since_err) {
3105                                 /* must discard at least one token to avoid
3106                                  * infinite loop.
3107                                  */
3108                                 if (tk->num == TK_eof)
3109                                         break;
3110                                 free(tk);
3111                                 tk = tok_copy(token_next(tokens));
3112                         }
3113                         shift_since_err = 0;
3114                         tos = &p.stack[p.tos-1];
3115                         while (!in_lookahead(tk, states, tos->state) &&
3116                                tk->num != TK_eof) {
3117                                 free(tk);
3118                                 tk = tok_copy(token_next(tokens));
3119                                 shift_since_err = 1;
3120                                 if (tk->num == TK_in)
3121                                         indents += 1;
3122                                 if (tk->num == TK_out) {
3123                                         if (indents == 0)
3124                                                 break;
3125                                         indents -= 1;
3126                                         // FIXME update since_indent here
3127                                 }
3128                         }
3129                         tos->indents += indents;
3130                 }
3131                 free(tk);
3132                 pop(&p, p.tos, NULL, do_free);
3133                 free(p.asn_stack);
3134                 free(p.stack);
3135                 return ret;
3136         }
3137
3138 ###### exported functions
3139         void *parser_run(struct token_state *tokens,
3140                          const struct state states[],
3141                          int (*do_reduce)(int, void**, struct token_config*, void*),
3142                          void (*do_free)(short, void*),
3143                          FILE *trace, const char *non_term[],
3144                          struct token_config *config);
3145
3146 ### Tracing
3147
3148 Being able to visualize the parser in action can be invaluable when
3149 debugging the parser code, or trying to understand how the parse of a
3150 particular grammar progresses.  The stack contains all the important
3151 state, so just printing out the stack every time around the parse loop
3152 can make it possible to see exactly what is happening.
3153
3154 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3155 are easily deduced from the change between consecutive lines, and the
3156 details of each state can be found by cross referencing the states list
3157 in the stack with the "report" that parsergen can generate.
3158
3159 For terminal symbols, we just dump the token.  For non-terminals we
3160 print the name of the symbol.  The look ahead token is reported at the
3161 end inside square brackets.
3162
3163 ###### parser functions
3164
3165         static char *reserved_words[] = {
3166                 [TK_error]        = "ERROR",
3167                 [TK_in]           = "IN",
3168                 [TK_out]          = "OUT",
3169                 [TK_newline]      = "NEWLINE",
3170                 [TK_eof]          = "$eof",
3171         };
3172         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3173         {
3174                 fprintf(trace, "(%d", f->state);
3175                 if (states[f->state].starts_line)
3176                         fprintf(trace, "s");
3177                 if (f->newline_permitted)
3178                         fprintf(trace, "n%d", f->since_newline);
3179                 fprintf(trace, ") ");
3180         }
3181
3182         void parser_trace(FILE *trace, struct parser *p,
3183                           struct token *tk, const struct state states[],
3184                           const char *non_term[], int knowns)
3185         {
3186                 int i;
3187                 if (!trace)
3188                         return;
3189                 for (i = 0; i < p->tos; i++) {
3190                         struct frame *f = &p->stack[i];
3191                         if (i) {
3192                                 int sym = f->sym;
3193                                 if (sym < TK_reserved &&
3194                                     reserved_words[sym] != NULL)
3195                                         fputs(reserved_words[sym], trace);
3196                                 else if (sym < TK_reserved + knowns) {
3197                                         struct token *t = p->asn_stack[i];
3198                                         text_dump(trace, t->txt, 20);
3199                                 } else
3200                                         fputs(non_term[sym - TK_reserved - knowns],
3201                                               trace);
3202                                 if (f->indents)
3203                                         fprintf(trace, ".%d", f->indents);
3204                                 if (f->since_newline == 0)
3205                                         fputs("/", trace);
3206                                 fputs(" ", trace);
3207                         }
3208                         parser_trace_state(trace, f, states);
3209                 }
3210                 fprintf(trace, "[");
3211                 if (tk->num < TK_reserved &&
3212                     reserved_words[tk->num] != NULL)
3213                         fputs(reserved_words[tk->num], trace);
3214                 else
3215                         text_dump(trace, tk->txt, 20);
3216                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3217         }
3218
3219         void parser_trace_action(FILE *trace, char *action)
3220         {
3221                 if (trace)
3222                         fprintf(trace, " - %s\n", action);
3223         }
3224
3225 # A Worked Example
3226
3227 The obvious example for a parser is a calculator.
3228
3229 As `scanner` provides number parsing function using `libgmp` it is not much
3230 work to perform arbitrary rational number calculations.
3231
3232 This calculator takes one expression, or an equality test, per line.
3233 The results are printed and if any equality test fails, the program
3234 exits with an error.
3235
3236 ###### File: parsergen.mk
3237         calc.c calc.h : parsergen parsergen.mdc
3238                 ./parsergen --tag calc -o calc parsergen.mdc
3239         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3240                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3241         calctest : calc
3242                 ./calc parsergen.mdc
3243         demos :: calctest
3244
3245 # calc: header
3246
3247         #include "parse_number.h"
3248         // what do we use for a demo-grammar?  A calculator of course.
3249         struct number {
3250                 mpq_t val;
3251                 char tail[2];
3252                 int err;
3253         };
3254
3255 # calc: code
3256
3257         #include <stdlib.h>
3258         #include <unistd.h>
3259         #include <fcntl.h>
3260         #include <sys/mman.h>
3261         #include <stdio.h>
3262         #include <malloc.h>
3263         #include <gmp.h>
3264         #include <string.h>
3265         #include "mdcode.h"
3266         #include "scanner.h"
3267         #include "parser.h"
3268
3269         #include "calc.h"
3270
3271         static void free_number(struct number *n)
3272         {
3273                 mpq_clear(n->val);
3274                 free(n);
3275         }
3276
3277         static int text_is(struct text t, char *s)
3278         {
3279                 return (strlen(s) == t.len &&
3280                         strncmp(s, t.txt, t.len) == 0);
3281         }
3282
3283         int main(int argc, char *argv[])
3284         {
3285                 int fd = open(argv[1], O_RDONLY);
3286                 int len = lseek(fd, 0, 2);
3287                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3288                 struct section *table = code_extract(file, file+len, NULL);
3289                 struct section *s;
3290                 struct token_config config = {
3291                         .ignored = (1 << TK_line_comment)
3292                                  | (1 << TK_in)
3293                                  | (1 << TK_out),
3294                         .number_chars = ".,_+-",
3295                         .word_start = "",
3296                         .word_cont = "",
3297                 };
3298                 for (s = table; s; s = s->next)
3299                         if (text_is(s->section, "example: input"))
3300                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3301                 while (table) {
3302                         struct section *t = table->next;
3303                         code_free(table->code);
3304                         free(table);
3305                         table = t;
3306                 }
3307                 exit(0);
3308         }
3309
3310 # calc: grammar
3311
3312         $LEFT + -
3313         $LEFT * / //
3314
3315         Session -> Session Line
3316                 | Line
3317
3318         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3319                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3320                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3321                                         mpf_clear(fl);
3322                                         }
3323                                      }$
3324                 | Expression = Expression NEWLINE ${
3325                         if (mpq_equal($1.val, $3.val))
3326                                 gmp_printf("Both equal %Qd\n", $1.val);
3327                         else {
3328                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3329                                         $1.val, $3.val);
3330                                 exit(1);
3331                         }
3332                 }$
3333                 | NEWLINE ${ printf("Blank line\n"); }$
3334                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3335
3336         $number
3337         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3338                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3339                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3340                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3341                 | Expression // Expression ${ {
3342                         mpz_t z0, z1, z2;
3343                         mpq_init($0.val);
3344                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3345                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3346                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3347                         mpz_tdiv_q(z0, z1, z2);
3348                         mpq_set_z($0.val, z0);
3349                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3350                 } }$
3351                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3352                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3353
3354 # example: input
3355
3356         355/113
3357         3.1415926535 - 355/113
3358         2 + 4 * 5
3359         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3360         10 * 9 / 2
3361         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3362
3363         355//113
3364
3365         error