ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8
   9 There are several distinct sections.
  10
  11 1. `grammar_read` will read a grammar from a literate-code file and
  12    store the productions, symbols, and code fragments.
  13 2. `grammar_analyse` will take that grammar and build LR parsing
  14    states and look-ahead tables.
  15 3. `grammar_report` will present the details of the analysis
  16    in a readable format and will report any conflicts.
  17 4. `parser_generate` will write out C code files with various
  18    tables and with the code fragments arranged in useful places.
  19 5. `parser_run` is a library function intended to be linked together
  20    with the generated parser tables to complete the implementation of
  21    a parser.
  22 6. Finally `calc` is a test grammar for a simple calculator.  The
  23    `parsergen` program built from the C code in this file can extract
  24    that grammar directly from this file and process it.
  25
  26 ###### File: parsergen.c
  27         #include <unistd.h>
  28         #include <stdlib.h>
  29         #include <stdio.h>
  30         ## includes
  31         ## forward declarations
  32         ## declarations
  33         ## functions
  34         ## grammar_read
  35         ## grammar_analyse
  36         ## grammar_report
  37         ## parser_generate
  38         ## main
  39 ###### File: parser.h
  40         ## exported types
  41         ## exported functions
  42 ###### File: libparser.c
  43         #include <unistd.h>
  44         #include <stdlib.h>
  45         #include <stdio.h>
  46         ## parser includes
  47         ## parser functions
  48         ## parser_run
  49 ###### File: parsergen.mk
  50         CFLAGS += -Wall -g
  51         all :: parsergen calc
  52         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  53                 ./md2c parsergen.mdc
  54
  55 ## Reading the grammar
  56
  57 The grammar must be stored in a markdown literate code file as parsed by
  58 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  59 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  60 present, will be literally copied into the generated `.c` and `.h`
  61 files.  The third is required and contains the grammar which is
  62 tokenised with "[scanner][]".  `reduce` can provide variable
  63 declarations and common intialization code for all the reduction code
  64 fragments in the grammar.
  65
  66 If the `--tag` option is given, then any top level header that doesn't
  67 start with the tag is ignored, and the tag is striped from the rest.  So
  68 `--tag Foo` means that the four sections expected are `Foo: header`,
  69 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  70 to extract the sample calculator from this file.
  71
  72 [mdcode]: mdcode.html
  73 [scanner]: scanner.html
  74
  75 ###### includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 ###### parser includes
  80         #include "mdcode.h"
  81         #include "scanner.h"
  82
  83 The grammar contains production sets, precedence/associativity
  84 declarations, general terminal declarations, and data type declarations.
  85 These are all parsed with _ad hoc_ parsing as we don't have a parser
  86 generator yet.
  87
  88 The precedence and associativity can be set for each production, and
  89 can be inherited from symbols.  The data type (either structure or a
  90 reference to a structure) is potentially defined for each non-terminal
  91 and describes what C structure is used to store information about each
  92 symbol.
  93
  94 ###### declarations
  95         enum assoc {Left, Right, Non};
  96         char *assoc_names[] = {"Left","Right","Non"};
  97
  98         struct symbol {
  99                 struct text struct_name;
 100                 int isref;
 101                 enum assoc assoc;
 102                 unsigned short precedence;
 103                 ## symbol fields
 104         };
 105         struct production {
 106                 unsigned short precedence;
 107                 enum assoc assoc;
 108                 char line_like;
 109                 ## production fields
 110         };
 111         struct grammar {
 112                 ## grammar fields
 113         };
 114
 115 The strings reported by `mdcode` and `scanner` are `struct text` which
 116 have length rather than being nul terminated.  To help with printing and
 117 comparing we define `text_is` and `prtxt`, which should possibly go in
 118 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 119 strings which might contain control characters.
 120
 121 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 122 because that is what we need to detect tags.
 123
 124 ###### functions
 125         static int text_is(struct text t, char *s)
 126         {
 127                 return (strlen(s) == t.len &&
 128                         strncmp(s, t.txt, t.len) == 0);
 129         }
 130         static void prtxt(struct text t)
 131         {
 132                 printf("%.*s", t.len, t.txt);
 133         }
 134
 135         static int strip_tag(struct text *t, char *tag)
 136         {
 137                 int skip = strlen(tag) + 1;
 138                 if (skip >= t->len ||
 139                     strncmp(t->txt, tag, skip-1) != 0 ||
 140                     t->txt[skip-1] != ':')
 141                         return 0;
 142                 while (skip < t->len && t->txt[skip] == ' ')
 143                         skip++;
 144                 t->len -= skip;
 145                 t->txt += skip;
 146                 return 1;
 147         }
 148
 149 ### Symbols
 150
 151 Productions are comprised primarily of symbols - terminal and
 152 non-terminal.  We do not make a syntactic distinction between the two,
 153 though non-terminals must be identifiers.  Non-terminal symbols are
 154 those which appear in the head of a production, terminal symbols are
 155 those which don't.  There are also "virtual" symbols used for precedence
 156 marking discussed later, and sometimes we won't know what type a symbol
 157 is yet.
 158
 159 To help with code safety it is possible to declare the terminal symbols.
 160 If this is done, then any symbol used in a production that does not
 161 appear in the head of any production and is not declared as a terminal
 162 is treated as an error.
 163
 164 ###### forward declarations
 165         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 166         char *symtypes = "UVTN";
 167 ###### symbol fields
 168         enum symtype type;
 169 ###### grammar fields
 170         int terminals_declared;
 171
 172 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 173 table of known symbols and the resulting parser will report them as
 174 `TK_reserved + N`.  A small set of identifiers are reserved for the
 175 different token types that `scanner` can report.
 176
 177 ###### declarations
 178
 179         static struct { int num; char *name; } reserved_words[] = {
 180                 { TK_error,        "ERROR" },
 181                 { TK_number,       "NUMBER" },
 182                 { TK_ident,        "IDENTIFIER" },
 183                 { TK_mark,         "MARK" },
 184                 { TK_string,       "STRING" },
 185                 { TK_multi_string, "MULTI_STRING" },
 186                 { TK_in,           "IN" },
 187                 { TK_out,          "OUT" },
 188                 { TK_newline,      "NEWLINE" },
 189                 { TK_eof,          "$eof" },
 190         };
 191 ###### symbol fields
 192         short num;
 193
 194 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 195 recognised.  The former is automatically expected at the end of the text
 196 being parsed. The latter are always ignored by the parser.
 197
 198 All of these symbols are stored in a simple symbol table.  We use the
 199 `struct text` from `mdcode` to store the name and link them together
 200 into a sorted list using an insertion sort.
 201
 202 We don't have separate `find` and `insert` functions as any symbol we
 203 find needs to be remembered.  We simply expect `find` to always return a
 204 symbol, but its type might be `Unknown`.
 205
 206 ###### includes
 207         #include <string.h>
 208
 209 ###### symbol fields
 210         struct text name;
 211         struct symbol *next;
 212
 213 ###### grammar fields
 214         struct symbol *syms;
 215         int num_syms;
 216
 217 ###### functions
 218         static struct symbol *sym_find(struct grammar *g, struct text s)
 219         {
 220                 struct symbol **l = &g->syms;
 221                 struct symbol *n;
 222                 int cmp = 1;
 223
 224                 while (*l &&
 225                         (cmp = text_cmp((*l)->name, s)) < 0)
 226                                 l = & (*l)->next;
 227                 if (cmp == 0)
 228                         return *l;
 229                 n = calloc(1, sizeof(*n));
 230                 n->name = s;
 231                 n->type = Unknown;
 232                 n->next = *l;
 233                 n->num = -1;
 234                 *l = n;
 235                 return n;
 236         }
 237
 238         static void symbols_init(struct grammar *g)
 239         {
 240                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 241                 int i;
 242                 for (i = 0; i < entries; i++) {
 243                         struct text t;
 244                         struct symbol *s;
 245                         t.txt = reserved_words[i].name;
 246                         t.len = strlen(t.txt);
 247                         s = sym_find(g, t);
 248                         s->type = Terminal;
 249                         s->num = reserved_words[i].num;
 250                 }
 251         }
 252
 253 ### Data types and precedence.
 254
 255 Data type specification, precedence specification, and declaration of
 256 terminals are all introduced by a dollar sign at the start of the line.
 257 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 258 precedence, if it is `TERM` then the line declares terminals without
 259 precedence, otherwise it specifies a data type.
 260
 261 The data type name is simply stored and applied to the head of all
 262 subsequent productions.  It must be the name of a structure optionally
 263 preceded by an asterisk which means a reference or "pointer".  So
 264 `$expression` maps to `struct expression` and `$*statement` maps to
 265 `struct statement *`.
 266
 267 Any productions given before the first data type declaration will have
 268 no data type associated with them and can carry no information.  In
 269 order to allow other non-terminals to have no type, the data type
 270 `$void` can be given.  This does *not* mean that `struct void` will be
 271 used, but rather than no type will be associated with subsequent
 272 non-terminals.
 273
 274 The precedence line must contain a list of symbols, either terminal
 275 symbols or virtual symbols.  It can only contain symbols that have not
 276 been seen yet, so precedence declaration must precede the productions
 277 that they affect.
 278
 279 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 280 carry precedence information but are not included in the grammar.  A
 281 production can declare that it inherits the precedence of a given
 282 virtual symbol.
 283
 284 This use for `$$` precludes it from being used as a symbol in the
 285 described language.  Two other symbols: `${` and `}$` are also
 286 unavailable.
 287
 288 Each new precedence line introduces a new precedence level and
 289 declares how it associates.  This level is stored in each symbol
 290 listed and may be inherited by any production which uses the symbol.  A
 291 production inherits from the last symbol which has a precedence.
 292
 293 The symbols on the first precedence line have the lowest precedence.
 294 Subsequent lines introduce symbols with higher precedence and so bind
 295 more tightly.
 296
 297 Note that a declaration line (or "dollar line") can start with either of
 298 two different marks: "$" or "$*".  The `dollar_line()` function defined
 299 here is told which was found in the `isref` argument.
 300
 301 ###### grammar fields
 302         struct text current_type;
 303         int type_isref;
 304         int prec_levels;
 305
 306 ###### declarations
 307         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 308         static const char *known[] = { "$$", "${", "}$" };
 309
 310 ###### functions
 311         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 312         {
 313                 struct token t = token_next(ts);
 314                 char *err;
 315                 enum assoc assoc;
 316                 int term = 0;
 317                 int found;
 318
 319                 if (t.num != TK_ident) {
 320                         err = "type or assoc expected after '$'";
 321                         goto abort;
 322                 }
 323                 if (text_is(t.txt, "LEFT"))
 324                         assoc = Left;
 325                 else if (text_is(t.txt, "RIGHT"))
 326                         assoc = Right;
 327                 else if (text_is(t.txt, "NON"))
 328                         assoc = Non;
 329                 else if (text_is(t.txt, "TERM")) {
 330                         term = 1;
 331                         g->terminals_declared = 1;
 332                 } else {
 333                         g->current_type = t.txt;
 334                         g->type_isref = isref;
 335                         if (text_is(t.txt, "void"))
 336                                 g->current_type.txt = NULL;
 337                         t = token_next(ts);
 338                         if (t.num != TK_newline) {
 339                                 err = "Extra tokens after type name";
 340                                 goto abort;
 341                         }
 342                         return NULL;
 343                 }
 344
 345                 if (isref) {
 346                         err = "$* cannot be followed by a precedence";
 347                         goto abort;
 348                 }
 349
 350                 // This is a precedence or TERM line, need some symbols.
 351                 found = 0;
 352                 g->prec_levels += 1;
 353                 t = token_next(ts);
 354                 while (t.num != TK_newline) {
 355                         enum symtype type = Terminal;
 356                         struct symbol *s;
 357                         if (t.num == TK_virtual) {
 358                                 type = Virtual;
 359                                 t = token_next(ts);
 360                                 if (t.num != TK_ident) {
 361                                         err = "$$ must be followed by a word";
 362                                         goto abort;
 363                                 }
 364                                 if (term) {
 365                                         err = "Virtual symbols not permitted on $TERM line";
 366                                         goto abort;
 367                                 }
 368                         } else if (t.num != TK_ident &&
 369                                    t.num != TK_mark) {
 370                                 err = "Illegal token in precedence line";
 371                                 goto abort;
 372                         }
 373                         s = sym_find(g, t.txt);
 374                         if (s->type != Unknown) {
 375                                 err = "Symbols in precedence/TERM line must not already be known.";
 376                                 goto abort;
 377                         }
 378                         s->type = type;
 379                         if (!term) {
 380                                 s->precedence = g->prec_levels;
 381                                 s->assoc = assoc;
 382                         }
 383                         found += 1;
 384                         t = token_next(ts);
 385                 }
 386                 if (found == 0)
 387                         err = "No symbols given on precedence/TERM line";
 388                         goto abort;
 389                 return NULL;
 390         abort:
 391                 while (t.num != TK_newline && t.num != TK_eof)
 392                         t = token_next(ts);
 393                 return err;
 394         }
 395
 396 ### Productions
 397
 398 A production either starts with an identifier which is the head
 399 non-terminal, or a vertical bar (`|`) in which case this production
 400 uses the same head as the previous one.  The identifier must be
 401 followed by a `->` mark.  All productions for a given non-terminal must
 402 be grouped together with the `nonterminal ->` given only once.
 403
 404 After this a (possibly empty) sequence of identifiers and marks form
 405 the body of the production.  A virtual symbol may be given after the
 406 body by preceding it with `$$`.  If a virtual symbol is given, the
 407 precedence of the production is that for the virtual symbol.  If none
 408 is given, the precedence is inherited from the last symbol in the
 409 production which has a precedence specified.
 410
 411 After the optional precedence may come the `${` mark.  This indicates
 412 the start of a code fragment.  If present, this must be on the same
 413 line as the start of the production.
 414
 415 All of the text from the `${` through to the matching `}$` is
 416 collected and forms the code-fragment for the production.  It must all
 417 be in one `code_node` of the literate code.  The `}$` must be
 418 at the end of a line.
 419
 420 Text in the code fragment will undergo substitutions where `$N` or
 421 `$<N`,for some numeric `N` (or non-numeric indicator as described
 422 later), will be replaced with a variable holding the parse information
 423 for the particular symbol in the production.  `$0` is the head of the
 424 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 425 for a terminal symbol is `struct token`.  For a non-terminal, it is
 426 whatever has been declared for that symbol.  The `<` may be included and
 427 means that the value (usually a reference) is being moved out, so it
 428 will not automatically be freed.  The effect of using '<' is that the
 429 variable is cleared to all-zeros.
 430
 431 While building productions we will need to add to an array which needs to
 432 grow dynamically.
 433
 434 ###### functions
 435         static void array_add(void *varray, int *cnt, void *new)
 436         {
 437                 void ***array = varray;
 438                 int current = 0;
 439                 const int step = 8;
 440                 current = ((*cnt-1) | (step-1))+1;
 441                 if (*cnt == current) {
 442                         /* must grow */
 443                         current += step;
 444                         *array = realloc(*array, current * sizeof(void*));
 445                 }
 446                 (*array)[*cnt] = new;
 447                 (*cnt) += 1;
 448         }
 449
 450 Collecting the code fragment simply involves reading tokens until we
 451 hit the end token `}$`, and noting the character position of the start and
 452 the end.
 453
 454 ###### functions
 455         static struct text collect_code(struct token_state *state,
 456                                         struct token start)
 457         {
 458                 struct text code;
 459                 struct token t;
 460                 code.txt = start.txt.txt + start.txt.len;
 461                 do
 462                         t = token_next(state);
 463                 while (t.node == start.node &&
 464                        t.num != TK_close && t.num != TK_error &&
 465                        t.num != TK_eof);
 466                 if (t.num == TK_close && t.node == start.node)
 467                         code.len = t.txt.txt - code.txt;
 468                 else
 469                         code.txt = NULL;
 470                 return code;
 471         }
 472
 473 Now we have all the bits we need to parse a full production.
 474
 475 ###### includes
 476         #include <memory.h>
 477
 478 ###### grammar fields
 479         struct production **productions;
 480         int production_count;
 481
 482 ###### production fields
 483         struct symbol  *head;
 484         struct symbol **body;
 485         int             body_size;
 486         struct text     code;
 487         int             code_line;
 488
 489 ###### symbol fields
 490         int first_production;
 491
 492 ###### functions
 493         static char *parse_production(struct grammar *g,
 494                                       struct symbol *head,
 495                                       struct token_state *state)
 496         {
 497                 /* Head has already been parsed. */
 498                 struct token tk;
 499                 char *err;
 500                 struct production p, *pp;
 501
 502                 memset(&p, 0, sizeof(p));
 503                 p.head = head;
 504                 tk = token_next(state);
 505                 while (tk.num == TK_ident || tk.num == TK_mark) {
 506                         struct symbol *bs = sym_find(g, tk.txt);
 507                         if (bs->type == Unknown) {
 508                                 if (!g->terminals_declared)
 509                                         bs->type = Terminal;
 510                         }
 511                         if (bs->type == Virtual) {
 512                                 err = "Virtual symbol not permitted in production";
 513                                 goto abort;
 514                         }
 515                         if (bs->precedence) {
 516                                 p.precedence = bs->precedence;
 517                                 p.assoc = bs->assoc;
 518                         }
 519                         array_add(&p.body, &p.body_size, bs);
 520                         tk = token_next(state);
 521                 }
 522                 if (tk.num == TK_virtual) {
 523                         struct symbol *vs;
 524                         tk = token_next(state);
 525                         if (tk.num != TK_ident) {
 526                                 err = "word required after $$";
 527                                 goto abort;
 528                         }
 529                         vs = sym_find(g, tk.txt);
 530                         if (vs->num == TK_newline)
 531                                 p.line_like = 1;
 532                         else if (vs->num == TK_out)
 533                                 p.line_like = 2;
 534                         else if (vs->precedence == 0) {
 535                                 err = "symbol after $$ must have precedence";
 536                                 goto abort;
 537                         } else {
 538                                 p.precedence = vs->precedence;
 539                                 p.assoc = vs->assoc;
 540                         }
 541                         tk = token_next(state);
 542                 }
 543                 if (tk.num == TK_open) {
 544                         p.code_line = tk.line;
 545                         p.code = collect_code(state, tk);
 546                         if (p.code.txt == NULL) {
 547                                 err = "code fragment not closed properly";
 548                                 goto abort;
 549                         }
 550                         tk = token_next(state);
 551                 }
 552
 553                 if (tk.num != TK_newline && tk.num != TK_eof) {
 554                         err = "stray tokens at end of line";
 555                         goto abort;
 556                 }
 557                 pp = malloc(sizeof(*pp));
 558                 *pp = p;
 559                 array_add(&g->productions, &g->production_count, pp);
 560                 return NULL;
 561         abort:
 562                 while (tk.num != TK_newline && tk.num != TK_eof)
 563                         tk = token_next(state);
 564                 free(p.body);
 565                 return err;
 566         }
 567
 568 With the ability to parse productions and dollar-lines, we have nearly all
 569 that we need to parse a grammar from a `code_node`.
 570
 571 The head of the first production will effectively be the `start` symbol
 572 of the grammar.  However it won't _actually_ be so.  Processing the
 573 grammar is greatly simplified if the real start symbol only has a single
 574 production, and expects `$eof` as the final terminal.  So when we find
 575 the first explicit production we insert an extra implicit production as
 576 production zero which looks like
 577
 578 ###### Example: production 0
 579         $start -> START $eof
 580
 581 where `START` is the first non-terminal given.
 582
 583 ###### create production zero
 584         struct production *p = calloc(1,sizeof(*p));
 585         struct text start = {"$start",6};
 586         struct text eof = {"$eof",4};
 587         struct text code = {"$0 = $<1;", 9};
 588         p->head = sym_find(g, start);
 589         p->head->type = Nonterminal;
 590         p->head->struct_name = g->current_type;
 591         p->head->isref = g->type_isref;
 592         if (g->current_type.txt)
 593                 p->code = code;
 594         array_add(&p->body, &p->body_size, head);
 595         array_add(&p->body, &p->body_size, sym_find(g, eof));
 596         p->head->first_production = g->production_count;
 597         array_add(&g->productions, &g->production_count, p);
 598
 599 Now we are ready to read in the grammar.  We ignore comments
 600 and strings so that the marks which introduce them can be
 601 used as terminals in the grammar.  We don't ignore numbers
 602 even though we don't allow them as that causes the scanner
 603 to produce errors that the parser is better positioned to handle.
 604 We also ignore indentation, but we expect and use newlines.
 605
 606 To allow comments in the grammar, we explicitly check for "//" as the
 607 first token on the line and those lines are skipped.  "//" can still be
 608 used as a terminal anywhere that a terminal is expected.
 609
 610 ###### grammar_read
 611         static struct grammar *grammar_read(struct code_node *code)
 612         {
 613                 struct token_config conf = {
 614                         .word_start = "",
 615                         .word_cont = "",
 616                         .words_marks = known,
 617                         .known_count = sizeof(known)/sizeof(known[0]),
 618                         .number_chars = "",
 619                         .ignored = (1 << TK_line_comment)
 620                                  | (1 << TK_block_comment)
 621                                  | (0 << TK_number)
 622                                  | (1 << TK_string)
 623                                  | (1 << TK_multi_string)
 624                                  | (1 << TK_in)
 625                                  | (1 << TK_out),
 626                 };
 627
 628                 struct token_state *state = token_open(code, &conf);
 629                 struct token tk;
 630                 struct symbol *head = NULL;
 631                 struct grammar *g;
 632                 char *err = NULL;
 633
 634                 g = calloc(1, sizeof(*g));
 635                 symbols_init(g);
 636
 637                 for (tk = token_next(state); tk.num != TK_eof;
 638                      tk = token_next(state)) {
 639                         if (tk.num == TK_newline)
 640                                 continue;
 641                         if (tk.num == TK_ident) {
 642                                 // new non-terminal
 643                                 head = sym_find(g, tk.txt);
 644                                 if (head->type == Nonterminal)
 645                                         err = "This non-terminal has already be used.";
 646                                 else if (head->type == Virtual)
 647                                         err = "Virtual symbol not permitted in head of production";
 648                                 else {
 649                                         head->type = Nonterminal;
 650                                         head->struct_name = g->current_type;
 651                                         head->isref = g->type_isref;
 652                                         if (g->production_count == 0) {
 653                                                 ## create production zero
 654                                         }
 655                                         head->first_production = g->production_count;
 656                                         tk = token_next(state);
 657                                         if (tk.num == TK_mark &&
 658                                             text_is(tk.txt, "->"))
 659                                                 err = parse_production(g, head, state);
 660                                         else
 661                                                 err = "'->' missing in production";
 662                                 }
 663                         } else if (tk.num == TK_mark
 664                                    && text_is(tk.txt, "|")) {
 665                                 // another production for same non-term
 666                                 if (head)
 667                                         err = parse_production(g, head, state);
 668                                 else
 669                                         err = "First production must have a head";
 670                         } else if (tk.num == TK_mark
 671                                    && text_is(tk.txt, "$")) {
 672                                 err = dollar_line(state, g, 0);
 673                         } else if (tk.num == TK_mark
 674                                    && text_is(tk.txt, "$*")) {
 675                                 err = dollar_line(state, g, 1);
 676                         } else if (tk.num == TK_mark
 677                                    && text_is(tk.txt, "//")) {
 678                                 while (tk.num != TK_newline &&
 679                                        tk.num != TK_eof)
 680                                         tk = token_next(state);
 681                         } else {
 682                                 err = "Unrecognised token at start of line.";
 683                         }
 684                         if (err)
 685                                 goto abort;
 686                 }
 687                 token_close(state);
 688                 if (g->terminals_declared) {
 689                         struct symbol *s;
 690                         int errs = 0;
 691                         for (s = g->syms; s; s = s->next) {
 692                                 if (s->type != Unknown)
 693                                         continue;
 694                                 errs += 1;
 695                                 fprintf(stderr, "Token %.*s not declared\n",
 696                                         s->name.len, s->name.txt);
 697                         }
 698                         if (errs) {
 699                                 free(g); // FIXME free content
 700                                 g = NULL;
 701                         }
 702                 }
 703                 return g;
 704         abort:
 705                 fprintf(stderr, "Error at line %d: %s\n",
 706                         tk.line, err);
 707                 token_close(state);
 708                 free(g); // FIXME free content
 709                 return NULL;
 710         }
 711
 712 ## Analysing the grammar
 713
 714 The central task in analysing the grammar is to determine a set of
 715 states to drive the parsing process.  This will require finding various
 716 sets of symbols and of "items".  Some of these sets will need to attach
 717 information to each element in the set, so they are more like maps.
 718
 719 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 720 Many of these will be the same as each other.  To avoid memory wastage,
 721 and to simplify some comparisons of sets, these sets will be stored in a
 722 list of unique sets, each assigned a number.
 723
 724 Once we have the data structures in hand to manage these sets and lists,
 725 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
 726 then create the item sets which define the various states.
 727
 728 ### Symbol sets.
 729
 730 Though we don't only store symbols in these sets, they are the main
 731 things we store, so they are called symbol sets or "symsets".
 732
 733 A symset has a size, an array of shorts, and an optional array of data
 734 values, which are also shorts.  If the array of data is not present, we
 735 store an impossible pointer, as `NULL` is used to indicate that no
 736 memory has been allocated yet;
 737
 738 ###### declarations
 739         struct symset {
 740                 short cnt;
 741                 unsigned short *syms, *data;
 742         };
 743         #define NO_DATA ((unsigned short *)1)
 744         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 745         const struct symset INIT_DATASET = { 0, NULL, NULL };
 746
 747 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 748 using an insertion sort.  We don't explicitly record the amount of
 749 allocated space as it can be derived directly from the current `cnt`
 750 using `((cnt - 1) | 7) + 1`.
 751
 752 ###### functions
 753         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 754         {
 755                 int i;
 756                 int current = ((s->cnt-1) | 7) + 1;
 757                 if (current == s->cnt) {
 758                         current += 8;
 759                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 760                         if (s->data != NO_DATA)
 761                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 762                 }
 763                 i = s->cnt;
 764                 while (i > 0 && s->syms[i-1] > key) {
 765                         s->syms[i] = s->syms[i-1];
 766                         if (s->data != NO_DATA)
 767                                 s->data[i] = s->data[i-1];
 768                         i--;
 769                 }
 770                 s->syms[i] = key;
 771                 if (s->data != NO_DATA)
 772                         s->data[i] = val;
 773                 s->cnt += 1;
 774         }
 775
 776 Finding a symbol (or item) in a `symset` uses a simple binary search.
 777 We return the index where the value was found (so data can be accessed),
 778 or `-1` to indicate failure.
 779
 780         static int symset_find(struct symset *ss, unsigned short key)
 781         {
 782                 int lo = 0;
 783                 int hi = ss->cnt;
 784
 785                 if (hi == 0)
 786                         return -1;
 787                 while (lo + 1 < hi) {
 788                         int mid = (lo + hi) / 2;
 789                         if (ss->syms[mid] <= key)
 790                                 lo = mid;
 791                         else
 792                                 hi = mid;
 793                 }
 794                 if (ss->syms[lo] == key)
 795                         return lo;
 796                 return -1;
 797         }
 798
 799 We will often want to form the union of two symsets.  When we do, we
 800 will often want to know if anything changed (as that might mean there is
 801 more work to do).  So `symset_union` reports whether anything was added
 802 to the first set.  We use a slow quadratic approach as these sets are
 803 rarely large.  If profiling shows this to be a problem it can be
 804 optimised later.
 805
 806         static int symset_union(struct symset *a, struct symset *b)
 807         {
 808                 int i;
 809                 int added = 0;
 810                 for (i = 0; i < b->cnt; i++)
 811                         if (symset_find(a, b->syms[i]) < 0) {
 812                                 unsigned short data = 0;
 813                                 if (b->data != NO_DATA)
 814                                         data = b->data[i];
 815                                 symset_add(a, b->syms[i], data);
 816                                 added++;
 817                         }
 818                 return added;
 819         }
 820
 821 And of course we must be able to free a symset.
 822
 823         static void symset_free(struct symset ss)
 824         {
 825                 free(ss.syms);
 826                 if (ss.data != NO_DATA)
 827                         free(ss.data);
 828         }
 829
 830 ### Symset Storage
 831
 832 Some symsets are simply stored somewhere appropriate in the `grammar`
 833 data structure, others needs a bit of indirection.  This applies
 834 particularly to "LA" sets.  These will be paired with "items" in an
 835 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 836 to be stored in the `data` field, so we need a mapping from a small
 837 (short) number to an LA `symset`.
 838
 839 As mentioned earlier this involves creating a list of unique symsets.
 840
 841 For now, we just use a linear list sorted by insertion.  A skiplist
 842 would be more efficient and may be added later.
 843
 844 ###### declarations
 845
 846         struct setlist {
 847                 struct symset ss;
 848                 int num;
 849                 struct setlist *next;
 850         };
 851
 852 ###### grammar fields
 853         struct setlist *sets;
 854         int nextset;
 855
 856 ###### functions
 857
 858         static int ss_cmp(struct symset a, struct symset b)
 859         {
 860                 int i;
 861                 int diff = a.cnt - b.cnt;
 862                 if (diff)
 863                         return diff;
 864                 for (i = 0; i < a.cnt; i++) {
 865                         diff = (int)a.syms[i] - (int)b.syms[i];
 866                         if (diff)
 867                                 return diff;
 868                 }
 869                 return 0;
 870         }
 871
 872         static int save_set(struct grammar *g, struct symset ss)
 873         {
 874                 struct setlist **sl = &g->sets;
 875                 int cmp = 1;
 876                 struct setlist *s;
 877
 878                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 879                         sl = & (*sl)->next;
 880                 if (cmp == 0) {
 881                         symset_free(ss);
 882                         return (*sl)->num;
 883                 }
 884
 885                 s = malloc(sizeof(*s));
 886                 s->ss = ss;
 887                 s->num = g->nextset;
 888                 g->nextset += 1;
 889                 s->next = *sl;
 890                 *sl = s;
 891                 return s->num;
 892         }
 893
 894 Finding a set by number is currently performed by a simple linear
 895 search.  If this turns out to hurt performance, we can store the sets in
 896 a dynamic array like the productions.
 897
 898         static struct symset set_find(struct grammar *g, int num)
 899         {
 900                 struct setlist *sl = g->sets;
 901                 while (sl && sl->num != num)
 902                         sl = sl->next;
 903                 return sl->ss;
 904         }
 905
 906 ### Setting `nullable`
 907
 908 We set `nullable` on the head symbol for any production for which all
 909 the body symbols (if any) are nullable.  As this is a recursive
 910 definition, any change in the `nullable` setting means that we need to
 911 re-evaluate where it needs to be set.
 912
 913 We simply loop around performing the same calculations until no more
 914 changes happen.
 915
 916 ###### symbol fields
 917         int nullable;
 918
 919 ###### functions
 920         static void set_nullable(struct grammar *g)
 921         {
 922                 int check_again = 1;
 923                 while (check_again) {
 924                         int p;
 925                         check_again = 0;
 926                         for (p = 0; p < g->production_count; p++) {
 927                                 struct production *pr = g->productions[p];
 928                                 int s;
 929
 930                                 if (pr->head->nullable)
 931                                         continue;
 932                                 for (s = 0; s < pr->body_size; s++)
 933                                         if (! pr->body[s]->nullable)
 934                                                 break;
 935                                 if (s == pr->body_size) {
 936                                         pr->head->nullable = 1;
 937                                         check_again = 1;
 938                                 }
 939                         }
 940                 }
 941         }
 942
 943 ### Setting `line_like`
 944
 945 In order to be able to ignore newline tokens when not relevant, but
 946 still include them in the parse when needed, we will need to know
 947 which states can start a "line-like" section of code.  We ignore
 948 newlines when there is an indent since the most recent start of a
 949 line-like symbol.
 950
 951 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
 952 If a symbol cannot derive a NEWLINE, then it is only part of a line -
 953 so is word-like.  If it can derive a NEWLINE, then we consider it to
 954 be like a line.
 955
 956 Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
 957 is the head of a production that contains a line_like symbol is also a
 958 line-like symbol.  We use a new field `line_like` to record this
 959 attribute of symbols, and compute it in a repetitive manner similar to
 960 `set_nullable`.
 961
 962 ###### symbol fields
 963         int line_like;
 964
 965 ###### functions
 966         static void set_line_like(struct grammar *g)
 967         {
 968                 int check_again = 1;
 969                 g->symtab[TK_newline]->line_like = 1;
 970                 while (check_again) {
 971                         int p;
 972                         check_again = 0;
 973                         for (p = 0; p < g->production_count; p++) {
 974                                 struct production *pr = g->productions[p];
 975                                 int s;
 976
 977                                 if (pr->head->line_like)
 978                                         continue;
 979
 980                                 for (s = 0 ; s < pr->body_size; s++) {
 981                                         if (pr->body[s]->line_like) {
 982                                                 pr->head->line_like = 1;
 983                                                 check_again = 1;
 984                                                 break;
 985                                         }
 986                                 }
 987                         }
 988                 }
 989         }
 990
 991 ### Building the `first` sets
 992
 993 When calculating what can follow a particular non-terminal, we will need
 994 to know what the "first" terminal in any subsequent non-terminal might
 995 be.  So we calculate the `first` set for every non-terminal and store
 996 them in an array.  We don't bother recording the "first" set for
 997 terminals as they are trivial.
 998
 999 As the "first" for one symbol might depend on the "first" of another, we
1000 repeat the calculations until no changes happen, just like with
1001 `set_nullable`.  This makes use of the fact that `symset_union` reports
1002 if any change happens.
1003
1004 The core of this, which finds the "first" of part of a production body,
1005 will be reused for computing the "follow" sets, so we split that out
1006 into a separate function, `add_first`, which also reports if it got all
1007 the way to the end of the production without finding a non-nullable
1008 symbol.
1009
1010 ###### grammar fields
1011         struct symset *first;
1012
1013 ###### functions
1014
1015         static int add_first(struct production *pr, int start,
1016                              struct symset *target, struct grammar *g,
1017                              int *to_end)
1018         {
1019                 int s;
1020                 int changed = 0;
1021                 for (s = start; s < pr->body_size; s++) {
1022                         struct symbol *bs = pr->body[s];
1023                         if (bs->type == Terminal) {
1024                                 if (symset_find(target, bs->num) < 0) {
1025                                         symset_add(target, bs->num, 0);
1026                                         changed = 1;
1027                                 }
1028                                 break;
1029                         } else if (symset_union(target, &g->first[bs->num]))
1030                                 changed = 1;
1031                         if (!bs->nullable)
1032                                 break;
1033                 }
1034                 if (to_end)
1035                         *to_end = (s == pr->body_size);
1036                 return changed;
1037         }
1038
1039         static void build_first(struct grammar *g)
1040         {
1041                 int check_again = 1;
1042                 int s;
1043                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1044                 for (s = 0; s < g->num_syms; s++)
1045                         g->first[s] = INIT_SYMSET;
1046
1047                 while (check_again) {
1048                         int p;
1049                         check_again = 0;
1050                         for (p = 0; p < g->production_count; p++) {
1051                                 struct production *pr = g->productions[p];
1052                                 struct symset *head = &g->first[pr->head->num];
1053
1054                                 if (add_first(pr, 0, head, g, NULL))
1055                                         check_again = 1;
1056                         }
1057                 }
1058         }
1059
1060 ### Building the `follow` sets.
1061
1062 There are two different situations when we will want to generate
1063 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1064 single "follow" set for each non-terminal in the grammar.  That is what
1065 is happening here.  If we are doing an LALR or LR analysis we will want
1066 to generate a separate "LA" set for each item.  We do that later in
1067 state generation.
1068
1069 There are two parts to generating a "follow" set.  Firstly we look at
1070 every place that any non-terminal appears in the body of any production,
1071 and we find the set of possible "first" symbols after there.  This is
1072 done using `add_first` above and only needs to be done once as the
1073 "first" sets are now stable and will not change.
1074
1075 ###### grammar fields
1076         struct symset *follow;
1077
1078 ###### follow code
1079
1080         for (p = 0; p < g->production_count; p++) {
1081                 struct production *pr = g->productions[p];
1082                 int b;
1083
1084                 for (b = 0; b < pr->body_size - 1; b++) {
1085                         struct symbol *bs = pr->body[b];
1086                         if (bs->type == Terminal)
1087                                 continue;
1088                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1089                 }
1090         }
1091
1092 The second part is to add the "follow" set of the head of a production
1093 to the "follow" sets of the final symbol in the production, and any
1094 other symbol which is followed only by `nullable` symbols.  As this
1095 depends on "follow" itself we need to repeatedly perform the process
1096 until no further changes happen.
1097
1098 Rather than 2 nested loops, one that repeats the whole process until
1099 there is no change, and one that iterates of the set of productions, we
1100 combine these two functions into a single loop.
1101
1102 ###### follow code
1103
1104         for (check_again = 0, p = 0;
1105              p < g->production_count;
1106              p < g->production_count-1
1107                 ? p++ : check_again ? (check_again = 0, p = 0)
1108                               : p++) {
1109                 struct production *pr = g->productions[p];
1110                 int b;
1111
1112                 for (b = pr->body_size - 1; b >= 0; b--) {
1113                         struct symbol *bs = pr->body[b];
1114                         if (bs->type == Terminal)
1115                                 break;
1116                         if (symset_union(&g->follow[bs->num],
1117                                          &g->follow[pr->head->num]))
1118                                 check_again = 1;
1119                         if (!bs->nullable)
1120                                 break;
1121                 }
1122         }
1123
1124 We now just need to create and initialise the `follow` list to get a
1125 complete function.
1126
1127 ###### functions
1128         static void build_follow(struct grammar *g)
1129         {
1130                 int s, check_again, p;
1131                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1132                 for (s = 0; s < g->num_syms; s++)
1133                         g->follow[s] = INIT_SYMSET;
1134                 ## follow code
1135         }
1136
1137 ### Building itemsets and states
1138
1139 There are three different levels of detail that can be chosen for
1140 building the itemsets and states for the LR grammar.  They are:
1141
1142 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1143    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1144    already computed,
1145 2. LALR(1) where we build look-ahead sets with each item and merge
1146    the LA sets when we find two paths to the same "kernel" set of items.
1147 3. LR(1) where different look-ahead for any item in the set means
1148    a different item set must be created.
1149
1150 ###### forward declarations
1151         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1152
1153 We need to be able to look through existing states to see if a newly
1154 generated state already exists.  For now we use a simple sorted linked
1155 list.
1156
1157 An item is a pair of numbers: the production number and the position of
1158 "DOT", which is an index into the body of the production.  As the
1159 numbers are not enormous we can combine them into a single "short" and
1160 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1161 production number (so 2000 productions with maximum of 31 symbols in the
1162 body).
1163
1164 Comparing the itemsets will be a little different to comparing symsets
1165 as we want to do the lookup after generating the "kernel" of an
1166 itemset, so we need to ignore the offset=zero items which are added during
1167 completion.
1168
1169 To facilitate this, we modify the "DOT" number so that "0" sorts to
1170 the end of the list in the symset, and then only compare items before
1171 the first "0".
1172
1173 ###### declarations
1174         static inline unsigned short item_num(int production, int dot)
1175         {
1176                 return production | ((31-dot) << 11);
1177         }
1178         static inline int item_prod(unsigned short item)
1179         {
1180                 return item & 0x7ff;
1181         }
1182         static inline int item_dot(unsigned short item)
1183         {
1184                 return (31-(item >> 11)) & 0x1f;
1185         }
1186
1187 For LR(1) analysis we need to compare not just the itemset in a state
1188 but also the LA sets.  As we assign each unique LA set a number, we
1189 can just compare the symset and the data values together.
1190
1191 ###### functions
1192         static int itemset_cmp(struct symset a, struct symset b,
1193                                enum grammar_type type)
1194         {
1195                 int i;
1196                 int av, bv;
1197
1198                 for (i = 0;
1199                      i < a.cnt && i < b.cnt &&
1200                      item_dot(a.syms[i]) > 0 &&
1201                      item_dot(b.syms[i]) > 0;
1202                      i++) {
1203                         int diff = a.syms[i] - b.syms[i];
1204                         if (diff)
1205                                 return diff;
1206                         if (type == LR1) {
1207                                 diff = a.data[i] - b.data[i];
1208                                 if (diff)
1209                                         return diff;
1210                         }
1211                 }
1212                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1213                         av = -1;
1214                 else
1215                         av = a.syms[i];
1216                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1217                         bv = -1;
1218                 else
1219                         bv = b.syms[i];
1220                 if (av - bv)
1221                         return av - bv;
1222                 if (type < LR1 || av == -1)
1223                         return 0;
1224                 return
1225                         a.data[i] - b.data[i];
1226         }
1227
1228 It will be helpful to know if an itemset has been "completed" or not,
1229 particularly for LALR where itemsets get merged, at which point they
1230 need to be consider for completion again.  So a `completed` flag is
1231 needed.
1232
1233 For correct handling of `TK_newline` when parsing, we will need to
1234 know which states (itemsets) can occur at the start of a line, so we
1235 will record a `starts_line` flag too whenever DOT is at the start of a
1236 `line_like` symbol.
1237
1238 Finally, for handling `TK_out` we need to know whether productions in the
1239 current state started *before* the most recent indent.  A state
1240 doesn't usually keep details of individual productions, so we need to
1241 add one extra detail. `min_prefix` is the smallest non-zero number of
1242 symbols *before* DOT in any production in an itemset.  This will allow
1243 us to determine if the the most recent indent is sufficiently recent
1244 to cancel it against a `TK_out`.  If it was seen longer ago than the
1245 `min_prefix`, and if the current state cannot be reduced, then the
1246 indented section must have ended in the middle of a syntactic unit, so
1247 an error must be signaled.
1248
1249 And now we can build the list of itemsets.  The lookup routine returns
1250 both a success flag and a pointer to where in the list an insert should
1251 happen, so we don't need to search a second time.
1252
1253 ###### declarations
1254         struct itemset {
1255                 struct itemset *next;
1256                 short state;
1257                 struct symset items;
1258                 struct symset go_to;
1259                 enum assoc assoc;
1260                 unsigned short precedence;
1261                 char completed;
1262                 char starts_line;
1263                 int min_prefix;
1264         };
1265
1266 ###### grammar fields
1267         struct itemset *items;
1268         int states;
1269
1270 ###### functions
1271         static int itemset_find(struct grammar *g, struct itemset ***where,
1272                                 struct symset kernel, enum grammar_type type)
1273         {
1274                 struct itemset **ip;
1275
1276                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1277                         struct itemset *i = *ip;
1278                         int diff;
1279                         diff = itemset_cmp(i->items, kernel, type);
1280                         if (diff < 0)
1281                                 continue;
1282                         if (diff > 0)
1283                                 break;
1284                         /* found */
1285                         *where = ip;
1286                         return 1;
1287                 }
1288                 *where = ip;
1289                 return 0;
1290         }
1291
1292 Adding an itemset may require merging the LA sets if LALR analysis is
1293 happening.  If any new LA set adds any symbols that weren't in the old
1294 LA set, we clear the `completed` flag so that the dependants of this
1295 itemset will be recalculated and their LA sets updated.
1296
1297 `add_itemset` must consume the symsets it is passed, either by adding
1298 them to a data structure, of freeing them.
1299
1300         static int add_itemset(struct grammar *g, struct symset ss,
1301                                enum assoc assoc, unsigned short precedence,
1302                                enum grammar_type type)
1303         {
1304                 struct itemset **where, *is;
1305                 int i;
1306                 int found = itemset_find(g, &where, ss, type);
1307                 if (!found) {
1308                         is = calloc(1, sizeof(*is));
1309                         is->state = g->states;
1310                         g->states += 1;
1311                         is->items = ss;
1312                         is->assoc = assoc;
1313                         is->precedence = precedence;
1314                         is->next = *where;
1315                         is->go_to = INIT_DATASET;
1316                         *where = is;
1317                         return is->state;
1318                 }
1319                 is = *where;
1320                 if (type != LALR) {
1321                         symset_free(ss);
1322                         return is->state;
1323                 }
1324                 for (i = 0; i < ss.cnt; i++) {
1325                         struct symset temp = INIT_SYMSET, s;
1326                         if (ss.data[i] == is->items.data[i])
1327                                 continue;
1328                         s = set_find(g, is->items.data[i]);
1329                         symset_union(&temp, &s);
1330                         s = set_find(g, ss.data[i]);
1331                         if (symset_union(&temp, &s)) {
1332                                 is->items.data[i] = save_set(g, temp);
1333                                 is->completed = 0;
1334                         } else
1335                                 symset_free(temp);
1336                 }
1337                 symset_free(ss);
1338                 return is->state;
1339         }
1340
1341 #### The build
1342
1343 To build all the itemsets, we first insert the initial itemset made from
1344 production zero, complete each itemset, and then generate new itemsets
1345 from old until no new ones can be made.
1346
1347 Completing an itemset means finding all the items where "DOT" is
1348 followed by a nonterminal and adding "DOT=0" items for every production
1349 from that non-terminal - providing each item hasn't already been added.
1350 When we add such an item it might get inserted before the current
1351 one, so to make sure we process it we reset the loop counter to the
1352 start.
1353
1354 If LA sets are needed, the LA set for each new item is found using
1355 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1356 may be supplemented by the LA set for the item which produced the new
1357 item.
1358
1359 We also collect a set of all symbols which follow "DOT" (in `done`) as
1360 this is used in the next stage.  If any of these symbols are flagged as
1361 `line_like`, then this state must be a `starts_line` state so now is a
1362 good time to record that.
1363
1364 When itemsets are created we assign a precedence to the itemset from the
1365 complete item, if there is one.  We ignore the possibility of there
1366 being two and don't (currently) handle precedence in such grammars.
1367 When completing a grammar we ignore any item where DOT is followed by a
1368 terminal with a precedence lower than that for the itemset.  Unless the
1369 terminal has right associativity, we also ignore items where the
1370 terminal has the same precedence.  The result is that unwanted items are
1371 still in the itemset, but the terminal doesn't get into the go to set,
1372 so the item is ineffective.
1373
1374 ###### complete itemset
1375         for (i = 0; i < is->items.cnt; i++) {
1376                 int p = item_prod(is->items.syms[i]);
1377                 int bs = item_dot(is->items.syms[i]);
1378                 struct production *pr = g->productions[p];
1379                 int p2;
1380                 struct symbol *s;
1381                 struct symset LA = INIT_SYMSET;
1382                 unsigned short sn = 0;
1383                 struct symset LAnl = INIT_SYMSET;
1384                 unsigned short snnl = 0;
1385
1386                 if (is->min_prefix == 0 ||
1387                     (bs > 0 && bs < is->min_prefix))
1388                         is->min_prefix = bs;
1389                 if (bs == pr->body_size)
1390                         continue;
1391                 s = pr->body[bs];
1392                 if (s->precedence && is->precedence &&
1393                     is->precedence > s->precedence)
1394                         /* This terminal has a low precedence and
1395                          * shouldn't be shifted
1396                          */
1397                         continue;
1398                 if (s->precedence && is->precedence &&
1399                     is->precedence == s->precedence && s->assoc != Right)
1400                         /* This terminal has a matching precedence and is
1401                          * not Right-associative, so we mustn't shift it.
1402                          */
1403                         continue;
1404                 if (symset_find(&done, s->num) < 0) {
1405                         symset_add(&done, s->num, 0);
1406                 }
1407                 if (s->type != Nonterminal)
1408                         continue;
1409                 if (s->line_like)
1410                         is->starts_line = 1;
1411                 check_again = 1;
1412                 if (type >= LALR) {
1413                         // Need the LA set.
1414                         int to_end;
1415                         add_first(pr, bs+1, &LA, g, &to_end);
1416                         if (to_end) {
1417                                 struct symset ss = set_find(g, is->items.data[i]);
1418                                 symset_union(&LA, &ss);
1419                         }
1420                         sn = save_set(g, LA);
1421                         LA = set_find(g, sn);
1422                         if (symset_find(&LA, TK_newline))
1423                                 symset_add(&LAnl, TK_newline, 0);
1424                         snnl = save_set(g, LAnl);
1425                         LAnl = set_find(g, snnl);
1426                 }
1427
1428                 /* Add productions for this symbol */
1429                 for (p2 = s->first_production;
1430                      p2 < g->production_count &&
1431                       g->productions[p2]->head == s;
1432                      p2++) {
1433                         int itm = item_num(p2, 0);
1434                         int pos = symset_find(&is->items, itm);
1435                         if (pos < 0) {
1436                                 if (g->productions[p2]->line_like)
1437                                         symset_add(&is->items, itm, snnl);
1438                                 else
1439                                         symset_add(&is->items, itm, sn);
1440                                 /* Will have re-ordered, so start
1441                                  * from beginning again */
1442                                 i = -1;
1443                         } else if (type >= LALR) {
1444                                 struct symset ss = set_find(g, is->items.data[pos]);
1445                                 struct symset tmp = INIT_SYMSET;
1446                                 struct symset *la = &LA;
1447
1448                                 if (g->productions[p2]->line_like)
1449                                         la = &LAnl;
1450                                 symset_union(&tmp, &ss);
1451                                 if (symset_union(&tmp, la)) {
1452                                         is->items.data[pos] = save_set(g, tmp);
1453                                         i = -1;
1454                                 } else
1455                                         symset_free(tmp);
1456                         }
1457                 }
1458         }
1459
1460 For each symbol we found (and placed in `done`) we collect all the items
1461 for which this symbol is next, and create a new itemset with "DOT"
1462 advanced over the symbol.  This is then added to the collection of
1463 itemsets (or merged with a pre-existing itemset).
1464
1465 ###### derive itemsets
1466         // Now we have a completed itemset, so we need to
1467         // compute all the 'next' itemsets and create them
1468         // if they don't exist.
1469         for (i = 0; i < done.cnt; i++) {
1470                 int j;
1471                 unsigned short state;
1472                 struct symbol *sym = g->symtab[done.syms[i]];
1473                 enum assoc assoc = Non;
1474                 unsigned short precedence = 0;
1475                 struct symset newitemset = INIT_SYMSET;
1476                 if (type >= LALR)
1477                         newitemset = INIT_DATASET;
1478
1479                 for (j = 0; j < is->items.cnt; j++) {
1480                         int itm = is->items.syms[j];
1481                         int p = item_prod(itm);
1482                         int bp = item_dot(itm);
1483                         struct production *pr = g->productions[p];
1484                         unsigned short la = 0;
1485                         int pos;
1486
1487                         if (bp == pr->body_size)
1488                                 continue;
1489                         if (pr->body[bp] != sym)
1490                                 continue;
1491
1492                         bp += 1;
1493                         if (type >= LALR)
1494                                 la = is->items.data[j];
1495                         if (bp == pr->body_size &&
1496                             pr->precedence > 0 &&
1497                             pr->precedence > precedence) {
1498                                 // new itemset is reducible and has a precedence.
1499                                 precedence = pr->precedence;
1500                                 assoc = pr->assoc;
1501                         }
1502                         pos = symset_find(&newitemset, item_num(p, bp));
1503
1504                         if (pos < 0)
1505                                 symset_add(&newitemset, item_num(p, bp), la);
1506                         else if (type >= LALR) {
1507                                 // Need to merge la set.
1508                                 int la2 = newitemset.data[pos];
1509                                 if (la != la2) {
1510                                         struct symset ss = set_find(g, la2);
1511                                         struct symset LA = INIT_SYMSET;
1512                                         symset_union(&LA, &ss);
1513                                         ss = set_find(g, la);
1514                                         if (symset_union(&LA, &ss))
1515                                                 newitemset.data[pos] = save_set(g, LA);
1516                                         else
1517                                                 symset_free(LA);
1518                                 }
1519                         }
1520                 }
1521                 state = add_itemset(g, newitemset, assoc, precedence, type);
1522                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1523                         symset_add(&is->go_to, done.syms[i], state);
1524         }
1525
1526 All that is left is to create the initial itemset from production zero, and
1527 with `TK_eof` as the LA set.
1528
1529 ###### functions
1530         static void build_itemsets(struct grammar *g, enum grammar_type type)
1531         {
1532                 struct symset first = INIT_SYMSET;
1533                 struct itemset *is;
1534                 int check_again;
1535                 unsigned short la = 0;
1536                 if (type >= LALR) {
1537                         // LA set just has eof
1538                         struct symset eof = INIT_SYMSET;
1539                         symset_add(&eof, TK_eof, 0);
1540                         la = save_set(g, eof);
1541                         first = INIT_DATASET;
1542                 }
1543                 // production 0, offset 0 (with no data)
1544                 symset_add(&first, item_num(0, 0), la);
1545                 add_itemset(g, first, Non, 0, type);
1546                 for (check_again = 0, is = g->items;
1547                      is;
1548                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1549                         int i;
1550                         struct symset done = INIT_SYMSET;
1551                         if (is->completed)
1552                                 continue;
1553                         is->completed = 1;
1554                         check_again = 1;
1555                         ## complete itemset
1556                         ## derive itemsets
1557                         symset_free(done);
1558                 }
1559         }
1560
1561 ### Completing the analysis.
1562
1563 The exact process of analysis depends on which level was requested.  For
1564 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1565 `LR1` we need first, but not follow.  For `SLR` we need both.
1566
1567 We don't build the "action" tables that you might expect as the parser
1568 doesn't use them.  They will be calculated to some extent if needed for
1569 a report.
1570
1571 Once we have built everything we allocate arrays for the two lists:
1572 symbols and itemsets.  This allows more efficient access during
1573 reporting.  The symbols are grouped as terminals and then non-terminals,
1574 and we record the changeover point in `first_nonterm`.
1575
1576 ###### grammar fields
1577         struct symbol **symtab;
1578         struct itemset **statetab;
1579         int first_nonterm;
1580
1581 ###### grammar_analyse
1582
1583         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1584         {
1585                 struct symbol *s;
1586                 struct itemset *is;
1587                 int snum = TK_reserved;
1588                 for (s = g->syms; s; s = s->next)
1589                         if (s->num < 0 && s->type == Terminal) {
1590                                 s->num = snum;
1591                                 snum++;
1592                         }
1593                 g->first_nonterm = snum;
1594                 for (s = g->syms; s; s = s->next)
1595                         if (s->num < 0 && s->type != Virtual) {
1596                                 s->num = snum;
1597                                 snum++;
1598                         }
1599                 for (s = g->syms; s; s = s->next)
1600                         if (s->num < 0) {
1601                                 s->num = snum;
1602                                 snum++;
1603                         }
1604                 g->num_syms = snum;
1605                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1606                 for (s = g->syms; s; s = s->next)
1607                         g->symtab[s->num] = s;
1608
1609                 set_nullable(g);
1610                 set_line_like(g);
1611                 if (type >= SLR)
1612                         build_first(g);
1613
1614                 if (type == SLR)
1615                         build_follow(g);
1616
1617                 build_itemsets(g, type);
1618
1619                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1620                 for (is = g->items; is ; is = is->next)
1621                         g->statetab[is->state] = is;
1622         }
1623
1624 ## Reporting on the Grammar
1625
1626 The purpose of the report is to give the grammar developer insight into
1627 how the grammar parser will work.  It is basically a structured dump of
1628 all the tables that have been generated, plus a description of any conflicts.
1629
1630 ###### grammar_report
1631         static int grammar_report(struct grammar *g, enum grammar_type type)
1632         {
1633                 report_symbols(g);
1634                 if (g->follow)
1635                         report_follow(g);
1636                 report_itemsets(g);
1637                 return report_conflicts(g, type);
1638         }
1639
1640 Firstly we have the complete list of symbols, together with the
1641 "FIRST" set if that was generated.  We add a mark to each symbol to
1642 show if it is considered to be "line-like" (`<`), or if it is nullable (`.`).
1643
1644 ###### functions
1645
1646         static void report_symbols(struct grammar *g)
1647         {
1648                 int n;
1649                 if (g->first)
1650                         printf("SYMBOLS + FIRST:\n");
1651                 else
1652                         printf("SYMBOLS:\n");
1653
1654                 for (n = 0; n < g->num_syms; n++) {
1655                         struct symbol *s = g->symtab[n];
1656                         if (!s)
1657                                 continue;
1658
1659                         printf(" %c%c%3d%c: ",
1660                                s->nullable ? '.':' ',
1661                                s->line_like ? '<':' ',
1662                                s->num, symtypes[s->type]);
1663                         prtxt(s->name);
1664                         if (s->precedence)
1665                                 printf(" (%d%s)", s->precedence,
1666                                        assoc_names[s->assoc]);
1667
1668                         if (g->first && s->type == Nonterminal) {
1669                                 int j;
1670                                 char c = ':';
1671                                 for (j = 0; j < g->first[n].cnt; j++) {
1672                                         printf("%c ", c);
1673                                         c = ',';
1674                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1675                                 }
1676                         }
1677                         printf("\n");
1678                 }
1679         }
1680
1681 Then we have the follow sets if they were computed.
1682
1683         static void report_follow(struct grammar *g)
1684         {
1685                 int n;
1686                 printf("FOLLOW:\n");
1687                 for (n = 0; n < g->num_syms; n++)
1688                         if (g->follow[n].cnt) {
1689                                 int j;
1690                                 char c = ':';
1691                                 printf("  ");
1692                                 prtxt(g->symtab[n]->name);
1693                                 for (j = 0; j < g->follow[n].cnt; j++) {
1694                                         printf("%c ", c);
1695                                         c = ',';
1696                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1697                                 }
1698                                 printf("\n");
1699                         }
1700         }
1701
1702 And finally the item sets.  These include the GO TO tables and, for
1703 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1704 it up a bit.  First the items, with production number and associativity.
1705
1706         static void report_item(struct grammar *g, int itm)
1707         {
1708                 int p = item_prod(itm);
1709                 int dot = item_dot(itm);
1710                 struct production *pr = g->productions[p];
1711                 int i;
1712
1713                 printf("    ");
1714                 prtxt(pr->head->name);
1715                 printf(" ->");
1716                 for (i = 0; i < pr->body_size; i++) {
1717                         printf(" %s", (dot == i ? ". ": ""));
1718                         prtxt(pr->body[i]->name);
1719                 }
1720                 if (dot == pr->body_size)
1721                         printf(" .");
1722                 printf(" [%d]", p);
1723                 if (pr->precedence && dot == pr->body_size)
1724                         printf(" (%d%s)", pr->precedence,
1725                                assoc_names[pr->assoc]);
1726                 if (dot < pr->body_size &&
1727                     pr->body[dot]->precedence) {
1728                         struct symbol *s = pr->body[dot];
1729                         printf(" [%d%s]", s->precedence,
1730                                assoc_names[s->assoc]);
1731                 }
1732                 if (pr->line_like == 1)
1733                         printf(" $$NEWLINE");
1734                 else if (pr->line_like)
1735                         printf(" $$OUT");
1736                 printf("\n");
1737         }
1738
1739 The LA sets which are (possibly) reported with each item:
1740
1741         static void report_la(struct grammar *g, int lanum)
1742         {
1743                 struct symset la = set_find(g, lanum);
1744                 int i;
1745                 char c = ':';
1746
1747                 printf("        LOOK AHEAD(%d)", lanum);
1748                 for (i = 0; i < la.cnt; i++) {
1749                         printf("%c ", c);
1750                         c = ',';
1751                         prtxt(g->symtab[la.syms[i]]->name);
1752                 }
1753                 printf("\n");
1754         }
1755
1756 Then the go to sets:
1757
1758         static void report_goto(struct grammar *g, struct symset gt)
1759         {
1760                 int i;
1761                 printf("    GOTO:\n");
1762
1763                 for (i = 0; i < gt.cnt; i++) {
1764                         printf("      ");
1765                         prtxt(g->symtab[gt.syms[i]]->name);
1766                         printf(" -> %d\n", gt.data[i]);
1767                 }
1768         }
1769
1770 Now we can report all the item sets complete with items, LA sets, and GO TO.
1771
1772         static void report_itemsets(struct grammar *g)
1773         {
1774                 int s;
1775                 printf("ITEM SETS(%d)\n", g->states);
1776                 for (s = 0; s < g->states; s++) {
1777                         int j;
1778                         struct itemset *is = g->statetab[s];
1779                         printf("  Itemset %d:%s min prefix=%d",
1780                                s, is->starts_line?" (startsline)":"", is->min_prefix);
1781                         if (is->precedence)
1782                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1783                         printf("\n");
1784                         for (j = 0; j < is->items.cnt; j++) {
1785                                 report_item(g, is->items.syms[j]);
1786                                 if (is->items.data != NO_DATA)
1787                                         report_la(g, is->items.data[j]);
1788                         }
1789                         report_goto(g, is->go_to);
1790                 }
1791         }
1792
1793 ### Reporting conflicts
1794
1795 Conflict detection varies a lot among different analysis levels.  However
1796 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1797 LR1 are also similar as they have FOLLOW or LA sets.
1798
1799 ###### functions
1800
1801         ## conflict functions
1802
1803         static int report_conflicts(struct grammar *g, enum grammar_type type)
1804         {
1805                 int cnt = 0;
1806                 printf("Conflicts:\n");
1807                 if (type < SLR)
1808                         cnt = conflicts_lr0(g, type);
1809                 else
1810                         cnt = conflicts_slr(g, type);
1811                 if (cnt == 0)
1812                         printf(" - no conflicts\n");
1813                 return cnt;
1814         }
1815
1816 LR0 conflicts are any state which have both a reducible item and
1817 a shiftable item, or two reducible items.
1818
1819 LR05 conflicts only occur if two possible reductions exist,
1820 as shifts always over-ride reductions.
1821
1822 ###### conflict functions
1823         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1824         {
1825                 int i;
1826                 int cnt = 0;
1827                 for (i = 0; i < g->states; i++) {
1828                         struct itemset *is = g->statetab[i];
1829                         int last_reduce = -1;
1830                         int prev_reduce = -1;
1831                         int last_shift = -1;
1832                         int j;
1833                         if (!is)
1834                                 continue;
1835                         for (j = 0; j < is->items.cnt; j++) {
1836                                 int itm = is->items.syms[j];
1837                                 int p = item_prod(itm);
1838                                 int bp = item_dot(itm);
1839                                 struct production *pr = g->productions[p];
1840
1841                                 if (bp == pr->body_size) {
1842                                         prev_reduce = last_reduce;
1843                                         last_reduce = j;
1844                                         continue;
1845                                 }
1846                                 if (pr->body[bp]->type == Terminal)
1847                                         last_shift = j;
1848                         }
1849                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1850                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1851                                 report_item(g, is->items.syms[last_shift]);
1852                                 report_item(g, is->items.syms[last_reduce]);
1853                                 cnt++;
1854                         }
1855                         if (prev_reduce >= 0) {
1856                                 printf("  State %d has 2 (or more) reducible items\n", i);
1857                                 report_item(g, is->items.syms[prev_reduce]);
1858                                 report_item(g, is->items.syms[last_reduce]);
1859                                 cnt++;
1860                         }
1861                 }
1862                 return cnt;
1863         }
1864
1865 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1866 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1867 in the source of the look ahead set.
1868
1869 We build two datasets to reflect the "action" table: one which maps
1870 terminals to items where that terminal could be shifted and another
1871 which maps terminals to items that could be reduced when the terminal
1872 is in look-ahead.  We report when we get conflicts between the two.
1873
1874 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1875 terminal, we ignore it.  NEWLINES are handled specially with its own
1876 rules for when to shift and when to reduce.  Conflicts are expected,
1877 but handled internally.
1878
1879         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1880         {
1881                 int i;
1882                 int cnt = 0;
1883
1884                 for (i = 0; i < g->states; i++) {
1885                         struct itemset *is = g->statetab[i];
1886                         struct symset shifts = INIT_DATASET;
1887                         struct symset reduce = INIT_DATASET;
1888                         int j;
1889                         if (!is)
1890                                 continue;
1891                         /* First collect the shifts */
1892                         for (j = 0; j < is->items.cnt; j++) {
1893                                 unsigned short itm = is->items.syms[j];
1894                                 int p = item_prod(itm);
1895                                 int bp = item_dot(itm);
1896                                 struct production *pr = g->productions[p];
1897                                 struct symbol *s;
1898
1899                                 if (bp >= pr->body_size ||
1900                                     pr->body[bp]->type != Terminal)
1901                                         /* not shiftable */
1902                                         continue;
1903
1904                                 s = pr->body[bp];
1905                                 if (s->precedence && is->precedence)
1906                                         /* Precedence resolves this, so no conflict */
1907                                         continue;
1908
1909                                 if (symset_find(&shifts, s->num) < 0)
1910                                         symset_add(&shifts, s->num, itm);
1911                         }
1912                         /* Now look for reductions and conflicts */
1913                         for (j = 0; j < is->items.cnt; j++) {
1914                                 unsigned short itm = is->items.syms[j];
1915                                 int p = item_prod(itm);
1916                                 int bp = item_dot(itm);
1917                                 struct production *pr = g->productions[p];
1918
1919                                 if (bp < pr->body_size)
1920                                         continue;
1921                                 /* reducible */
1922                                 struct symset la;
1923                                 if (type == SLR)
1924                                         la = g->follow[pr->head->num];
1925                                 else
1926                                         la = set_find(g, is->items.data[j]);
1927                                 int k;
1928                                 for (k = 0; k < la.cnt; k++) {
1929                                         int pos = symset_find(&shifts, la.syms[k]);
1930                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1931                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1932                                                 cnt++;
1933                                                         prtxt(g->symtab[la.syms[k]]->name);
1934                                                 printf(":\n");
1935                                                 report_item(g, shifts.data[pos]);
1936                                                 report_item(g, itm);
1937                                         }
1938                                         pos = symset_find(&reduce, la.syms[k]);
1939                                         if (pos < 0) {
1940                                                 symset_add(&reduce, la.syms[k], itm);
1941                                                 continue;
1942                                         }
1943                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1944                                         prtxt(g->symtab[la.syms[k]]->name);
1945                                         printf(":\n");
1946                                         report_item(g, itm);
1947                                         report_item(g, reduce.data[pos]);
1948                                         cnt++;
1949                                 }
1950                         }
1951                         symset_free(shifts);
1952                         symset_free(reduce);
1953                 }
1954                 return cnt;
1955         }
1956
1957
1958 ## Generating the parser
1959
1960 The exported part of the parser is the `parse_XX` function, where the name
1961 `XX` is based on the name of the parser files.
1962
1963 This takes a `code_node`, a partially initialized `token_config`, and an
1964 optional `FILE` to send tracing to.  The `token_config` gets the list of
1965 known words added and then is used with the `code_node` to initialize the
1966 scanner.
1967
1968 `parse_XX` then calls the library function `parser_run` to actually complete
1969 the parse.  This needs the `states` table and functions to call the various
1970 pieces of code provided in the grammar file, so they are generated first.
1971
1972 ###### parser_generate
1973
1974         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1975                                struct code_node *pre_reduce)
1976         {
1977                 gen_known(f, g);
1978                 gen_non_term(f, g);
1979                 gen_goto(f, g);
1980                 gen_states(f, g);
1981                 gen_reduce(f, g, file, pre_reduce);
1982                 gen_free(f, g);
1983
1984                 fprintf(f, "#line 0 \"gen_parser\"\n");
1985                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1986                         name);
1987                 fprintf(f, "{\n");
1988                 fprintf(f, "\tstruct token_state *tokens;\n");
1989                 fprintf(f, "\tconfig->words_marks = known;\n");
1990                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1991                 fprintf(f, "\ttokens = token_open(code, config);\n");
1992                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1993                 fprintf(f, "\ttoken_close(tokens);\n");
1994                 fprintf(f, "\treturn rv;\n");
1995                 fprintf(f, "}\n\n");
1996         }
1997
1998 ### Known words table
1999
2000 The known words table is simply an array of terminal symbols.
2001 The table of nonterminals used for tracing is a similar array.
2002
2003 ###### functions
2004
2005         static void gen_known(FILE *f, struct grammar *g)
2006         {
2007                 int i;
2008                 fprintf(f, "#line 0 \"gen_known\"\n");
2009                 fprintf(f, "static const char *known[] = {\n");
2010                 for (i = TK_reserved;
2011                      i < g->num_syms && g->symtab[i]->type == Terminal;
2012                      i++)
2013                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2014                                 g->symtab[i]->name.txt);
2015                 fprintf(f, "};\n\n");
2016         }
2017
2018         static void gen_non_term(FILE *f, struct grammar *g)
2019         {
2020                 int i;
2021                 fprintf(f, "#line 0 \"gen_non_term\"\n");
2022                 fprintf(f, "static const char *non_term[] = {\n");
2023                 for (i = TK_reserved;
2024                      i < g->num_syms;
2025                      i++)
2026                         if (g->symtab[i]->type == Nonterminal)
2027                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2028                                         g->symtab[i]->name.txt);
2029                 fprintf(f, "};\n\n");
2030         }
2031
2032 ### States and the goto tables.
2033
2034 For each state we record the goto table and details of the reducible
2035 production if there is one.
2036 Some of the details of the reducible production are stored in the
2037 `do_reduce` function to come later.  Here we store the production
2038 number, the body size (useful for stack management), and the resulting
2039 symbol (useful for knowing how to free data later).
2040
2041 The go to table is stored in a simple array of `sym` and corresponding
2042 `state`.
2043
2044 ###### exported types
2045
2046         struct lookup {
2047                 short sym;
2048                 short state;
2049         };
2050         struct state {
2051                 short go_to_cnt;
2052                 const struct lookup * go_to;
2053                 short reduce_prod;
2054                 short reduce_size;
2055                 short reduce_sym;
2056                 char starts_line;
2057                 char newline_only;
2058                 short min_prefix;
2059         };
2060
2061 ###### functions
2062
2063         static void gen_goto(FILE *f, struct grammar *g)
2064         {
2065                 int i;
2066                 fprintf(f, "#line 0 \"gen_goto\"\n");
2067                 for (i = 0; i < g->states; i++) {
2068                         int j;
2069                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2070                                 i);
2071                         struct symset gt = g->statetab[i]->go_to;
2072                         for (j = 0; j < gt.cnt; j++)
2073                                 fprintf(f, "\t{ %d, %d },\n",
2074                                         gt.syms[j], gt.data[j]);
2075                         fprintf(f, "};\n");
2076                 }
2077         }
2078
2079         static void gen_states(FILE *f, struct grammar *g)
2080         {
2081                 int i;
2082                 fprintf(f, "#line 0 \"gen_states\"\n");
2083                 fprintf(f, "static const struct state states[] = {\n");
2084                 for (i = 0; i < g->states; i++) {
2085                         struct itemset *is = g->statetab[i];
2086                         int j, prod = -1, prod_len;
2087
2088                         for (j = 0; j < is->items.cnt; j++) {
2089                                 int itm = is->items.syms[j];
2090                                 int p = item_prod(itm);
2091                                 int bp = item_dot(itm);
2092                                 struct production *pr = g->productions[p];
2093
2094                                 if (bp < pr->body_size)
2095                                         continue;
2096                                 /* This is what we reduce - choose longest */
2097                                 if (prod < 0 || prod_len < pr->body_size) {
2098                                         prod = p;
2099                                         prod_len = pr->body_size;
2100                                 }
2101                         }
2102
2103                         if (prod >= 0)
2104                                 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2105                                         i, is->go_to.cnt, i, prod,
2106                                         g->productions[prod]->body_size,
2107                                         g->productions[prod]->head->num,
2108                                         is->starts_line,
2109                                         g->productions[prod]->line_like,
2110                                         is->min_prefix);
2111                         else
2112                                 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2113                                         i, is->go_to.cnt, i,
2114                                         is->starts_line, is->min_prefix);
2115                 }
2116                 fprintf(f, "};\n\n");
2117         }
2118
2119 ### The `do_reduce` function and the code
2120
2121 When the parser engine decides to reduce a production, it calls
2122 `do_reduce` which runs the code that was included with the production,
2123 if any.
2124
2125 This code needs to be able to store data somewhere.  Rather than
2126 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2127 buffer and have `do_reduce` return the size to be saved.
2128
2129 In order for the code to access "global" context, we pass in the
2130 "config" pointer that was passed to the parser function.  If the `struct
2131 token_config` is embedded in some larger structure, the reducing code
2132 can access the larger structure using pointer manipulation.  Performing
2133 that pointer manipulation, and any other common code or variables for
2134 all reduce actions, can be provided in code node called "reduce" which
2135 is passed around in `pre_reduce` which you might have already noticed.
2136
2137 The code fragments require translation when written out.  Any `$N` needs
2138 to be converted to a reference either to that buffer (if `$0`) or to the
2139 structure returned by a previous reduction.  These pointers need to be
2140 cast to the appropriate type for each access.  All this is handled in
2141 `gen_code`.
2142
2143 `gen_code` also allows symbol references to contain a '`<`' as in
2144 '`$<2`'.  This is particularly useful for references (or pointers), but
2145 can be used with structures too.  The `<` implies that the value is
2146 being moved out, so the object will not be automatically freed.  It is
2147 equivalent to assigning `NULL` to the pointer or filling a structure
2148 with zeros.
2149
2150 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2151 and possibly a number.  A number by itself (other than zero) selects a
2152 symbol from the body of the production.  A sequence of letters selects
2153 the shortest symbol in the body which contains those letters in the
2154 given order.  If a number follows the letters, then a later occurrence
2155 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2156 attached to the second occurrence of the shortest symbol which contains
2157 an `A` followed by a `B`.  If there is no unique shortest system, or if
2158 the number given is too large, then the symbol reference is not
2159 transformed, and will cause an error when the code is compiled.
2160
2161 ###### functions
2162
2163         static int textchr(struct text t, char c, int s)
2164         {
2165                 int i;
2166                 for (i = s; i < t.len; i++)
2167                         if (t.txt[i] == c)
2168                                 return i;
2169                 return -1;
2170         }
2171
2172         static int subseq_match(char *seq, int slen, struct text name)
2173         {
2174                 int st = 0;
2175                 while (slen > 0) {
2176                         st = textchr(name, *seq, st);
2177                         if (st < 0)
2178                                 return 0;
2179                         slen -= 1;
2180                         seq += 1;
2181                         st += 1;
2182                 }
2183                 return 1;
2184         }
2185
2186         static int choose_sym(char **namep, int len, struct production *p)
2187         {
2188                 char *name = *namep;
2189                 char *nam = name;
2190                 int namlen;
2191                 int n = 0;
2192                 int i, s, slen;
2193                 char c;
2194
2195                 c = *name;
2196                 while (len > 0 &&
2197                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2198                         name += 1;
2199                         len -= 1;
2200                         c = *name;
2201                 }
2202                 namlen = name-nam;
2203                 while (len > 0 && (c >= '0' && c <= '9')) {
2204                         name += 1;
2205                         len -= 1;
2206                         n = n * 10 + (c - '0');
2207                         c = *name;
2208                 }
2209                 if (namlen == 0) {
2210                         if (name == *namep || n > p->body_size)
2211                                 return -1;
2212                         *namep = name;
2213                         return n;
2214                 }
2215                 slen = 0; s = -1;
2216                 for (i = 0; i < p->body_size; i++) {
2217                         if (!subseq_match(nam, namlen, p->body[i]->name))
2218                                 continue;
2219                         if (slen == 0 || p->body[i]->name.len < slen) {
2220                                 s = i;
2221                                 slen = p->body[i]->name.len;
2222                         }
2223                         if (s >= 0 && p->body[i] != p->body[s] &&
2224                             p->body[i]->name.len == p->body[s]->name.len)
2225                                 /* not unique, so s cannot be used */
2226                                 s = -1;
2227                 }
2228                 if (s < 0)
2229                         return -1;
2230                 if (n == 0)
2231                         n = 1;
2232                 for (i = 0; i < p->body_size; i++)
2233                         if (p->body[i] == p->body[s]) {
2234                                 n -= 1;
2235                                 if (n == 0)
2236                                         break;
2237                         }
2238                 if (n > 0 || i == p->body_size)
2239                         return -1;
2240                 *namep = name;
2241                 return i + 1;
2242         }
2243
2244         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2245         {
2246                 char *c;
2247                 char *used = calloc(1, p->body_size);
2248                 int i;
2249
2250                 fprintf(f, "\t\t\t");
2251                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2252                         int n;
2253                         int use = 0;
2254                         if (*c != '$') {
2255                                 fputc(*c, f);
2256                                 if (*c == '\n')
2257                                         fputs("\t\t\t", f);
2258                                 continue;
2259                         }
2260                         c++;
2261                         if (*c == '<') {
2262                                 use = 1;
2263                                 c++;
2264                         }
2265                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2266                         if (n < 0) {
2267                                 fputc('$', f);
2268                                 if (use)
2269                                         fputc('<', f);
2270                                 fputc(*c, f);
2271                                 continue;
2272                         }
2273                         if (n == 0)
2274                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2275                                         p->head->struct_name.len,
2276                                         p->head->struct_name.txt,
2277                                         p->head->isref ? "*":"");
2278                         else if (p->body[n-1]->type == Terminal)
2279                                 fprintf(f, "(*(struct token *)body[%d])",
2280                                         n-1);
2281                         else if (p->body[n-1]->struct_name.txt == NULL)
2282                                 fprintf(f, "$%d", n);
2283                         else {
2284                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2285                                         p->body[n-1]->struct_name.len,
2286                                         p->body[n-1]->struct_name.txt,
2287                                         p->body[n-1]->isref ? "*":"", n-1);
2288                                 used[n-1] = use;
2289                         }
2290                         c -= 1;
2291                 }
2292                 fputs("\n", f);
2293                 for (i = 0; i < p->body_size; i++) {
2294                         if (p->body[i]->struct_name.txt &&
2295                             used[i]) {
2296                                 // assume this has been copied out
2297                                 if (p->body[i]->isref)
2298                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2299                                 else
2300                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2301                         }
2302                 }
2303                 free(used);
2304         }
2305
2306 ###### functions
2307
2308         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2309                                struct code_node *pre_reduce)
2310         {
2311                 int i;
2312                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2313                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2314                 fprintf(f, "{\n");
2315                 fprintf(f, "\tint ret_size = 0;\n");
2316                 if (pre_reduce)
2317                         code_node_print(f, pre_reduce, file);
2318
2319                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2320                 fprintf(f, "\tswitch(prod) {\n");
2321                 for (i = 0; i < g->production_count; i++) {
2322                         struct production *p = g->productions[i];
2323                         fprintf(f, "\tcase %d:\n", i);
2324
2325                         if (p->code.txt) {
2326                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2327                                 gen_code(p, f, g);
2328                         }
2329
2330                         if (p->head->struct_name.txt)
2331                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2332                                         p->head->struct_name.len,
2333                                         p->head->struct_name.txt,
2334                                         p->head->isref ? "*":"");
2335
2336                         fprintf(f, "\t\tbreak;\n");
2337                 }
2338                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2339         }
2340
2341 ### `do_free`
2342
2343 As each non-terminal can potentially cause a different type of data
2344 structure to be allocated and filled in, we need to be able to free it when
2345 done.
2346
2347 It is particularly important to have fine control over freeing during error
2348 recovery where individual stack frames might need to be freed.
2349
2350 For this, the grammar author is required to defined a `free_XX` function for
2351 each structure that is used by a non-terminal.  `do_free` will call whichever
2352 is appropriate given a symbol number, and will call `free` (as is
2353 appropriate for tokens) on any terminal symbol.
2354
2355 ###### functions
2356
2357         static void gen_free(FILE *f, struct grammar *g)
2358         {
2359                 int i;
2360
2361                 fprintf(f, "#line 0 \"gen_free\"\n");
2362                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2363                 fprintf(f, "{\n");
2364                 fprintf(f, "\tif (!asn) return;\n");
2365                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2366                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2367                 fprintf(f, "\tswitch(sym) {\n");
2368
2369                 for (i = 0; i < g->num_syms; i++) {
2370                         struct symbol *s = g->symtab[i];
2371                         if (!s ||
2372                             s->type != Nonterminal ||
2373                             s->struct_name.txt == NULL)
2374                                 continue;
2375
2376                         fprintf(f, "\tcase %d:\n", s->num);
2377                         if (s->isref) {
2378                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2379                                         s->struct_name.len,
2380                                         s->struct_name.txt);
2381                                 fprintf(f, "\t\tfree(asn);\n");
2382                         } else
2383                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2384                                         s->struct_name.len,
2385                                         s->struct_name.txt);
2386                         fprintf(f, "\t\tbreak;\n");
2387                 }
2388                 fprintf(f, "\t}\n}\n\n");
2389         }
2390
2391 ## The main routine.
2392
2393 There are three key parts to the "main" function of parsergen: processing
2394 the arguments, loading the grammar file, and dealing with the grammar.
2395
2396 ### Argument processing.
2397
2398 Command line options allow the selection of analysis mode, name of output
2399 file, and whether or not a report should be generated.  By default we create
2400 a report only if no code output was requested.
2401
2402 The `parse_XX` function name uses the basename of the output file which
2403 should not have a suffix (`.c`).  `.c` is added to the given name for the
2404 code, and `.h` is added for the header (if header text is specifed in the
2405 grammar file).
2406
2407 ###### includes
2408         #include <getopt.h>
2409
2410 ###### declarations
2411         static const struct option long_options[] = {
2412                 { "LR0",        0, NULL, '0' },
2413                 { "LR05",       0, NULL, '5' },
2414                 { "SLR",        0, NULL, 'S' },
2415                 { "LALR",       0, NULL, 'L' },
2416                 { "LR1",        0, NULL, '1' },
2417                 { "tag",        1, NULL, 't' },
2418                 { "report",     0, NULL, 'R' },
2419                 { "output",     1, NULL, 'o' },
2420                 { NULL,         0, NULL, 0   }
2421         };
2422         const char *options = "05SL1t:Ro:";
2423
2424 ###### process arguments
2425         int opt;
2426         char *outfile = NULL;
2427         char *infile;
2428         char *name;
2429         char *tag = NULL;
2430         int report = 1;
2431         enum grammar_type type = LR05;
2432         while ((opt = getopt_long(argc, argv, options,
2433                                   long_options, NULL)) != -1) {
2434                 switch(opt) {
2435                 case '0':
2436                         type = LR0; break;
2437                 case '5':
2438                         type = LR05; break;
2439                 case 'S':
2440                         type = SLR; break;
2441                 case 'L':
2442                         type = LALR; break;
2443                 case '1':
2444                         type = LR1; break;
2445                 case 'R':
2446                         report = 2; break;
2447                 case 'o':
2448                         outfile = optarg; break;
2449                 case 't':
2450                         tag = optarg; break;
2451                 default:
2452                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2453                         exit(1);
2454                 }
2455         }
2456         if (optind < argc)
2457                 infile = argv[optind++];
2458         else {
2459                 fprintf(stderr, "No input file given\n");
2460                 exit(1);
2461         }
2462         if (outfile && report == 1)
2463                 report = 0;
2464         name = outfile;
2465         if (name && strchr(name, '/'))
2466                 name = strrchr(name, '/')+1;
2467
2468         if (optind < argc) {
2469                 fprintf(stderr, "Excess command line arguments\n");
2470                 exit(1);
2471         }
2472
2473 ### Loading the grammar file
2474
2475 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2476 map it.
2477
2478 Once we have extracted the code (with `mdcode`) we expect to find three
2479 or four sections: header, code, grammar, reduce.
2480 Anything else that is not excluded by the `--tag` option is an error.
2481
2482 "header", "code", and "reduce" are optional, though it is hard to build
2483 a working parser without the first two.  "grammar" must be provided.
2484
2485 ###### includes
2486         #include <fcntl.h>
2487         #include <sys/mman.h>
2488         #include <errno.h>
2489
2490 ###### functions
2491         static int errs;
2492         static void pr_err(char *msg)
2493         {
2494                 errs++;
2495                 fprintf(stderr, "%s\n", msg);
2496         }
2497
2498 ###### load file
2499         struct section *table;
2500         int fd;
2501         int len;
2502         char *file;
2503         fd = open(infile, O_RDONLY);
2504         if (fd < 0) {
2505                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2506                         infile, strerror(errno));
2507                 exit(1);
2508         }
2509         len = lseek(fd, 0, 2);
2510         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2511         table = code_extract(file, file+len, pr_err);
2512
2513         struct code_node *hdr = NULL;
2514         struct code_node *code = NULL;
2515         struct code_node *gram = NULL;
2516         struct code_node *pre_reduce = NULL;
2517         for (s = table; s; s = s->next) {
2518                 struct text sec = s->section;
2519                 if (tag && !strip_tag(&sec, tag))
2520                         continue;
2521                 if (text_is(sec, "header"))
2522                         hdr = s->code;
2523                 else if (text_is(sec, "code"))
2524                         code = s->code;
2525                 else if (text_is(sec, "grammar"))
2526                         gram = s->code;
2527                 else if (text_is(sec, "reduce"))
2528                         pre_reduce = s->code;
2529                 else {
2530                         fprintf(stderr, "Unknown content section: %.*s\n",
2531                                 s->section.len, s->section.txt);
2532                         rv |= 2;
2533                 }
2534         }
2535
2536 ### Processing the input
2537
2538 As we need to append an extention to a filename and then open it for
2539 writing, and we need to do this twice, it helps to have a separate function.
2540
2541 ###### functions
2542
2543         static FILE *open_ext(char *base, char *ext)
2544         {
2545                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2546                 FILE *f;
2547                 strcat(strcpy(fn, base), ext);
2548                 f = fopen(fn, "w");
2549                 free(fn);
2550                 return f;
2551         }
2552
2553 If we can read the grammar, then we analyse and optionally report on it.  If
2554 the report finds conflicts we will exit with an error status.
2555
2556 ###### process input
2557         struct grammar *g = NULL;
2558         if (gram == NULL) {
2559                 fprintf(stderr, "No grammar section provided\n");
2560                 rv |= 2;
2561         } else {
2562                 g = grammar_read(gram);
2563                 if (!g) {
2564                         fprintf(stderr, "Failure to parse grammar\n");
2565                         rv |= 2;
2566                 }
2567         }
2568         if (g) {
2569                 grammar_analyse(g, type);
2570                 if (report)
2571                         if (grammar_report(g, type))
2572                                 rv |= 1;
2573         }
2574
2575 If a "headers" section is defined, we write it out and include a declaration
2576 for the `parse_XX` function so it can be used from separate code.
2577
2578         if (rv == 0 && hdr && outfile) {
2579                 FILE *f = open_ext(outfile, ".h");
2580                 if (f) {
2581                         code_node_print(f, hdr, infile);
2582                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2583                                 name);
2584                         fclose(f);
2585                 } else {
2586                         fprintf(stderr, "Cannot create %s.h\n",
2587                                 outfile);
2588                         rv |= 4;
2589                 }
2590         }
2591
2592 And if all goes well and an output file was provided, we create the `.c`
2593 file with the code section (if any) and the parser tables and function.
2594
2595         if (rv == 0 && outfile) {
2596                 FILE *f = open_ext(outfile, ".c");
2597                 if (f) {
2598                         if (code)
2599                                 code_node_print(f, code, infile);
2600                         gen_parser(f, g, infile, name, pre_reduce);
2601                         fclose(f);
2602                 } else {
2603                         fprintf(stderr, "Cannot create %s.c\n",
2604                                 outfile);
2605                         rv |= 4;
2606                 }
2607         }
2608
2609 And that about wraps it up.  We need to set the locale so that UTF-8 is
2610 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2611
2612 ###### File: parsergen.mk
2613         parsergen : parsergen.o libscanner.o libmdcode.o
2614                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2615
2616 ###### includes
2617         #include <locale.h>
2618
2619 ###### main
2620
2621         int main(int argc, char *argv[])
2622         {
2623                 struct section *s;
2624                 int rv = 0;
2625
2626                 setlocale(LC_ALL,"");
2627
2628                 ## process arguments
2629                 ## load file
2630                 ## process input
2631
2632                 return rv;
2633         }
2634
2635 ## The SHIFT/REDUCE parser
2636
2637 Having analysed the grammar and generated all the tables, we only need
2638 the shift/reduce engine to bring it all together.
2639
2640 ### Goto table lookup
2641
2642 The parser generator has nicely provided us with goto tables sorted by
2643 symbol number.  We need a binary search function to find a symbol in the
2644 table.
2645
2646 ###### parser functions
2647
2648         static int search(const struct state *l, int sym)
2649         {
2650                 int lo = 0;
2651                 int hi = l->go_to_cnt;
2652
2653                 if (hi == 0)
2654                         return -1;
2655                 while (lo + 1 < hi) {
2656                         int mid = (lo + hi) / 2;
2657                         if (l->go_to[mid].sym <= sym)
2658                                 lo = mid;
2659                         else
2660                                 hi = mid;
2661                 }
2662                 if (l->go_to[lo].sym == sym)
2663                         return l->go_to[lo].state;
2664                 else
2665                         return -1;
2666         }
2667
2668 ### Memory allocation
2669
2670 The `scanner` returns tokens in a local variable - we want them in allocated
2671 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2672 a reduce is in a large buffer.  Both of these require some allocation and
2673 copying, hence `memdup` and `tok_copy`.
2674
2675 ###### parser includes
2676         #include <memory.h>
2677
2678 ###### parser functions
2679
2680         void *memdup(void *m, int len)
2681         {
2682                 void *ret;
2683                 if (len == 0)
2684                         return NULL;
2685                 ret = malloc(len);
2686                 memcpy(ret, m, len);
2687                 return ret;
2688         }
2689
2690         static struct token *tok_copy(struct token tk)
2691         {
2692                 struct token *new = malloc(sizeof(*new));
2693                 *new = tk;
2694                 return new;
2695         }
2696
2697 ### The state stack.
2698
2699 The core data structure for the parser is the stack.  This tracks all
2700 the symbols that have been recognised or partially recognised.
2701
2702 The stack usually won't grow very large - maybe a few tens of entries.
2703 So we dynamically grow an array as required but never bother to
2704 shrink it down again.
2705
2706 We keep the stack as two separate allocations.  One, `asn_stack` stores
2707 the "abstract syntax nodes" which are created by each reduction.  When
2708 we call `do_reduce` we need to pass an array of the `asn`s of the body
2709 of the production, and by keeping a separate `asn` stack, we can just
2710 pass a pointer into this stack.
2711
2712 The other allocation stores all other stack fields of which there are
2713 several.  The `state` is the most important one and guides the parsing
2714 process.  The `sym` is nearly unnecessary as it is implicit in the
2715 state.  However when we want to free entries from the `asn_stack`, it
2716 helps to know what type they are so we can call the right freeing
2717 function.  The symbol leads us to the right free function through
2718 `do_free`.
2719
2720 The `indents` count tracks the line indents with-in the symbol or
2721 immediately follow it.  These are used to allow indent information to
2722 guide parsing and error recovery.
2723
2724 `since_newline` tracks how many stack frames since the last
2725 start-of-line (whether indented or not).  So if `since_newline` is
2726 zero, then this symbol is at the start of a line.  Similarly
2727 `since_indent` counts the number of states since an indent, it is zero
2728 precisely when `indents` is not zero.
2729
2730 `newline_permitted` keeps track of whether newlines should be ignored
2731 or not.
2732
2733 The stack is most properly seen as alternating states and symbols -
2734 states, like the 'DOT' in items, are between symbols.  Each frame in
2735 our stack holds a state and the symbol that was before it.  The
2736 bottom of stack holds the start state but no symbol, as nothing came
2737 before the beginning.  As we need to store some value, `TK_eof` is used
2738 to mark the beginning of the file as well as the end.
2739
2740 ###### parser functions
2741
2742         struct parser {
2743                 struct frame {
2744                         short state;
2745                         short newline_permitted;
2746
2747                         short sym;
2748                         short indents;
2749                         short since_newline;
2750                         short since_indent;
2751                 } *stack;
2752                 void **asn_stack;
2753                 int stack_size;
2754                 int tos;
2755         };
2756
2757 #### Shift and pop
2758
2759 Two operations are needed on the stack - shift (which is like push) and pop.
2760
2761 Shift applies not only to terminals but also to non-terminals.  When we
2762 reduce a production we will pop off frames corresponding to the body
2763 symbols, then push on a frame for the head of the production.  This last
2764 is exactly the same process as shifting in a terminal so we use the same
2765 function for both.  In both cases we provide the symbol, the number of
2766 indents the symbol contains (which will be zero for a terminal symbol)
2767 and a flag indicating the the symbol was at (or was reduced from a
2768 symbol which was at) the start of a line.  The state is deduced from the
2769 current top-of-stack state and the new symbol.
2770
2771 To simplify other code we arrange for `shift` to fail if there is no `goto`
2772 state for the symbol.  This is useful in basic parsing due to our design
2773 that we shift when we can, and reduce when we cannot.  So the `shift`
2774 function reports if it could.
2775
2776 `shift` is also used to push state zero onto the stack, so if the
2777 stack is empty, it always chooses zero as the next state.
2778
2779 So `shift` finds the next state.  If that succeeds it extends the
2780 allocations if needed and pushes all the information onto the stacks.
2781
2782 Newlines are permitted after a `starts_line` state until an internal
2783 indent.  If the new frame has neither a `starts_line` state nor an
2784 indent, newlines are permitted if the previous stack frame permitted
2785 them.
2786
2787 ###### parser functions
2788
2789         static int shift(struct parser *p,
2790                          short sym, short indents, short start_of_line,
2791                          void *asn,
2792                          const struct state states[])
2793         {
2794                 // Push an entry onto the stack
2795                 struct frame next = {0};
2796                 int newstate = p->tos
2797                         ? search(&states[p->stack[p->tos-1].state],
2798                                  sym)
2799                         : 0;
2800                 if (newstate < 0)
2801                         return 0;
2802                 if (p->tos >= p->stack_size) {
2803                         p->stack_size += 10;
2804                         p->stack = realloc(p->stack, p->stack_size
2805                                            * sizeof(p->stack[0]));
2806                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2807                                            * sizeof(p->asn_stack[0]));
2808                 }
2809                 next.sym = sym;
2810                 next.indents = indents;
2811                 next.state = newstate;
2812                 if (states[newstate].starts_line)
2813                         next.newline_permitted = 1;
2814                 else if (indents)
2815                         next.newline_permitted = 0;
2816                 else if (p->tos)
2817                         next.newline_permitted =
2818                                 p->stack[p->tos-1].newline_permitted;
2819                 else
2820                         next.newline_permitted = 0;
2821
2822                 if (!start_of_line) {
2823                         if (p->tos)
2824                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2825                         else
2826                                 next.since_newline = 1;
2827                 }
2828                 if (indents)
2829                         next.since_indent = 0;
2830                 else if (p->tos)
2831                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2832                 else
2833                         next.since_indent = 1;
2834
2835                 p->stack[p->tos] = next;
2836                 p->asn_stack[p->tos] = asn;
2837                 p->tos++;
2838                 return 1;
2839         }
2840
2841 `pop` primarily moves the top of stack (`tos`) back down the required
2842 amount and frees any `asn` entries that need to be freed.  It also
2843 collects a summary of the indents and line starts in the symbols that
2844 are being removed. It is called _after_ we reduce a production, just
2845 before we `shift` the nonterminal in.
2846
2847 ###### parser functions
2848
2849         static int pop(struct parser *p, int num,
2850                        short *start_of_line,
2851                        void(*do_free)(short sym, void *asn))
2852         {
2853                 int i;
2854                 short indents = 0;
2855                 int sol = 0;
2856
2857                 p->tos -= num;
2858                 for (i = 0; i < num; i++) {
2859                         sol |= !p->stack[p->tos+i].since_newline;
2860                         indents += p->stack[p->tos+i].indents;
2861                         do_free(p->stack[p->tos+i].sym,
2862                                 p->asn_stack[p->tos+i]);
2863                 }
2864                 if (start_of_line)
2865                         *start_of_line = sol;
2866                 return indents;
2867         }
2868
2869 ### The heart of the parser.
2870
2871 Now we have the parser.  For each token we might shift it, trigger a
2872 reduction, or start error handling.  2D tokens (IN, OUT, EOL) also need
2873 to be handled.
2874
2875 We return whatever `asn` was returned by reducing production zero.
2876
2877 If we can neither shift nor reduce we have an error to handle.  We pop
2878 single entries off the stack until we can shift the `TK_error` symbol,
2879 then drop input tokens until we find one we can shift into the new error
2880 state.  We need to ensure that something is dropped or shifted after an
2881 error, or we could get into an infinite loop, only shifting in
2882 `TK_error`, then reducing.  So we track if there has been a shift since
2883 the last error, and if not the next error always discards one token.
2884
2885 When we find `TK_in` and `TK_out` tokens which report indents we need
2886 to handle them directly as the grammar cannot express what we want to
2887 do with them.
2888
2889 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2890 record how many indents there are following the previous token.
2891
2892 `TK_out` tokens must be canceled against an indent count
2893 within the stack.  If we can reduce some symbols that are all since
2894 the most recent indent, then we do that first.  If the minimum prefix
2895 of the current state then extends back before the most recent indent,
2896 that indent can be cancelled.  If the minimum prefix is shorter then
2897 the indent had ended prematurely and we must start error handling, which
2898 is still a work-in-progress.
2899
2900 `TK_newline` tokens are ignored unless the top stack frame records
2901 that they are permitted.  In that case they will not be considered for
2902 shifting if it is possible to reduce some symbols that are all since
2903 the most recent start of line.  This is how a newline forcibly
2904 terminates any line-like structure - we try to reduce down to at most
2905 one symbol for each line where newlines are allowed.
2906 A consequence of this is that a rule like
2907
2908 ###### Example: newlines - broken
2909
2910         Newlines ->
2911                 | NEWLINE Newlines
2912         IfStatement -> Newlines if ....
2913
2914 cannot work, as the NEWLINE will never be shifted as the empty string
2915 will be reduced first.  Optional sets of newlines need to be include
2916 in the thing that preceed:
2917
2918 ###### Example: newlines - works
2919
2920         If -> if
2921                 | NEWLINE If
2922         IfStatement -> If ....
2923
2924 Here the NEWLINE will be shifted because nothing can be reduced until
2925 the `if` is seen.
2926
2927 When during error handling we discard tokens read in, we want to keep
2928 discarding until we see one that is recognised.  If we had a full set
2929 of LR(1) grammar states, this would mean looking in the look-ahead set,
2930 but we don't keep a full look-ahead set.  We only record the subset
2931 that leads to SHIFT.  We can, however, deduce the look-ahead set by
2932 looking at the SHIFT subsets for all states that we can get to by
2933 reducing zero or more times.  So we need a little function which
2934 checks if a given token is in any of these look-ahead sets.
2935
2936 ###### parser includes
2937         #include "parser.h"
2938
2939 ###### parser_run
2940
2941         static int in_lookahead(struct token *tk, const struct state *states, int state)
2942         {
2943                 while (state >= 0) {
2944                         if (search(&states[state], tk->num) >= 0)
2945                                 return 1;
2946                         if (states[state].reduce_prod < 0)
2947                                 return 0;
2948                         state = search(&states[state], states[state].reduce_sym);
2949                 }
2950                 return 0;
2951         }
2952
2953         void *parser_run(struct token_state *tokens,
2954                          const struct state states[],
2955                          int (*do_reduce)(int, void**, struct token_config*, void*),
2956                          void (*do_free)(short, void*),
2957                          FILE *trace, const char *non_term[],
2958                          struct token_config *config)
2959         {
2960                 struct parser p = { 0 };
2961                 struct token *tk = NULL;
2962                 int accepted = 0;
2963                 int shift_since_err = 1;
2964                 void *ret = NULL;
2965
2966                 shift(&p, TK_eof, 0, 1, NULL, states);
2967                 while (!accepted && p.tos > 0) {
2968                         struct token *err_tk;
2969                         struct frame *tos = &p.stack[p.tos-1];
2970                         if (!tk)
2971                                 tk = tok_copy(token_next(tokens));
2972                         parser_trace(trace, &p,
2973                                      tk, states, non_term, config->known_count);
2974
2975                         if (tk->num == TK_in) {
2976                                 tos->indents += 1;
2977                                 tos->since_newline = 0;
2978                                 tos->since_indent = 0;
2979                                 if (!states[tos->state].starts_line)
2980                                         tos->newline_permitted = 0;
2981                                 free(tk);
2982                                 tk = NULL;
2983                                 parser_trace_action(trace, "Record");
2984                                 continue;
2985                         }
2986                         if (tk->num == TK_out) {
2987                                 if (states[tos->state].reduce_size >= 0 &&
2988                                     states[tos->state].reduce_size <= tos->since_indent)
2989                                         goto force_reduce;
2990                                 if (states[tos->state].min_prefix >= tos->since_indent) {
2991                                         // OK to cancel
2992                                         struct frame *in = tos - tos->since_indent;
2993                                         in->indents -= 1;
2994                                         if (in->indents == 0) {
2995                                                 /* Reassess since_indent and newline_permitted */
2996                                                 if (in > p.stack) {
2997                                                         in->since_indent = in[-1].since_indent + 1;
2998                                                         in->newline_permitted = in[-1].newline_permitted;
2999                                                 } else {
3000                                                         in->since_indent = 0;
3001                                                         in->newline_permitted = 0;
3002                                                 }
3003                                                 if (states[in->state].starts_line)
3004                                                         in->newline_permitted = 1;
3005                                                 while (in < tos) {
3006                                                         in += 1;
3007                                                         in->since_indent = in[-1].since_indent + 1;
3008                                                         if (states[in->state].starts_line)
3009                                                                 in->newline_permitted = 1;
3010                                                         else
3011                                                                 in->newline_permitted = in[-1].newline_permitted;
3012                                                 }
3013                                         }
3014                                         free(tk);
3015                                         tk = NULL;
3016                                         parser_trace_action(trace, "Cancel");
3017                                         continue;
3018                                 }
3019                                 // fall through to error handling as both SHIFT and REDUCE
3020                                 // will fail.
3021                         }
3022                         if (tk->num == TK_newline) {
3023                                 if (!tos->newline_permitted) {
3024                                         free(tk);
3025                                         tk = NULL;
3026                                         parser_trace_action(trace, "Discard");
3027                                         continue;
3028                                 }
3029                                 if (tos->since_newline > 1 &&
3030                                     states[tos->state].reduce_size >= 0 &&
3031                                     states[tos->state].reduce_size <= tos->since_newline)
3032                                         goto force_reduce;
3033                         }
3034                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3035                                 shift_since_err = 1;
3036                                 tk = NULL;
3037                                 parser_trace_action(trace, "Shift");
3038                                 continue;
3039                         }
3040                 force_reduce:
3041                         if (states[tos->state].reduce_prod >= 0 &&
3042                             states[tos->state].newline_only &&
3043                             !(tk->num == TK_newline ||
3044                               tk->num == TK_eof ||
3045                               tk->num == TK_out ||
3046                               (tos->indents == 0 && tos->since_newline == 0))) {
3047                                 /* Anything other than newline or out or eof
3048                                  * in an error unless we are already at start
3049                                  * of line, as this production must end at EOL.
3050                                  */
3051                         } else if (states[tos->state].reduce_prod >= 0) {
3052                                 void **body;
3053                                 void *res;
3054                                 const struct state *nextstate = &states[tos->state];
3055                                 int prod = nextstate->reduce_prod;
3056                                 int size = nextstate->reduce_size;
3057                                 int bufsize;
3058                                 static char buf[16*1024];
3059                                 short indents, start_of_line;
3060
3061                                 body = p.asn_stack + (p.tos - size);
3062
3063                                 bufsize = do_reduce(prod, body, config, buf);
3064
3065                                 indents = pop(&p, size, &start_of_line,
3066                                               do_free);
3067                                 res = memdup(buf, bufsize);
3068                                 memset(buf, 0, bufsize);
3069                                 if (!shift(&p, nextstate->reduce_sym,
3070                                            indents, start_of_line,
3071                                            res, states)) {
3072                                         if (prod != 0) abort();
3073                                         accepted = 1;
3074                                         ret = res;
3075                                 }
3076                                 parser_trace_action(trace, "Reduce");
3077                                 continue;
3078                         }
3079                         /* Error. We walk up the stack until we
3080                          * find a state which will accept TK_error.
3081                          * We then shift in TK_error and see what state
3082                          * that takes us too.
3083                          * Then we discard input tokens until
3084                          * we find one that is acceptable.
3085                          */
3086                         parser_trace_action(trace, "ERROR");
3087                         short indents = 0, start_of_line;
3088
3089                         err_tk = tok_copy(*tk);
3090                         while (p.tos > 0 &&
3091                                shift(&p, TK_error, 0, 0,
3092                                      err_tk, states) == 0)
3093                                 // discard this state
3094                                 indents += pop(&p, 1, &start_of_line, do_free);
3095                         if (p.tos == 0) {
3096                                 free(err_tk);
3097                                 // no state accepted TK_error
3098                                 break;
3099                         }
3100                         if (!shift_since_err) {
3101                                 /* must discard at least one token to avoid
3102                                  * infinite loop.
3103                                  */
3104                                 if (tk->num == TK_eof)
3105                                         break;
3106                                 free(tk);
3107                                 tk = tok_copy(token_next(tokens));
3108                         }
3109                         shift_since_err = 0;
3110                         tos = &p.stack[p.tos-1];
3111                         while (!in_lookahead(tk, states, tos->state) &&
3112                                tk->num != TK_eof) {
3113                                 free(tk);
3114                                 tk = tok_copy(token_next(tokens));
3115                                 shift_since_err = 1;
3116                                 if (tk->num == TK_in)
3117                                         indents += 1;
3118                                 if (tk->num == TK_out) {
3119                                         if (indents == 0)
3120                                                 break;
3121                                         indents -= 1;
3122                                         // FIXME update since_indent here
3123                                 }
3124                         }
3125                         tos->indents += indents;
3126                 }
3127                 free(tk);
3128                 pop(&p, p.tos, NULL, do_free);
3129                 free(p.asn_stack);
3130                 free(p.stack);
3131                 return ret;
3132         }
3133
3134 ###### exported functions
3135         void *parser_run(struct token_state *tokens,
3136                          const struct state states[],
3137                          int (*do_reduce)(int, void**, struct token_config*, void*),
3138                          void (*do_free)(short, void*),
3139                          FILE *trace, const char *non_term[],
3140                          struct token_config *config);
3141
3142 ### Tracing
3143
3144 Being able to visualize the parser in action can be invaluable when
3145 debugging the parser code, or trying to understand how the parse of a
3146 particular grammar progresses.  The stack contains all the important
3147 state, so just printing out the stack every time around the parse loop
3148 can make it possible to see exactly what is happening.
3149
3150 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3151 are easily deduced from the change between consecutive lines, and the
3152 details of each state can be found by cross referencing the states list
3153 in the stack with the "report" that parsergen can generate.
3154
3155 For terminal symbols, we just dump the token.  For non-terminals we
3156 print the name of the symbol.  The look ahead token is reported at the
3157 end inside square brackets.
3158
3159 ###### parser functions
3160
3161         static char *reserved_words[] = {
3162                 [TK_error]        = "ERROR",
3163                 [TK_in]           = "IN",
3164                 [TK_out]          = "OUT",
3165                 [TK_newline]      = "NEWLINE",
3166                 [TK_eof]          = "$eof",
3167         };
3168         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3169         {
3170                 fprintf(trace, "(%d", f->state);
3171                 if (states[f->state].starts_line)
3172                         fprintf(trace, "s");
3173                 if (f->newline_permitted)
3174                         fprintf(trace, "n%d", f->since_newline);
3175                 fprintf(trace, ") ");
3176         }
3177
3178         void parser_trace(FILE *trace, struct parser *p,
3179                           struct token *tk, const struct state states[],
3180                           const char *non_term[], int knowns)
3181         {
3182                 int i;
3183                 if (!trace)
3184                         return;
3185                 for (i = 0; i < p->tos; i++) {
3186                         struct frame *f = &p->stack[i];
3187                         if (i) {
3188                                 int sym = f->sym;
3189                                 if (sym < TK_reserved &&
3190                                     reserved_words[sym] != NULL)
3191                                         fputs(reserved_words[sym], trace);
3192                                 else if (sym < TK_reserved + knowns) {
3193                                         struct token *t = p->asn_stack[i];
3194                                         text_dump(trace, t->txt, 20);
3195                                 } else
3196                                         fputs(non_term[sym - TK_reserved - knowns],
3197                                               trace);
3198                                 if (f->indents)
3199                                         fprintf(trace, ".%d", f->indents);
3200                                 if (f->since_newline == 0)
3201                                         fputs("/", trace);
3202                                 fputs(" ", trace);
3203                         }
3204                         parser_trace_state(trace, f, states);
3205                 }
3206                 fprintf(trace, "[");
3207                 if (tk->num < TK_reserved &&
3208                     reserved_words[tk->num] != NULL)
3209                         fputs(reserved_words[tk->num], trace);
3210                 else
3211                         text_dump(trace, tk->txt, 20);
3212                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3213         }
3214
3215         void parser_trace_action(FILE *trace, char *action)
3216         {
3217                 if (trace)
3218                         fprintf(trace, " - %s\n", action);
3219         }
3220
3221 # A Worked Example
3222
3223 The obvious example for a parser is a calculator.
3224
3225 As `scanner` provides number parsing function using `libgmp` it is not much
3226 work to perform arbitrary rational number calculations.
3227
3228 This calculator takes one expression, or an equality test, per line.
3229 The results are printed and if any equality test fails, the program
3230 exits with an error.
3231
3232 ###### File: parsergen.mk
3233         calc.c calc.h : parsergen parsergen.mdc
3234                 ./parsergen --tag calc -o calc parsergen.mdc
3235         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3236                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3237         calctest : calc
3238                 ./calc parsergen.mdc
3239         demos :: calctest
3240
3241 # calc: header
3242
3243         #include "parse_number.h"
3244         // what do we use for a demo-grammar?  A calculator of course.
3245         struct number {
3246                 mpq_t val;
3247                 char tail[2];
3248                 int err;
3249         };
3250
3251 # calc: code
3252
3253         #include <stdlib.h>
3254         #include <unistd.h>
3255         #include <fcntl.h>
3256         #include <sys/mman.h>
3257         #include <stdio.h>
3258         #include <malloc.h>
3259         #include <gmp.h>
3260         #include <string.h>
3261         #include "mdcode.h"
3262         #include "scanner.h"
3263         #include "parser.h"
3264
3265         #include "calc.h"
3266
3267         static void free_number(struct number *n)
3268         {
3269                 mpq_clear(n->val);
3270                 free(n);
3271         }
3272
3273         static int text_is(struct text t, char *s)
3274         {
3275                 return (strlen(s) == t.len &&
3276                         strncmp(s, t.txt, t.len) == 0);
3277         }
3278
3279         int main(int argc, char *argv[])
3280         {
3281                 int fd = open(argv[1], O_RDONLY);
3282                 int len = lseek(fd, 0, 2);
3283                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3284                 struct section *table = code_extract(file, file+len, NULL);
3285                 struct section *s;
3286                 struct token_config config = {
3287                         .ignored = (1 << TK_line_comment)
3288                                  | (1 << TK_in)
3289                                  | (1 << TK_out),
3290                         .number_chars = ".,_+-",
3291                         .word_start = "",
3292                         .word_cont = "",
3293                 };
3294                 for (s = table; s; s = s->next)
3295                         if (text_is(s->section, "example: input"))
3296                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3297                 while (table) {
3298                         struct section *t = table->next;
3299                         code_free(table->code);
3300                         free(table);
3301                         table = t;
3302                 }
3303                 exit(0);
3304         }
3305
3306 # calc: grammar
3307
3308         $LEFT + -
3309         $LEFT * / //
3310
3311         Session -> Session Line
3312                 | Line
3313
3314         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3315                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3316                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3317                                         mpf_clear(fl);
3318                                         }
3319                                      }$
3320                 | Expression = Expression NEWLINE ${
3321                         if (mpq_equal($1.val, $3.val))
3322                                 gmp_printf("Both equal %Qd\n", $1.val);
3323                         else {
3324                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3325                                         $1.val, $3.val);
3326                                 exit(1);
3327                         }
3328                 }$
3329                 | NEWLINE ${ printf("Blank line\n"); }$
3330                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3331
3332         $number
3333         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3334                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3335                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3336                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3337                 | Expression // Expression ${ {
3338                         mpz_t z0, z1, z2;
3339                         mpq_init($0.val);
3340                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3341                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3342                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3343                         mpz_tdiv_q(z0, z1, z2);
3344                         mpq_set_z($0.val, z0);
3345                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3346                 } }$
3347                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3348                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3349
3350 # example: input
3351
3352         355/113
3353         3.1415926535 - 355/113
3354         2 + 4 * 5
3355         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3356         10 * 9 / 2
3357         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3358
3359         355//113
3360
3361         error