ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8
   9 There are several distinct sections.
  10
  11 1. `grammar_read` will read a grammar from a literate-code file and
  12    store the productions, symbols, and code fragments.
  13 2. `grammar_analyse` will take that grammar and build LR parsing
  14    states and look-ahead tables.
  15 3. `grammar_report` will present the details of the analysis
  16    in a readable format and will report any conflicts.
  17 4. `parser_generate` will write out C code files with various
  18    tables and with the code fragments arranged in useful places.
  19 5. `parser_run` is a library function intended to be linked together
  20    with the generated parser tables to complete the implementation of
  21    a parser.
  22 6. Finally `calc` is a test grammar for a simple calculator.  The
  23    `parsergen` program built from the C code in this file can extract
  24    that grammar directly from this file and process it.
  25
  26 ###### File: parsergen.c
  27         #include <unistd.h>
  28         #include <stdlib.h>
  29         #include <stdio.h>
  30         ## includes
  31         ## forward declarations
  32         ## declarations
  33         ## functions
  34         ## grammar_read
  35         ## grammar_analyse
  36         ## grammar_report
  37         ## parser_generate
  38         ## main
  39 ###### File: parser.h
  40         ## exported types
  41         ## exported functions
  42 ###### File: libparser.c
  43         #include <unistd.h>
  44         #include <stdlib.h>
  45         #include <stdio.h>
  46         ## parser includes
  47         ## parser functions
  48         ## parser_run
  49 ###### File: parsergen.mk
  50         CFLAGS += -Wall -g
  51         all :: parsergen calc
  52         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  53                 ./md2c parsergen.mdc
  54
  55 ## Reading the grammar
  56
  57 The grammar must be stored in a markdown literate code file as parsed by
  58 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  59 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  60 present, will be literally copied into the generated `.c` and `.h`
  61 files.  The third is required and contains the grammar which is
  62 tokenised with "[scanner][]".  `reduce` can provide variable
  63 declarations and common intialization code for all the reduction code
  64 fragments in the grammar.
  65
  66 If the `--tag` option is given, then any top level header that doesn't
  67 start with the tag is ignored, and the tag is striped from the rest.  So
  68 `--tag Foo` means that the four sections expected are `Foo: header`,
  69 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  70 to extract the sample calculator from this file.
  71
  72 [mdcode]: mdcode.html
  73 [scanner]: scanner.html
  74
  75 ###### includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 ###### parser includes
  80         #include "mdcode.h"
  81         #include "scanner.h"
  82
  83 The grammar contains production sets, precedence/associativity
  84 declarations, general terminal declarations, and data type declarations.
  85 These are all parsed with _ad hoc_ parsing as we don't have a parser
  86 generator yet.
  87
  88 The precedence and associativity can be set for each production, and
  89 can be inherited from symbols.  The data type (either structure or a
  90 reference to a structure) is potentially defined for each non-terminal
  91 and describes what C structure is used to store information about each
  92 symbol.
  93
  94 ###### declarations
  95         enum assoc {Left, Right, Non};
  96         char *assoc_names[] = {"Left","Right","Non"};
  97
  98         struct symbol {
  99                 struct text struct_name;
 100                 int isref;
 101                 enum assoc assoc;
 102                 unsigned short precedence;
 103                 ## symbol fields
 104         };
 105         struct production {
 106                 unsigned short precedence;
 107                 enum assoc assoc;
 108                 char line_like;
 109                 ## production fields
 110         };
 111         struct grammar {
 112                 ## grammar fields
 113         };
 114
 115 The strings reported by `mdcode` and `scanner` are `struct text` which
 116 have length rather than being nul terminated.  To help with printing and
 117 comparing we define `text_is` and `prtxt`, which should possibly go in
 118 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 119 strings which might contain control characters.
 120
 121 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 122 because that is what we need to detect tags.
 123
 124 ###### functions
 125         static int text_is(struct text t, char *s)
 126         {
 127                 return (strlen(s) == t.len &&
 128                         strncmp(s, t.txt, t.len) == 0);
 129         }
 130         static void prtxt(struct text t)
 131         {
 132                 printf("%.*s", t.len, t.txt);
 133         }
 134
 135         static int strip_tag(struct text *t, char *tag)
 136         {
 137                 int skip = strlen(tag) + 1;
 138                 if (skip >= t->len ||
 139                     strncmp(t->txt, tag, skip-1) != 0 ||
 140                     t->txt[skip-1] != ':')
 141                         return 0;
 142                 while (skip < t->len && t->txt[skip] == ' ')
 143                         skip++;
 144                 t->len -= skip;
 145                 t->txt += skip;
 146                 return 1;
 147         }
 148
 149 ### Symbols
 150
 151 Productions are comprised primarily of symbols - terminal and
 152 non-terminal.  We do not make a syntactic distinction between the two,
 153 though non-terminals must be identifiers.  Non-terminal symbols are
 154 those which appear in the head of a production, terminal symbols are
 155 those which don't.  There are also "virtual" symbols used for precedence
 156 marking discussed later, and sometimes we won't know what type a symbol
 157 is yet.
 158
 159 To help with code safety it is possible to declare the terminal symbols.
 160 If this is done, then any symbol used in a production that does not
 161 appear in the head of any production and is not declared as a terminal
 162 is treated as an error.
 163
 164 ###### forward declarations
 165         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 166         char *symtypes = "UVTN";
 167 ###### symbol fields
 168         enum symtype type;
 169 ###### grammar fields
 170         int terminals_declared;
 171
 172 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 173 table of known symbols and the resulting parser will report them as
 174 `TK_reserved + N`.  A small set of identifiers are reserved for the
 175 different token types that `scanner` can report.
 176
 177 ###### declarations
 178
 179         static struct { int num; char *name; } reserved_words[] = {
 180                 { TK_error,        "ERROR" },
 181                 { TK_number,       "NUMBER" },
 182                 { TK_ident,        "IDENTIFIER" },
 183                 { TK_mark,         "MARK" },
 184                 { TK_string,       "STRING" },
 185                 { TK_multi_string, "MULTI_STRING" },
 186                 { TK_in,           "IN" },
 187                 { TK_out,          "OUT" },
 188                 { TK_newline,      "NEWLINE" },
 189                 { TK_eof,          "$eof" },
 190         };
 191 ###### symbol fields
 192         short num;
 193
 194 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 195 recognised.  The former is automatically expected at the end of the text
 196 being parsed. The latter are always ignored by the parser.
 197
 198 All of these symbols are stored in a simple symbol table.  We use the
 199 `struct text` from `mdcode` to store the name and link them together
 200 into a sorted list using an insertion sort.
 201
 202 We don't have separate `find` and `insert` functions as any symbol we
 203 find needs to be remembered.  We simply expect `find` to always return a
 204 symbol, but its type might be `Unknown`.
 205
 206 ###### includes
 207         #include <string.h>
 208
 209 ###### symbol fields
 210         struct text name;
 211         struct symbol *next;
 212
 213 ###### grammar fields
 214         struct symbol *syms;
 215         int num_syms;
 216
 217 ###### functions
 218         static struct symbol *sym_find(struct grammar *g, struct text s)
 219         {
 220                 struct symbol **l = &g->syms;
 221                 struct symbol *n;
 222                 int cmp = 1;
 223
 224                 while (*l &&
 225                         (cmp = text_cmp((*l)->name, s)) < 0)
 226                                 l = & (*l)->next;
 227                 if (cmp == 0)
 228                         return *l;
 229                 n = calloc(1, sizeof(*n));
 230                 n->name = s;
 231                 n->type = Unknown;
 232                 n->next = *l;
 233                 n->num = -1;
 234                 *l = n;
 235                 return n;
 236         }
 237
 238         static void symbols_init(struct grammar *g)
 239         {
 240                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 241                 int i;
 242                 for (i = 0; i < entries; i++) {
 243                         struct text t;
 244                         struct symbol *s;
 245                         t.txt = reserved_words[i].name;
 246                         t.len = strlen(t.txt);
 247                         s = sym_find(g, t);
 248                         s->type = Terminal;
 249                         s->num = reserved_words[i].num;
 250                 }
 251         }
 252
 253 ### Data types and precedence.
 254
 255 Data type specification, precedence specification, and declaration of
 256 terminals are all introduced by a dollar sign at the start of the line.
 257 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 258 precedence, if it is `TERM` then the line declares terminals without
 259 precedence, otherwise it specifies a data type.
 260
 261 The data type name is simply stored and applied to the head of all
 262 subsequent productions.  It must be the name of a structure optionally
 263 preceded by an asterisk which means a reference or "pointer".  So
 264 `$expression` maps to `struct expression` and `$*statement` maps to
 265 `struct statement *`.
 266
 267 Any productions given before the first data type declaration will have
 268 no data type associated with them and can carry no information.  In
 269 order to allow other non-terminals to have no type, the data type
 270 `$void` can be given.  This does *not* mean that `struct void` will be
 271 used, but rather than no type will be associated with subsequent
 272 non-terminals.
 273
 274 The precedence line must contain a list of symbols, either terminal
 275 symbols or virtual symbols.  It can only contain symbols that have not
 276 been seen yet, so precedence declaration must precede the productions
 277 that they affect.
 278
 279 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 280 carry precedence information but are not included in the grammar.  A
 281 production can declare that it inherits the precedence of a given
 282 virtual symbol.
 283
 284 This use for `$$` precludes it from being used as a symbol in the
 285 described language.  Two other symbols: `${` and `}$` are also
 286 unavailable.
 287
 288 Each new precedence line introduces a new precedence level and
 289 declares how it associates.  This level is stored in each symbol
 290 listed and may be inherited by any production which uses the symbol.  A
 291 production inherits from the last symbol which has a precedence.
 292
 293 The symbols on the first precedence line have the lowest precedence.
 294 Subsequent lines introduce symbols with higher precedence and so bind
 295 more tightly.
 296
 297 Note that a declaration line (or "dollar line") can start with either of
 298 two different marks: "$" or "$*".  The `dollar_line()` function defined
 299 here is told which was found in the `isref` argument.
 300
 301 ###### grammar fields
 302         struct text current_type;
 303         int type_isref;
 304         int prec_levels;
 305
 306 ###### declarations
 307         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 308         static const char *known[] = { "$$", "${", "}$" };
 309
 310 ###### functions
 311         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 312         {
 313                 struct token t = token_next(ts);
 314                 char *err;
 315                 enum assoc assoc;
 316                 int term = 0;
 317                 int found;
 318
 319                 if (t.num != TK_ident) {
 320                         err = "type or assoc expected after '$'";
 321                         goto abort;
 322                 }
 323                 if (text_is(t.txt, "LEFT"))
 324                         assoc = Left;
 325                 else if (text_is(t.txt, "RIGHT"))
 326                         assoc = Right;
 327                 else if (text_is(t.txt, "NON"))
 328                         assoc = Non;
 329                 else if (text_is(t.txt, "TERM")) {
 330                         term = 1;
 331                         g->terminals_declared = 1;
 332                 } else {
 333                         g->current_type = t.txt;
 334                         g->type_isref = isref;
 335                         if (text_is(t.txt, "void"))
 336                                 g->current_type.txt = NULL;
 337                         t = token_next(ts);
 338                         if (t.num != TK_newline) {
 339                                 err = "Extra tokens after type name";
 340                                 goto abort;
 341                         }
 342                         return NULL;
 343                 }
 344
 345                 if (isref) {
 346                         err = "$* cannot be followed by a precedence";
 347                         goto abort;
 348                 }
 349
 350                 // This is a precedence or TERM line, need some symbols.
 351                 found = 0;
 352                 g->prec_levels += 1;
 353                 t = token_next(ts);
 354                 while (t.num != TK_newline) {
 355                         enum symtype type = Terminal;
 356                         struct symbol *s;
 357                         if (t.num == TK_virtual) {
 358                                 type = Virtual;
 359                                 t = token_next(ts);
 360                                 if (t.num != TK_ident) {
 361                                         err = "$$ must be followed by a word";
 362                                         goto abort;
 363                                 }
 364                                 if (term) {
 365                                         err = "Virtual symbols not permitted on $TERM line";
 366                                         goto abort;
 367                                 }
 368                         } else if (t.num != TK_ident &&
 369                                    t.num != TK_mark) {
 370                                 err = "Illegal token in precedence line";
 371                                 goto abort;
 372                         }
 373                         s = sym_find(g, t.txt);
 374                         if (s->type != Unknown) {
 375                                 err = "Symbols in precedence/TERM line must not already be known.";
 376                                 goto abort;
 377                         }
 378                         s->type = type;
 379                         if (!term) {
 380                                 s->precedence = g->prec_levels;
 381                                 s->assoc = assoc;
 382                         }
 383                         found += 1;
 384                         t = token_next(ts);
 385                 }
 386                 if (found == 0)
 387                         err = "No symbols given on precedence/TERM line";
 388                         goto abort;
 389                 return NULL;
 390         abort:
 391                 while (t.num != TK_newline && t.num != TK_eof)
 392                         t = token_next(ts);
 393                 return err;
 394         }
 395
 396 ### Productions
 397
 398 A production either starts with an identifier which is the head
 399 non-terminal, or a vertical bar (`|`) in which case this production
 400 uses the same head as the previous one.  The identifier must be
 401 followed by a `->` mark.  All productions for a given non-terminal must
 402 be grouped together with the `nonterminal ->` given only once.
 403
 404 After this a (possibly empty) sequence of identifiers and marks form
 405 the body of the production.  A virtual symbol may be given after the
 406 body by preceding it with `$$`.  If a virtual symbol is given, the
 407 precedence of the production is that for the virtual symbol.  If none
 408 is given, the precedence is inherited from the last symbol in the
 409 production which has a precedence specified.
 410
 411 After the optional precedence may come the `${` mark.  This indicates
 412 the start of a code fragment.  If present, this must be on the same
 413 line as the start of the production.
 414
 415 All of the text from the `${` through to the matching `}$` is
 416 collected and forms the code-fragment for the production.  It must all
 417 be in one `code_node` of the literate code.  The `}$` must be
 418 at the end of a line.
 419
 420 Text in the code fragment will undergo substitutions where `$N` or
 421 `$<N`,for some numeric `N` (or non-numeric indicator as described
 422 later), will be replaced with a variable holding the parse information
 423 for the particular symbol in the production.  `$0` is the head of the
 424 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 425 for a terminal symbol is `struct token`.  For a non-terminal, it is
 426 whatever has been declared for that symbol.  The `<` may be included and
 427 means that the value (usually a reference) is being moved out, so it
 428 will not automatically be freed.  The effect of using '<' is that the
 429 variable is cleared to all-zeros.
 430
 431 While building productions we will need to add to an array which needs to
 432 grow dynamically.
 433
 434 ###### functions
 435         static void array_add(void *varray, int *cnt, void *new)
 436         {
 437                 void ***array = varray;
 438                 int current = 0;
 439                 const int step = 8;
 440                 current = ((*cnt-1) | (step-1))+1;
 441                 if (*cnt == current) {
 442                         /* must grow */
 443                         current += step;
 444                         *array = realloc(*array, current * sizeof(void*));
 445                 }
 446                 (*array)[*cnt] = new;
 447                 (*cnt) += 1;
 448         }
 449
 450 Collecting the code fragment simply involves reading tokens until we
 451 hit the end token `}$`, and noting the character position of the start and
 452 the end.
 453
 454 ###### functions
 455         static struct text collect_code(struct token_state *state,
 456                                         struct token start)
 457         {
 458                 struct text code;
 459                 struct token t;
 460                 code.txt = start.txt.txt + start.txt.len;
 461                 do
 462                         t = token_next(state);
 463                 while (t.node == start.node &&
 464                        t.num != TK_close && t.num != TK_error &&
 465                        t.num != TK_eof);
 466                 if (t.num == TK_close && t.node == start.node)
 467                         code.len = t.txt.txt - code.txt;
 468                 else
 469                         code.txt = NULL;
 470                 return code;
 471         }
 472
 473 Now we have all the bits we need to parse a full production.
 474
 475 ###### includes
 476         #include <memory.h>
 477
 478 ###### grammar fields
 479         struct production **productions;
 480         int production_count;
 481
 482 ###### production fields
 483         struct symbol  *head;
 484         struct symbol **body;
 485         int             body_size;
 486         struct text     code;
 487         int             code_line;
 488
 489 ###### symbol fields
 490         int first_production;
 491
 492 ###### functions
 493         static char *parse_production(struct grammar *g,
 494                                       struct symbol *head,
 495                                       struct token_state *state)
 496         {
 497                 /* Head has already been parsed. */
 498                 struct token tk;
 499                 char *err;
 500                 struct production p, *pp;
 501
 502                 memset(&p, 0, sizeof(p));
 503                 p.head = head;
 504                 tk = token_next(state);
 505                 while (tk.num == TK_ident || tk.num == TK_mark) {
 506                         struct symbol *bs = sym_find(g, tk.txt);
 507                         if (bs->type == Unknown) {
 508                                 if (!g->terminals_declared)
 509                                         bs->type = Terminal;
 510                         }
 511                         if (bs->type == Virtual) {
 512                                 err = "Virtual symbol not permitted in production";
 513                                 goto abort;
 514                         }
 515                         if (bs->precedence) {
 516                                 p.precedence = bs->precedence;
 517                                 p.assoc = bs->assoc;
 518                         }
 519                         array_add(&p.body, &p.body_size, bs);
 520                         tk = token_next(state);
 521                 }
 522                 if (tk.num == TK_virtual) {
 523                         struct symbol *vs;
 524                         tk = token_next(state);
 525                         if (tk.num != TK_ident) {
 526                                 err = "word required after $$";
 527                                 goto abort;
 528                         }
 529                         vs = sym_find(g, tk.txt);
 530                         if (vs->num == TK_newline)
 531                                 p.line_like = 1;
 532                         else if (vs->num == TK_out)
 533                                 p.line_like = 2;
 534                         else if (vs->precedence == 0) {
 535                                 err = "symbol after $$ must have precedence";
 536                                 goto abort;
 537                         } else {
 538                                 p.precedence = vs->precedence;
 539                                 p.assoc = vs->assoc;
 540                         }
 541                         tk = token_next(state);
 542                 }
 543                 if (tk.num == TK_open) {
 544                         p.code_line = tk.line;
 545                         p.code = collect_code(state, tk);
 546                         if (p.code.txt == NULL) {
 547                                 err = "code fragment not closed properly";
 548                                 goto abort;
 549                         }
 550                         tk = token_next(state);
 551                 }
 552
 553                 if (tk.num != TK_newline && tk.num != TK_eof) {
 554                         err = "stray tokens at end of line";
 555                         goto abort;
 556                 }
 557                 pp = malloc(sizeof(*pp));
 558                 *pp = p;
 559                 array_add(&g->productions, &g->production_count, pp);
 560                 return NULL;
 561         abort:
 562                 while (tk.num != TK_newline && tk.num != TK_eof)
 563                         tk = token_next(state);
 564                 free(p.body);
 565                 return err;
 566         }
 567
 568 With the ability to parse productions and dollar-lines, we have nearly all
 569 that we need to parse a grammar from a `code_node`.
 570
 571 The head of the first production will effectively be the `start` symbol
 572 of the grammar.  However it won't _actually_ be so.  Processing the
 573 grammar is greatly simplified if the real start symbol only has a single
 574 production, and expects `$eof` as the final terminal.  So when we find
 575 the first explicit production we insert an extra implicit production as
 576 production zero which looks like
 577
 578 ###### Example: production 0
 579         $start -> START $eof
 580
 581 where `START` is the first non-terminal given.
 582
 583 ###### create production zero
 584         struct production *p = calloc(1,sizeof(*p));
 585         struct text start = {"$start",6};
 586         struct text eof = {"$eof",4};
 587         struct text code = {"$0 = $<1;", 9};
 588         p->head = sym_find(g, start);
 589         p->head->type = Nonterminal;
 590         p->head->struct_name = g->current_type;
 591         p->head->isref = g->type_isref;
 592         if (g->current_type.txt)
 593                 p->code = code;
 594         array_add(&p->body, &p->body_size, head);
 595         array_add(&p->body, &p->body_size, sym_find(g, eof));
 596         p->head->first_production = g->production_count;
 597         array_add(&g->productions, &g->production_count, p);
 598
 599 Now we are ready to read in the grammar.  We ignore comments
 600 and strings so that the marks which introduce them can be
 601 used as terminals in the grammar.  We don't ignore numbers
 602 even though we don't allow them as that causes the scanner
 603 to produce errors that the parser is better positioned to handle.
 604 We also ignore indentation, but we expect and use newlines.
 605
 606 To allow comments in the grammar, we explicitly check for "//" as the
 607 first token on the line and those lines are skipped.  "//" can still be
 608 used as a terminal anywhere that a terminal is expected.
 609
 610 ###### grammar_read
 611         static struct grammar *grammar_read(struct code_node *code)
 612         {
 613                 struct token_config conf = {
 614                         .word_start = "",
 615                         .word_cont = "",
 616                         .words_marks = known,
 617                         .known_count = sizeof(known)/sizeof(known[0]),
 618                         .number_chars = "",
 619                         .ignored = (1 << TK_line_comment)
 620                                  | (1 << TK_block_comment)
 621                                  | (0 << TK_number)
 622                                  | (1 << TK_string)
 623                                  | (1 << TK_multi_string)
 624                                  | (1 << TK_in)
 625                                  | (1 << TK_out),
 626                 };
 627
 628                 struct token_state *state = token_open(code, &conf);
 629                 struct token tk;
 630                 struct symbol *head = NULL;
 631                 struct grammar *g;
 632                 char *err = NULL;
 633
 634                 g = calloc(1, sizeof(*g));
 635                 symbols_init(g);
 636
 637                 for (tk = token_next(state); tk.num != TK_eof;
 638                      tk = token_next(state)) {
 639                         if (tk.num == TK_newline)
 640                                 continue;
 641                         if (tk.num == TK_ident) {
 642                                 // new non-terminal
 643                                 head = sym_find(g, tk.txt);
 644                                 if (head->type == Nonterminal)
 645                                         err = "This non-terminal has already be used.";
 646                                 else if (head->type == Virtual)
 647                                         err = "Virtual symbol not permitted in head of production";
 648                                 else {
 649                                         head->type = Nonterminal;
 650                                         head->struct_name = g->current_type;
 651                                         head->isref = g->type_isref;
 652                                         if (g->production_count == 0) {
 653                                                 ## create production zero
 654                                         }
 655                                         head->first_production = g->production_count;
 656                                         tk = token_next(state);
 657                                         if (tk.num == TK_mark &&
 658                                             text_is(tk.txt, "->"))
 659                                                 err = parse_production(g, head, state);
 660                                         else
 661                                                 err = "'->' missing in production";
 662                                 }
 663                         } else if (tk.num == TK_mark
 664                                    && text_is(tk.txt, "|")) {
 665                                 // another production for same non-term
 666                                 if (head)
 667                                         err = parse_production(g, head, state);
 668                                 else
 669                                         err = "First production must have a head";
 670                         } else if (tk.num == TK_mark
 671                                    && text_is(tk.txt, "$")) {
 672                                 err = dollar_line(state, g, 0);
 673                         } else if (tk.num == TK_mark
 674                                    && text_is(tk.txt, "$*")) {
 675                                 err = dollar_line(state, g, 1);
 676                         } else if (tk.num == TK_mark
 677                                    && text_is(tk.txt, "//")) {
 678                                 while (tk.num != TK_newline &&
 679                                        tk.num != TK_eof)
 680                                         tk = token_next(state);
 681                         } else {
 682                                 err = "Unrecognised token at start of line.";
 683                         }
 684                         if (err)
 685                                 goto abort;
 686                 }
 687                 token_close(state);
 688                 if (g->terminals_declared) {
 689                         struct symbol *s;
 690                         int errs = 0;
 691                         for (s = g->syms; s; s = s->next) {
 692                                 if (s->type != Unknown)
 693                                         continue;
 694                                 errs += 1;
 695                                 fprintf(stderr, "Token %.*s not declared\n",
 696                                         s->name.len, s->name.txt);
 697                         }
 698                         if (errs) {
 699                                 free(g); // FIXME free content
 700                                 g = NULL;
 701                         }
 702                 }
 703                 return g;
 704         abort:
 705                 fprintf(stderr, "Error at line %d: %s\n",
 706                         tk.line, err);
 707                 token_close(state);
 708                 free(g); // FIXME free content
 709                 return NULL;
 710         }
 711
 712 ## Analysing the grammar
 713
 714 The central task in analysing the grammar is to determine a set of
 715 states to drive the parsing process.  This will require finding various
 716 sets of symbols and of "items".  Some of these sets will need to attach
 717 information to each element in the set, so they are more like maps.
 718
 719 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 720 Many of these will be the same as each other.  To avoid memory wastage,
 721 and to simplify some comparisons of sets, these sets will be stored in a
 722 list of unique sets, each assigned a number.
 723
 724 Once we have the data structures in hand to manage these sets and lists,
 725 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
 726 then create the item sets which define the various states.
 727
 728 ### Symbol sets.
 729
 730 Though we don't only store symbols in these sets, they are the main
 731 things we store, so they are called symbol sets or "symsets".
 732
 733 A symset has a size, an array of shorts, and an optional array of data
 734 values, which are also shorts.  If the array of data is not present, we
 735 store an impossible pointer, as `NULL` is used to indicate that no
 736 memory has been allocated yet;
 737
 738 ###### declarations
 739         struct symset {
 740                 short cnt;
 741                 unsigned short *syms, *data;
 742         };
 743         #define NO_DATA ((unsigned short *)1)
 744         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 745         const struct symset INIT_DATASET = { 0, NULL, NULL };
 746
 747 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 748 using an insertion sort.  We don't explicitly record the amount of
 749 allocated space as it can be derived directly from the current `cnt`
 750 using `((cnt - 1) | 7) + 1`.
 751
 752 ###### functions
 753         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 754         {
 755                 int i;
 756                 int current = ((s->cnt-1) | 7) + 1;
 757                 if (current == s->cnt) {
 758                         current += 8;
 759                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 760                         if (s->data != NO_DATA)
 761                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 762                 }
 763                 i = s->cnt;
 764                 while (i > 0 && s->syms[i-1] > key) {
 765                         s->syms[i] = s->syms[i-1];
 766                         if (s->data != NO_DATA)
 767                                 s->data[i] = s->data[i-1];
 768                         i--;
 769                 }
 770                 s->syms[i] = key;
 771                 if (s->data != NO_DATA)
 772                         s->data[i] = val;
 773                 s->cnt += 1;
 774         }
 775
 776 Finding a symbol (or item) in a `symset` uses a simple binary search.
 777 We return the index where the value was found (so data can be accessed),
 778 or `-1` to indicate failure.
 779
 780         static int symset_find(struct symset *ss, unsigned short key)
 781         {
 782                 int lo = 0;
 783                 int hi = ss->cnt;
 784
 785                 if (hi == 0)
 786                         return -1;
 787                 while (lo + 1 < hi) {
 788                         int mid = (lo + hi) / 2;
 789                         if (ss->syms[mid] <= key)
 790                                 lo = mid;
 791                         else
 792                                 hi = mid;
 793                 }
 794                 if (ss->syms[lo] == key)
 795                         return lo;
 796                 return -1;
 797         }
 798
 799 We will often want to form the union of two symsets.  When we do, we
 800 will often want to know if anything changed (as that might mean there is
 801 more work to do).  So `symset_union` reports whether anything was added
 802 to the first set.  We use a slow quadratic approach as these sets are
 803 rarely large.  If profiling shows this to be a problem it can be
 804 optimised later.
 805
 806         static int symset_union(struct symset *a, struct symset *b)
 807         {
 808                 int i;
 809                 int added = 0;
 810                 for (i = 0; i < b->cnt; i++)
 811                         if (symset_find(a, b->syms[i]) < 0) {
 812                                 unsigned short data = 0;
 813                                 if (b->data != NO_DATA)
 814                                         data = b->data[i];
 815                                 symset_add(a, b->syms[i], data);
 816                                 added++;
 817                         }
 818                 return added;
 819         }
 820
 821 And of course we must be able to free a symset.
 822
 823         static void symset_free(struct symset ss)
 824         {
 825                 free(ss.syms);
 826                 if (ss.data != NO_DATA)
 827                         free(ss.data);
 828         }
 829
 830 ### Symset Storage
 831
 832 Some symsets are simply stored somewhere appropriate in the `grammar`
 833 data structure, others needs a bit of indirection.  This applies
 834 particularly to "LA" sets.  These will be paired with "items" in an
 835 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 836 to be stored in the `data` field, so we need a mapping from a small
 837 (short) number to an LA `symset`.
 838
 839 As mentioned earlier this involves creating a list of unique symsets.
 840
 841 For now, we just use a linear list sorted by insertion.  A skiplist
 842 would be more efficient and may be added later.
 843
 844 ###### declarations
 845
 846         struct setlist {
 847                 struct symset ss;
 848                 int num;
 849                 struct setlist *next;
 850         };
 851
 852 ###### grammar fields
 853         struct setlist *sets;
 854         int nextset;
 855
 856 ###### functions
 857
 858         static int ss_cmp(struct symset a, struct symset b)
 859         {
 860                 int i;
 861                 int diff = a.cnt - b.cnt;
 862                 if (diff)
 863                         return diff;
 864                 for (i = 0; i < a.cnt; i++) {
 865                         diff = (int)a.syms[i] - (int)b.syms[i];
 866                         if (diff)
 867                                 return diff;
 868                 }
 869                 return 0;
 870         }
 871
 872         static int save_set(struct grammar *g, struct symset ss)
 873         {
 874                 struct setlist **sl = &g->sets;
 875                 int cmp = 1;
 876                 struct setlist *s;
 877
 878                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 879                         sl = & (*sl)->next;
 880                 if (cmp == 0) {
 881                         symset_free(ss);
 882                         return (*sl)->num;
 883                 }
 884
 885                 s = malloc(sizeof(*s));
 886                 s->ss = ss;
 887                 s->num = g->nextset;
 888                 g->nextset += 1;
 889                 s->next = *sl;
 890                 *sl = s;
 891                 return s->num;
 892         }
 893
 894 Finding a set by number is currently performed by a simple linear
 895 search.  If this turns out to hurt performance, we can store the sets in
 896 a dynamic array like the productions.
 897
 898         static struct symset set_find(struct grammar *g, int num)
 899         {
 900                 struct setlist *sl = g->sets;
 901                 while (sl && sl->num != num)
 902                         sl = sl->next;
 903                 return sl->ss;
 904         }
 905
 906 ### Setting `nullable`
 907
 908 We set `nullable` on the head symbol for any production for which all
 909 the body symbols (if any) are nullable.  As this is a recursive
 910 definition, any change in the `nullable` setting means that we need to
 911 re-evaluate where it needs to be set.
 912
 913 We simply loop around performing the same calculations until no more
 914 changes happen.
 915
 916 ###### symbol fields
 917         int nullable;
 918
 919 ###### functions
 920         static void set_nullable(struct grammar *g)
 921         {
 922                 int check_again = 1;
 923                 while (check_again) {
 924                         int p;
 925                         check_again = 0;
 926                         for (p = 0; p < g->production_count; p++) {
 927                                 struct production *pr = g->productions[p];
 928                                 int s;
 929
 930                                 if (pr->head->nullable)
 931                                         continue;
 932                                 for (s = 0; s < pr->body_size; s++)
 933                                         if (! pr->body[s]->nullable)
 934                                                 break;
 935                                 if (s == pr->body_size) {
 936                                         pr->head->nullable = 1;
 937                                         check_again = 1;
 938                                 }
 939                         }
 940                 }
 941         }
 942
 943 ### Setting `line_like`
 944
 945 In order to be able to ignore newline tokens when not relevant, but
 946 still include them in the parse when needed, we will need to know
 947 which states can start a "line-like" section of code.  We ignore
 948 newlines when there is an indent since the most recent start of a
 949 line-like symbol.
 950
 951 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
 952 If a symbol cannot derive a NEWLINE, then it is only part of a line -
 953 so is word-like.  If it can derive a NEWLINE, then we consider it to
 954 be like a line.
 955
 956 Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
 957 is the head of a production that contains a line_like symbol is also a
 958 line-like symbol.  We use a new field `line_like` to record this
 959 attribute of symbols, and compute it in a repetitive manner similar to
 960 `set_nullable`.
 961
 962 ###### symbol fields
 963         int line_like;
 964
 965 ###### functions
 966         static void set_line_like(struct grammar *g)
 967         {
 968                 int check_again = 1;
 969                 g->symtab[TK_newline]->line_like = 1;
 970                 while (check_again) {
 971                         int p;
 972                         check_again = 0;
 973                         for (p = 0; p < g->production_count; p++) {
 974                                 struct production *pr = g->productions[p];
 975                                 int s;
 976
 977                                 if (pr->head->line_like)
 978                                         continue;
 979
 980                                 for (s = 0 ; s < pr->body_size; s++) {
 981                                         if (pr->body[s]->line_like) {
 982                                                 pr->head->line_like = 1;
 983                                                 check_again = 1;
 984                                                 break;
 985                                         }
 986                                 }
 987                         }
 988                 }
 989         }
 990
 991 ### Building the `first` sets
 992
 993 When calculating what can follow a particular non-terminal, we will need
 994 to know what the "first" terminal in any subsequent non-terminal might
 995 be.  So we calculate the `first` set for every non-terminal and store
 996 them in an array.  We don't bother recording the "first" set for
 997 terminals as they are trivial.
 998
 999 As the "first" for one symbol might depend on the "first" of another, we
1000 repeat the calculations until no changes happen, just like with
1001 `set_nullable`.  This makes use of the fact that `symset_union` reports
1002 if any change happens.
1003
1004 The core of this, which finds the "first" of part of a production body,
1005 will be reused for computing the "follow" sets, so we split that out
1006 into a separate function, `add_first`, which also reports if it got all
1007 the way to the end of the production without finding a non-nullable
1008 symbol.
1009
1010 ###### grammar fields
1011         struct symset *first;
1012
1013 ###### functions
1014
1015         static int add_first(struct production *pr, int start,
1016                              struct symset *target, struct grammar *g,
1017                              int *to_end)
1018         {
1019                 int s;
1020                 int changed = 0;
1021                 for (s = start; s < pr->body_size; s++) {
1022                         struct symbol *bs = pr->body[s];
1023                         if (bs->type == Terminal) {
1024                                 if (symset_find(target, bs->num) < 0) {
1025                                         symset_add(target, bs->num, 0);
1026                                         changed = 1;
1027                                 }
1028                                 break;
1029                         } else if (symset_union(target, &g->first[bs->num]))
1030                                 changed = 1;
1031                         if (!bs->nullable)
1032                                 break;
1033                 }
1034                 if (to_end)
1035                         *to_end = (s == pr->body_size);
1036                 return changed;
1037         }
1038
1039         static void build_first(struct grammar *g)
1040         {
1041                 int check_again = 1;
1042                 int s;
1043                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1044                 for (s = 0; s < g->num_syms; s++)
1045                         g->first[s] = INIT_SYMSET;
1046
1047                 while (check_again) {
1048                         int p;
1049                         check_again = 0;
1050                         for (p = 0; p < g->production_count; p++) {
1051                                 struct production *pr = g->productions[p];
1052                                 struct symset *head = &g->first[pr->head->num];
1053
1054                                 if (add_first(pr, 0, head, g, NULL))
1055                                         check_again = 1;
1056                         }
1057                 }
1058         }
1059
1060 ### Building the `follow` sets.
1061
1062 There are two different situations when we will want to generate
1063 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1064 single "follow" set for each non-terminal in the grammar.  That is what
1065 is happening here.  If we are doing an LALR or LR analysis we will want
1066 to generate a separate "LA" set for each item.  We do that later in
1067 state generation.
1068
1069 There are two parts to generating a "follow" set.  Firstly we look at
1070 every place that any non-terminal appears in the body of any production,
1071 and we find the set of possible "first" symbols after there.  This is
1072 done using `add_first` above and only needs to be done once as the
1073 "first" sets are now stable and will not change.
1074
1075 ###### grammar fields
1076         struct symset *follow;
1077
1078 ###### follow code
1079
1080         for (p = 0; p < g->production_count; p++) {
1081                 struct production *pr = g->productions[p];
1082                 int b;
1083
1084                 for (b = 0; b < pr->body_size - 1; b++) {
1085                         struct symbol *bs = pr->body[b];
1086                         if (bs->type == Terminal)
1087                                 continue;
1088                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1089                 }
1090         }
1091
1092 The second part is to add the "follow" set of the head of a production
1093 to the "follow" sets of the final symbol in the production, and any
1094 other symbol which is followed only by `nullable` symbols.  As this
1095 depends on "follow" itself we need to repeatedly perform the process
1096 until no further changes happen.
1097
1098 Rather than 2 nested loops, one that repeats the whole process until
1099 there is no change, and one that iterates of the set of productions, we
1100 combine these two functions into a single loop.
1101
1102 ###### follow code
1103
1104         for (check_again = 0, p = 0;
1105              p < g->production_count;
1106              p < g->production_count-1
1107                 ? p++ : check_again ? (check_again = 0, p = 0)
1108                               : p++) {
1109                 struct production *pr = g->productions[p];
1110                 int b;
1111
1112                 for (b = pr->body_size - 1; b >= 0; b--) {
1113                         struct symbol *bs = pr->body[b];
1114                         if (bs->type == Terminal)
1115                                 break;
1116                         if (symset_union(&g->follow[bs->num],
1117                                          &g->follow[pr->head->num]))
1118                                 check_again = 1;
1119                         if (!bs->nullable)
1120                                 break;
1121                 }
1122         }
1123
1124 We now just need to create and initialise the `follow` list to get a
1125 complete function.
1126
1127 ###### functions
1128         static void build_follow(struct grammar *g)
1129         {
1130                 int s, check_again, p;
1131                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1132                 for (s = 0; s < g->num_syms; s++)
1133                         g->follow[s] = INIT_SYMSET;
1134                 ## follow code
1135         }
1136
1137 ### Building itemsets and states
1138
1139 There are three different levels of detail that can be chosen for
1140 building the itemsets and states for the LR grammar.  They are:
1141
1142 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1143    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1144    already computed,
1145 2. LALR(1) where we build look-ahead sets with each item and merge
1146    the LA sets when we find two paths to the same "kernel" set of items.
1147 3. LR(1) where different look-ahead for any item in the set means
1148    a different item set must be created.
1149
1150 ###### forward declarations
1151         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1152
1153 We need to be able to look through existing states to see if a newly
1154 generated state already exists.  For now we use a simple sorted linked
1155 list.
1156
1157 An item is a pair of numbers: the production number and the position of
1158 "DOT", which is an index into the body of the production.  As the
1159 numbers are not enormous we can combine them into a single "short" and
1160 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1161 production number (so 2000 productions with maximum of 31 symbols in the
1162 body).
1163
1164 Comparing the itemsets will be a little different to comparing symsets
1165 as we want to do the lookup after generating the "kernel" of an
1166 itemset, so we need to ignore the offset=zero items which are added during
1167 completion.
1168
1169 To facilitate this, we modify the "DOT" number so that "0" sorts to
1170 the end of the list in the symset, and then only compare items before
1171 the first "0".
1172
1173 ###### declarations
1174         static inline unsigned short item_num(int production, int dot)
1175         {
1176                 return production | ((31-dot) << 11);
1177         }
1178         static inline int item_prod(unsigned short item)
1179         {
1180                 return item & 0x7ff;
1181         }
1182         static inline int item_dot(unsigned short item)
1183         {
1184                 return (31-(item >> 11)) & 0x1f;
1185         }
1186
1187 For LR(1) analysis we need to compare not just the itemset in a state
1188 but also the LA sets.  As we assign each unique LA set a number, we
1189 can just compare the symset and the data values together.
1190
1191 ###### functions
1192         static int itemset_cmp(struct symset a, struct symset b,
1193                                enum grammar_type type)
1194         {
1195                 int i;
1196                 int av, bv;
1197
1198                 for (i = 0;
1199                      i < a.cnt && i < b.cnt &&
1200                      item_dot(a.syms[i]) > 0 &&
1201                      item_dot(b.syms[i]) > 0;
1202                      i++) {
1203                         int diff = a.syms[i] - b.syms[i];
1204                         if (diff)
1205                                 return diff;
1206                         if (type == LR1) {
1207                                 diff = a.data[i] - b.data[i];
1208                                 if (diff)
1209                                         return diff;
1210                         }
1211                 }
1212                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1213                         av = -1;
1214                 else
1215                         av = a.syms[i];
1216                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1217                         bv = -1;
1218                 else
1219                         bv = b.syms[i];
1220                 if (av - bv)
1221                         return av - bv;
1222                 if (type < LR1 || av == -1)
1223                         return 0;
1224                 return
1225                         a.data[i] - b.data[i];
1226         }
1227
1228 It will be helpful to know if an itemset has been "completed" or not,
1229 particularly for LALR where itemsets get merged, at which point they
1230 need to be consider for completion again.  So a `completed` flag is
1231 needed.
1232
1233 For correct handling of `TK_newline` when parsing, we will need to
1234 know which states (itemsets) can occur at the start of a line, so we
1235 will record a `starts_line` flag too whenever DOT is at the start of a
1236 `line_like` symbol.
1237
1238 Finally, for handling `TK_out` we need to know whether productions in the
1239 current state started *before* the most recent indent.  A state
1240 doesn't usually keep details of individual productions, so we need to
1241 add one extra detail. `min_prefix` is the smallest non-zero number of
1242 symbols *before* DOT in any production in an itemset.  This will allow
1243 us to determine if the the most recent indent is sufficiently recent
1244 to cancel it against a `TK_out`.  If it was seen longer ago than the
1245 `min_prefix`, and if the current state cannot be reduced, then the
1246 indented section must have ended in the middle of a syntactic unit, so
1247 an error must be signaled.
1248
1249 And now we can build the list of itemsets.  The lookup routine returns
1250 both a success flag and a pointer to where in the list an insert should
1251 happen, so we don't need to search a second time.
1252
1253 ###### declarations
1254         struct itemset {
1255                 struct itemset *next;
1256                 short state;
1257                 struct symset items;
1258                 struct symset go_to;
1259                 enum assoc assoc;
1260                 unsigned short precedence;
1261                 char completed;
1262                 char starts_line;
1263                 int min_prefix;
1264         };
1265
1266 ###### grammar fields
1267         struct itemset *items;
1268         int states;
1269
1270 ###### functions
1271         static int itemset_find(struct grammar *g, struct itemset ***where,
1272                                 struct symset kernel, enum grammar_type type)
1273         {
1274                 struct itemset **ip;
1275
1276                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1277                         struct itemset *i = *ip;
1278                         int diff;
1279                         diff = itemset_cmp(i->items, kernel, type);
1280                         if (diff < 0)
1281                                 continue;
1282                         if (diff > 0)
1283                                 break;
1284                         /* found */
1285                         *where = ip;
1286                         return 1;
1287                 }
1288                 *where = ip;
1289                 return 0;
1290         }
1291
1292 Adding an itemset may require merging the LA sets if LALR analysis is
1293 happening.  If any new LA set adds any symbols that weren't in the old
1294 LA set, we clear the `completed` flag so that the dependants of this
1295 itemset will be recalculated and their LA sets updated.
1296
1297 `add_itemset` must consume the symsets it is passed, either by adding
1298 them to a data structure, of freeing them.
1299
1300         static int add_itemset(struct grammar *g, struct symset ss,
1301                                enum assoc assoc, unsigned short precedence,
1302                                enum grammar_type type)
1303         {
1304                 struct itemset **where, *is;
1305                 int i;
1306                 int found = itemset_find(g, &where, ss, type);
1307                 if (!found) {
1308                         is = calloc(1, sizeof(*is));
1309                         is->state = g->states;
1310                         g->states += 1;
1311                         is->items = ss;
1312                         is->assoc = assoc;
1313                         is->precedence = precedence;
1314                         is->next = *where;
1315                         is->go_to = INIT_DATASET;
1316                         *where = is;
1317                         return is->state;
1318                 }
1319                 is = *where;
1320                 if (type != LALR) {
1321                         symset_free(ss);
1322                         return is->state;
1323                 }
1324                 for (i = 0; i < ss.cnt; i++) {
1325                         struct symset temp = INIT_SYMSET, s;
1326                         if (ss.data[i] == is->items.data[i])
1327                                 continue;
1328                         s = set_find(g, is->items.data[i]);
1329                         symset_union(&temp, &s);
1330                         s = set_find(g, ss.data[i]);
1331                         if (symset_union(&temp, &s)) {
1332                                 is->items.data[i] = save_set(g, temp);
1333                                 is->completed = 0;
1334                         } else
1335                                 symset_free(temp);
1336                 }
1337                 symset_free(ss);
1338                 return is->state;
1339         }
1340
1341 #### The build
1342
1343 To build all the itemsets, we first insert the initial itemset made from
1344 production zero, complete each itemset, and then generate new itemsets
1345 from old until no new ones can be made.
1346
1347 Completing an itemset means finding all the items where "DOT" is
1348 followed by a nonterminal and adding "DOT=0" items for every production
1349 from that non-terminal - providing each item hasn't already been added.
1350 When we add such an item it might get inserted before the current
1351 one, so to make sure we process it we reset the loop counter to the
1352 start.
1353
1354 If LA sets are needed, the LA set for each new item is found using
1355 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1356 may be supplemented by the LA set for the item which produced the new
1357 item.
1358
1359 We also collect a set of all symbols which follow "DOT" (in `done`) as
1360 this is used in the next stage.  If any of these symbols are flagged as
1361 `line_like`, then this state must be a `starts_line` state so now is a
1362 good time to record that.
1363
1364 When itemsets are created we assign a precedence to the itemset from the
1365 complete item, if there is one.  We ignore the possibility of there
1366 being two and don't (currently) handle precedence in such grammars.
1367 When completing a grammar we ignore any item where DOT is followed by a
1368 terminal with a precedence lower than that for the itemset.  Unless the
1369 terminal has right associativity, we also ignore items where the
1370 terminal has the same precedence.  The result is that unwanted items are
1371 still in the itemset, but the terminal doesn't get into the go to set,
1372 so the item is ineffective.
1373
1374 ###### complete itemset
1375         for (i = 0; i < is->items.cnt; i++) {
1376                 int p = item_prod(is->items.syms[i]);
1377                 int bs = item_dot(is->items.syms[i]);
1378                 struct production *pr = g->productions[p];
1379                 int p2;
1380                 struct symbol *s;
1381                 struct symset LA = INIT_SYMSET;
1382                 unsigned short sn = 0;
1383                 struct symset LAnl = INIT_SYMSET;
1384                 unsigned short snnl = 0;
1385
1386                 if (is->min_prefix == 0 ||
1387                     (bs > 0 && bs < is->min_prefix))
1388                         is->min_prefix = bs;
1389                 if (bs == pr->body_size)
1390                         continue;
1391                 s = pr->body[bs];
1392                 if (s->precedence && is->precedence &&
1393                     is->precedence > s->precedence)
1394                         /* This terminal has a low precedence and
1395                          * shouldn't be shifted
1396                          */
1397                         continue;
1398                 if (s->precedence && is->precedence &&
1399                     is->precedence == s->precedence && s->assoc != Right)
1400                         /* This terminal has a matching precedence and is
1401                          * not Right-associative, so we mustn't shift it.
1402                          */
1403                         continue;
1404                 if (symset_find(&done, s->num) < 0) {
1405                         symset_add(&done, s->num, 0);
1406                 }
1407                 if (s->type != Nonterminal)
1408                         continue;
1409                 if (s->line_like)
1410                         is->starts_line = 1;
1411                 check_again = 1;
1412                 if (type >= LALR) {
1413                         // Need the LA set.
1414                         int to_end;
1415                         add_first(pr, bs+1, &LA, g, &to_end);
1416                         if (to_end) {
1417                                 struct symset ss = set_find(g, is->items.data[i]);
1418                                 symset_union(&LA, &ss);
1419                         }
1420                         sn = save_set(g, LA);
1421                         LA = set_find(g, sn);
1422                         if (symset_find(&LA, TK_newline))
1423                                 symset_add(&LAnl, TK_newline, 0);
1424                         snnl = save_set(g, LAnl);
1425                         LAnl = set_find(g, snnl);
1426                 }
1427
1428                 /* Add productions for this symbol */
1429                 for (p2 = s->first_production;
1430                      p2 < g->production_count &&
1431                       g->productions[p2]->head == s;
1432                      p2++) {
1433                         int itm = item_num(p2, 0);
1434                         int pos = symset_find(&is->items, itm);
1435                         if (pos < 0) {
1436                                 if (g->productions[p2]->line_like)
1437                                         symset_add(&is->items, itm, snnl);
1438                                 else
1439                                         symset_add(&is->items, itm, sn);
1440                                 /* Will have re-ordered, so start
1441                                  * from beginning again */
1442                                 i = -1;
1443                         } else if (type >= LALR) {
1444                                 struct symset ss = set_find(g, is->items.data[pos]);
1445                                 struct symset tmp = INIT_SYMSET;
1446                                 struct symset *la = &LA;
1447
1448                                 if (g->productions[p2]->line_like)
1449                                         la = &LAnl;
1450                                 symset_union(&tmp, &ss);
1451                                 if (symset_union(&tmp, la)) {
1452                                         is->items.data[pos] = save_set(g, tmp);
1453                                         i = -1;
1454                                 } else
1455                                         symset_free(tmp);
1456                         }
1457                 }
1458         }
1459
1460 For each symbol we found (and placed in `done`) we collect all the items
1461 for which this symbol is next, and create a new itemset with "DOT"
1462 advanced over the symbol.  This is then added to the collection of
1463 itemsets (or merged with a pre-existing itemset).
1464
1465 ###### derive itemsets
1466         // Now we have a completed itemset, so we need to
1467         // compute all the 'next' itemsets and create them
1468         // if they don't exist.
1469         for (i = 0; i < done.cnt; i++) {
1470                 int j;
1471                 unsigned short state;
1472                 struct symbol *sym = g->symtab[done.syms[i]];
1473                 enum assoc assoc = Non;
1474                 unsigned short precedence = 0;
1475                 struct symset newitemset = INIT_SYMSET;
1476                 if (type >= LALR)
1477                         newitemset = INIT_DATASET;
1478
1479                 for (j = 0; j < is->items.cnt; j++) {
1480                         int itm = is->items.syms[j];
1481                         int p = item_prod(itm);
1482                         int bp = item_dot(itm);
1483                         struct production *pr = g->productions[p];
1484                         unsigned short la = 0;
1485                         int pos;
1486
1487                         if (bp == pr->body_size)
1488                                 continue;
1489                         if (pr->body[bp] != sym)
1490                                 continue;
1491
1492                         bp += 1;
1493                         if (type >= LALR)
1494                                 la = is->items.data[j];
1495                         if (bp == pr->body_size &&
1496                             pr->precedence > 0 &&
1497                             pr->precedence > precedence) {
1498                                 // new itemset is reducible and has a precedence.
1499                                 precedence = pr->precedence;
1500                                 assoc = pr->assoc;
1501                         }
1502                         pos = symset_find(&newitemset, item_num(p, bp));
1503
1504                         if (pos < 0)
1505                                 symset_add(&newitemset, item_num(p, bp), la);
1506                         else if (type >= LALR) {
1507                                 // Need to merge la set.
1508                                 int la2 = newitemset.data[pos];
1509                                 if (la != la2) {
1510                                         struct symset ss = set_find(g, la2);
1511                                         struct symset LA = INIT_SYMSET;
1512                                         symset_union(&LA, &ss);
1513                                         ss = set_find(g, la);
1514                                         if (symset_union(&LA, &ss))
1515                                                 newitemset.data[pos] = save_set(g, LA);
1516                                         else
1517                                                 symset_free(LA);
1518                                 }
1519                         }
1520                 }
1521                 state = add_itemset(g, newitemset, assoc, precedence, type);
1522                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1523                         symset_add(&is->go_to, done.syms[i], state);
1524         }
1525
1526 All that is left is to create the initial itemset from production zero, and
1527 with `TK_eof` as the LA set.
1528
1529 ###### functions
1530         static void build_itemsets(struct grammar *g, enum grammar_type type)
1531         {
1532                 struct symset first = INIT_SYMSET;
1533                 struct itemset *is;
1534                 int check_again;
1535                 unsigned short la = 0;
1536                 if (type >= LALR) {
1537                         // LA set just has eof
1538                         struct symset eof = INIT_SYMSET;
1539                         symset_add(&eof, TK_eof, 0);
1540                         la = save_set(g, eof);
1541                         first = INIT_DATASET;
1542                 }
1543                 // production 0, offset 0 (with no data)
1544                 symset_add(&first, item_num(0, 0), la);
1545                 add_itemset(g, first, Non, 0, type);
1546                 for (check_again = 0, is = g->items;
1547                      is;
1548                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1549                         int i;
1550                         struct symset done = INIT_SYMSET;
1551                         if (is->completed)
1552                                 continue;
1553                         is->completed = 1;
1554                         check_again = 1;
1555                         ## complete itemset
1556                         ## derive itemsets
1557                         symset_free(done);
1558                 }
1559         }
1560
1561 ### Completing the analysis.
1562
1563 The exact process of analysis depends on which level was requested.  For
1564 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1565 `LR1` we need first, but not follow.  For `SLR` we need both.
1566
1567 We don't build the "action" tables that you might expect as the parser
1568 doesn't use them.  They will be calculated to some extent if needed for
1569 a report.
1570
1571 Once we have built everything we allocate arrays for the two lists:
1572 symbols and itemsets.  This allows more efficient access during
1573 reporting.  The symbols are grouped as terminals and then non-terminals,
1574 and we record the changeover point in `first_nonterm`.
1575
1576 ###### grammar fields
1577         struct symbol **symtab;
1578         struct itemset **statetab;
1579         int first_nonterm;
1580
1581 ###### grammar_analyse
1582
1583         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1584         {
1585                 struct symbol *s;
1586                 struct itemset *is;
1587                 int snum = TK_reserved;
1588                 for (s = g->syms; s; s = s->next)
1589                         if (s->num < 0 && s->type == Terminal) {
1590                                 s->num = snum;
1591                                 snum++;
1592                         }
1593                 g->first_nonterm = snum;
1594                 for (s = g->syms; s; s = s->next)
1595                         if (s->num < 0 && s->type != Virtual) {
1596                                 s->num = snum;
1597                                 snum++;
1598                         }
1599                 for (s = g->syms; s; s = s->next)
1600                         if (s->num < 0) {
1601                                 s->num = snum;
1602                                 snum++;
1603                         }
1604                 g->num_syms = snum;
1605                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1606                 for (s = g->syms; s; s = s->next)
1607                         g->symtab[s->num] = s;
1608
1609                 set_nullable(g);
1610                 set_line_like(g);
1611                 if (type >= SLR)
1612                         build_first(g);
1613
1614                 if (type == SLR)
1615                         build_follow(g);
1616
1617                 build_itemsets(g, type);
1618
1619                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1620                 for (is = g->items; is ; is = is->next)
1621                         g->statetab[is->state] = is;
1622         }
1623
1624 ## Reporting on the Grammar
1625
1626 The purpose of the report is to give the grammar developer insight into
1627 how the grammar parser will work.  It is basically a structured dump of
1628 all the tables that have been generated, plus a description of any conflicts.
1629
1630 ###### grammar_report
1631         static int grammar_report(struct grammar *g, enum grammar_type type)
1632         {
1633                 report_symbols(g);
1634                 if (g->follow)
1635                         report_follow(g);
1636                 report_itemsets(g);
1637                 return report_conflicts(g, type);
1638         }
1639
1640 Firstly we have the complete list of symbols, together with the
1641 "FIRST" set if that was generated.  We add a mark to each symbol to
1642 show if it is considered to be "line-like" (`<`), or if it is nullable (`.`).
1643
1644 ###### functions
1645
1646         static void report_symbols(struct grammar *g)
1647         {
1648                 int n;
1649                 if (g->first)
1650                         printf("SYMBOLS + FIRST:\n");
1651                 else
1652                         printf("SYMBOLS:\n");
1653
1654                 for (n = 0; n < g->num_syms; n++) {
1655                         struct symbol *s = g->symtab[n];
1656                         if (!s)
1657                                 continue;
1658
1659                         printf(" %c%c%3d%c: ",
1660                                s->nullable ? '.':' ',
1661                                s->line_like ? '<':' ',
1662                                s->num, symtypes[s->type]);
1663                         prtxt(s->name);
1664                         if (s->precedence)
1665                                 printf(" (%d%s)", s->precedence,
1666                                        assoc_names[s->assoc]);
1667
1668                         if (g->first && s->type == Nonterminal) {
1669                                 int j;
1670                                 char c = ':';
1671                                 for (j = 0; j < g->first[n].cnt; j++) {
1672                                         printf("%c ", c);
1673                                         c = ',';
1674                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1675                                 }
1676                         }
1677                         printf("\n");
1678                 }
1679         }
1680
1681 Then we have the follow sets if they were computed.
1682
1683         static void report_follow(struct grammar *g)
1684         {
1685                 int n;
1686                 printf("FOLLOW:\n");
1687                 for (n = 0; n < g->num_syms; n++)
1688                         if (g->follow[n].cnt) {
1689                                 int j;
1690                                 char c = ':';
1691                                 printf("  ");
1692                                 prtxt(g->symtab[n]->name);
1693                                 for (j = 0; j < g->follow[n].cnt; j++) {
1694                                         printf("%c ", c);
1695                                         c = ',';
1696                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1697                                 }
1698                                 printf("\n");
1699                         }
1700         }
1701
1702 And finally the item sets.  These include the GO TO tables and, for
1703 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1704 it up a bit.  First the items, with production number and associativity.
1705
1706         static void report_item(struct grammar *g, int itm)
1707         {
1708                 int p = item_prod(itm);
1709                 int dot = item_dot(itm);
1710                 struct production *pr = g->productions[p];
1711                 int i;
1712
1713                 printf("    ");
1714                 prtxt(pr->head->name);
1715                 printf(" ->");
1716                 for (i = 0; i < pr->body_size; i++) {
1717                         printf(" %s", (dot == i ? ". ": ""));
1718                         prtxt(pr->body[i]->name);
1719                 }
1720                 if (dot == pr->body_size)
1721                         printf(" .");
1722                 printf(" [%d]", p);
1723                 if (pr->precedence && dot == pr->body_size)
1724                         printf(" (%d%s)", pr->precedence,
1725                                assoc_names[pr->assoc]);
1726                 if (dot < pr->body_size &&
1727                     pr->body[dot]->precedence) {
1728                         struct symbol *s = pr->body[dot];
1729                         printf(" [%d%s]", s->precedence,
1730                                assoc_names[s->assoc]);
1731                 }
1732                 if (pr->line_like == 1)
1733                         printf(" $$NEWLINE");
1734                 else if (pr->line_like)
1735                         printf(" $$OUT");
1736                 printf("\n");
1737         }
1738
1739 The LA sets which are (possibly) reported with each item:
1740
1741         static void report_la(struct grammar *g, int lanum)
1742         {
1743                 struct symset la = set_find(g, lanum);
1744                 int i;
1745                 char c = ':';
1746
1747                 printf("        LOOK AHEAD(%d)", lanum);
1748                 for (i = 0; i < la.cnt; i++) {
1749                         printf("%c ", c);
1750                         c = ',';
1751                         prtxt(g->symtab[la.syms[i]]->name);
1752                 }
1753                 printf("\n");
1754         }
1755
1756 Then the go to sets:
1757
1758         static void report_goto(struct grammar *g, struct symset gt)
1759         {
1760                 int i;
1761                 printf("    GOTO:\n");
1762
1763                 for (i = 0; i < gt.cnt; i++) {
1764                         printf("      ");
1765                         prtxt(g->symtab[gt.syms[i]]->name);
1766                         printf(" -> %d\n", gt.data[i]);
1767                 }
1768         }
1769
1770 Now we can report all the item sets complete with items, LA sets, and GO TO.
1771
1772         static void report_itemsets(struct grammar *g)
1773         {
1774                 int s;
1775                 printf("ITEM SETS(%d)\n", g->states);
1776                 for (s = 0; s < g->states; s++) {
1777                         int j;
1778                         struct itemset *is = g->statetab[s];
1779                         printf("  Itemset %d:%s min prefix=%d",
1780                                s, is->starts_line?" (startsline)":"", is->min_prefix);
1781                         if (is->precedence)
1782                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1783                         printf("\n");
1784                         for (j = 0; j < is->items.cnt; j++) {
1785                                 report_item(g, is->items.syms[j]);
1786                                 if (is->items.data != NO_DATA)
1787                                         report_la(g, is->items.data[j]);
1788                         }
1789                         report_goto(g, is->go_to);
1790                 }
1791         }
1792
1793 ### Reporting conflicts
1794
1795 Conflict detection varies a lot among different analysis levels.  However
1796 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1797 LR1 are also similar as they have FOLLOW or LA sets.
1798
1799 ###### functions
1800
1801         ## conflict functions
1802
1803         static int report_conflicts(struct grammar *g, enum grammar_type type)
1804         {
1805                 int cnt = 0;
1806                 printf("Conflicts:\n");
1807                 if (type < SLR)
1808                         cnt = conflicts_lr0(g, type);
1809                 else
1810                         cnt = conflicts_slr(g, type);
1811                 if (cnt == 0)
1812                         printf(" - no conflicts\n");
1813                 return cnt;
1814         }
1815
1816 LR0 conflicts are any state which have both a reducible item and
1817 a shiftable item, or two reducible items.
1818
1819 LR05 conflicts only occur if two possible reductions exist,
1820 as shifts always over-ride reductions.
1821
1822 ###### conflict functions
1823         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1824         {
1825                 int i;
1826                 int cnt = 0;
1827                 for (i = 0; i < g->states; i++) {
1828                         struct itemset *is = g->statetab[i];
1829                         int last_reduce = -1;
1830                         int prev_reduce = -1;
1831                         int last_shift = -1;
1832                         int j;
1833                         if (!is)
1834                                 continue;
1835                         for (j = 0; j < is->items.cnt; j++) {
1836                                 int itm = is->items.syms[j];
1837                                 int p = item_prod(itm);
1838                                 int bp = item_dot(itm);
1839                                 struct production *pr = g->productions[p];
1840
1841                                 if (bp == pr->body_size) {
1842                                         prev_reduce = last_reduce;
1843                                         last_reduce = j;
1844                                         continue;
1845                                 }
1846                                 if (pr->body[bp]->type == Terminal)
1847                                         last_shift = j;
1848                         }
1849                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1850                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1851                                 report_item(g, is->items.syms[last_shift]);
1852                                 report_item(g, is->items.syms[last_reduce]);
1853                                 cnt++;
1854                         }
1855                         if (prev_reduce >= 0) {
1856                                 printf("  State %d has 2 (or more) reducible items\n", i);
1857                                 report_item(g, is->items.syms[prev_reduce]);
1858                                 report_item(g, is->items.syms[last_reduce]);
1859                                 cnt++;
1860                         }
1861                 }
1862                 return cnt;
1863         }
1864
1865 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1866 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1867 in the source of the look ahead set.
1868
1869 We build two datasets to reflect the "action" table: one which maps
1870 terminals to items where that terminal could be shifted and another
1871 which maps terminals to items that could be reduced when the terminal
1872 is in look-ahead.  We report when we get conflicts between the two.
1873
1874 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1875 terminal, we ignore it.  NEWLINES are handled specially with its own
1876 rules for when to shift and when to reduce.  Conflicts are expected,
1877 but handled internally.
1878
1879         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1880         {
1881                 int i;
1882                 int cnt = 0;
1883
1884                 for (i = 0; i < g->states; i++) {
1885                         struct itemset *is = g->statetab[i];
1886                         struct symset shifts = INIT_DATASET;
1887                         struct symset reduce = INIT_DATASET;
1888                         int j;
1889                         if (!is)
1890                                 continue;
1891                         /* First collect the shifts */
1892                         for (j = 0; j < is->items.cnt; j++) {
1893                                 unsigned short itm = is->items.syms[j];
1894                                 int p = item_prod(itm);
1895                                 int bp = item_dot(itm);
1896                                 struct production *pr = g->productions[p];
1897                                 struct symbol *s;
1898
1899                                 if (bp >= pr->body_size ||
1900                                     pr->body[bp]->type != Terminal)
1901                                         /* not shiftable */
1902                                         continue;
1903
1904                                 s = pr->body[bp];
1905                                 if (s->precedence && is->precedence)
1906                                         /* Precedence resolves this, so no conflict */
1907                                         continue;
1908
1909                                 if (symset_find(&shifts, s->num) < 0)
1910                                         symset_add(&shifts, s->num, itm);
1911                         }
1912                         /* Now look for reductions and conflicts */
1913                         for (j = 0; j < is->items.cnt; j++) {
1914                                 unsigned short itm = is->items.syms[j];
1915                                 int p = item_prod(itm);
1916                                 int bp = item_dot(itm);
1917                                 struct production *pr = g->productions[p];
1918
1919                                 if (bp < pr->body_size)
1920                                         continue;
1921                                 /* reducible */
1922                                 struct symset la;
1923                                 if (type == SLR)
1924                                         la = g->follow[pr->head->num];
1925                                 else
1926                                         la = set_find(g, is->items.data[j]);
1927                                 int k;
1928                                 for (k = 0; k < la.cnt; k++) {
1929                                         int pos = symset_find(&shifts, la.syms[k]);
1930                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1931                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1932                                                 cnt++;
1933                                                         prtxt(g->symtab[la.syms[k]]->name);
1934                                                 printf(":\n");
1935                                                 report_item(g, shifts.data[pos]);
1936                                                 report_item(g, itm);
1937                                         }
1938                                         pos = symset_find(&reduce, la.syms[k]);
1939                                         if (pos < 0) {
1940                                                 symset_add(&reduce, la.syms[k], itm);
1941                                                 continue;
1942                                         }
1943                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1944                                         prtxt(g->symtab[la.syms[k]]->name);
1945                                         printf(":\n");
1946                                         report_item(g, itm);
1947                                         report_item(g, reduce.data[pos]);
1948                                         cnt++;
1949                                 }
1950                         }
1951                         symset_free(shifts);
1952                         symset_free(reduce);
1953                 }
1954                 return cnt;
1955         }
1956
1957
1958 ## Generating the parser
1959
1960 The exported part of the parser is the `parse_XX` function, where the name
1961 `XX` is based on the name of the parser files.
1962
1963 This takes a `code_node`, a partially initialized `token_config`, and an
1964 optional `FILE` to send tracing to.  The `token_config` gets the list of
1965 known words added and then is used with the `code_node` to initialize the
1966 scanner.
1967
1968 `parse_XX` then calls the library function `parser_run` to actually complete
1969 the parse.  This needs the `states` table and functions to call the various
1970 pieces of code provided in the grammar file, so they are generated first.
1971
1972 ###### parser_generate
1973
1974         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1975                                struct code_node *pre_reduce)
1976         {
1977                 gen_known(f, g);
1978                 gen_non_term(f, g);
1979                 gen_goto(f, g);
1980                 gen_states(f, g);
1981                 gen_reduce(f, g, file, pre_reduce);
1982                 gen_free(f, g);
1983
1984                 fprintf(f, "#line 0 \"gen_parser\"\n");
1985                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1986                         name);
1987                 fprintf(f, "{\n");
1988                 fprintf(f, "\tstruct token_state *tokens;\n");
1989                 fprintf(f, "\tconfig->words_marks = known;\n");
1990                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1991                 fprintf(f, "\ttokens = token_open(code, config);\n");
1992                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1993                 fprintf(f, "\ttoken_close(tokens);\n");
1994                 fprintf(f, "\treturn rv;\n");
1995                 fprintf(f, "}\n\n");
1996         }
1997
1998 ### Known words table
1999
2000 The known words table is simply an array of terminal symbols.
2001 The table of nonterminals used for tracing is a similar array.
2002
2003 ###### functions
2004
2005         static void gen_known(FILE *f, struct grammar *g)
2006         {
2007                 int i;
2008                 fprintf(f, "#line 0 \"gen_known\"\n");
2009                 fprintf(f, "static const char *known[] = {\n");
2010                 for (i = TK_reserved;
2011                      i < g->num_syms && g->symtab[i]->type == Terminal;
2012                      i++)
2013                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2014                                 g->symtab[i]->name.txt);
2015                 fprintf(f, "};\n\n");
2016         }
2017
2018         static void gen_non_term(FILE *f, struct grammar *g)
2019         {
2020                 int i;
2021                 fprintf(f, "#line 0 \"gen_non_term\"\n");
2022                 fprintf(f, "static const char *non_term[] = {\n");
2023                 for (i = TK_reserved;
2024                      i < g->num_syms;
2025                      i++)
2026                         if (g->symtab[i]->type == Nonterminal)
2027                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
2028                                         g->symtab[i]->name.txt);
2029                 fprintf(f, "};\n\n");
2030         }
2031
2032 ### States and the goto tables.
2033
2034 For each state we record the goto table and details of the reducible
2035 production if there is one.
2036 Some of the details of the reducible production are stored in the
2037 `do_reduce` function to come later.  Here we store the production
2038 number, the body size (useful for stack management), and the resulting
2039 symbol (useful for knowing how to free data later).
2040
2041 The go to table is stored in a simple array of `sym` and corresponding
2042 `state`.
2043
2044 ###### exported types
2045
2046         struct lookup {
2047                 short sym;
2048                 short state;
2049         };
2050         struct state {
2051                 short go_to_cnt;
2052                 const struct lookup * go_to;
2053                 short reduce_prod;
2054                 short reduce_size;
2055                 short reduce_sym;
2056                 char starts_line;
2057                 char newline_only;
2058                 short min_prefix;
2059         };
2060
2061 ###### functions
2062
2063         static void gen_goto(FILE *f, struct grammar *g)
2064         {
2065                 int i;
2066                 fprintf(f, "#line 0 \"gen_goto\"\n");
2067                 for (i = 0; i < g->states; i++) {
2068                         struct symset gt = g->statetab[i]->go_to;
2069                         int j;
2070
2071                         if (gt.cnt == 0)
2072                                 continue;
2073                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2074                                 i);
2075                         for (j = 0; j < gt.cnt; j++)
2076                                 fprintf(f, "\t{ %d, %d },\n",
2077                                         gt.syms[j], gt.data[j]);
2078                         fprintf(f, "};\n");
2079                 }
2080         }
2081
2082         static void gen_states(FILE *f, struct grammar *g)
2083         {
2084                 int i;
2085                 fprintf(f, "#line 0 \"gen_states\"\n");
2086                 fprintf(f, "static const struct state states[] = {\n");
2087                 for (i = 0; i < g->states; i++) {
2088                         struct itemset *is = g->statetab[i];
2089                         int j, prod = -1, prod_len;
2090
2091                         for (j = 0; j < is->items.cnt; j++) {
2092                                 int itm = is->items.syms[j];
2093                                 int p = item_prod(itm);
2094                                 int bp = item_dot(itm);
2095                                 struct production *pr = g->productions[p];
2096
2097                                 if (bp < pr->body_size)
2098                                         continue;
2099                                 /* This is what we reduce - choose longest */
2100                                 if (prod < 0 || prod_len < pr->body_size) {
2101                                         prod = p;
2102                                         prod_len = pr->body_size;
2103                                 }
2104                         }
2105                         if (is->go_to.cnt)
2106                                 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2107                                         i, is->go_to.cnt, i);
2108                         else
2109                                 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2110                         if (prod >= 0) {
2111                                 struct production *pr = g->productions[prod];
2112                                 fprintf(f, "%d, %d, %d, %d, %d, %d },\n", prod,
2113                                         pr->body_size,
2114                                         pr->head->num,
2115                                         is->starts_line,
2116                                         pr->line_like,
2117                                         is->min_prefix);
2118                         } else
2119                                 fprintf(f, "-1, -1, -1, %d, 0, %d },\n",
2120                                         is->starts_line, is->min_prefix);
2121                 }
2122                 fprintf(f, "};\n\n");
2123         }
2124
2125 ### The `do_reduce` function and the code
2126
2127 When the parser engine decides to reduce a production, it calls
2128 `do_reduce` which runs the code that was included with the production,
2129 if any.
2130
2131 This code needs to be able to store data somewhere.  Rather than
2132 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2133 buffer and have `do_reduce` return the size to be saved.
2134
2135 In order for the code to access "global" context, we pass in the
2136 "config" pointer that was passed to the parser function.  If the `struct
2137 token_config` is embedded in some larger structure, the reducing code
2138 can access the larger structure using pointer manipulation.  Performing
2139 that pointer manipulation, and any other common code or variables for
2140 all reduce actions, can be provided in code node called "reduce" which
2141 is passed around in `pre_reduce` which you might have already noticed.
2142
2143 The code fragments require translation when written out.  Any `$N` needs
2144 to be converted to a reference either to that buffer (if `$0`) or to the
2145 structure returned by a previous reduction.  These pointers need to be
2146 cast to the appropriate type for each access.  All this is handled in
2147 `gen_code`.
2148
2149 `gen_code` also allows symbol references to contain a '`<`' as in
2150 '`$<2`'.  This is particularly useful for references (or pointers), but
2151 can be used with structures too.  The `<` implies that the value is
2152 being moved out, so the object will not be automatically freed.  It is
2153 equivalent to assigning `NULL` to the pointer or filling a structure
2154 with zeros.
2155
2156 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2157 and possibly a number.  A number by itself (other than zero) selects a
2158 symbol from the body of the production.  A sequence of letters selects
2159 the shortest symbol in the body which contains those letters in the
2160 given order.  If a number follows the letters, then a later occurrence
2161 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2162 attached to the second occurrence of the shortest symbol which contains
2163 an `A` followed by a `B`.  If there is no unique shortest system, or if
2164 the number given is too large, then the symbol reference is not
2165 transformed, and will cause an error when the code is compiled.
2166
2167 ###### functions
2168
2169         static int textchr(struct text t, char c, int s)
2170         {
2171                 int i;
2172                 for (i = s; i < t.len; i++)
2173                         if (t.txt[i] == c)
2174                                 return i;
2175                 return -1;
2176         }
2177
2178         static int subseq_match(char *seq, int slen, struct text name)
2179         {
2180                 int st = 0;
2181                 while (slen > 0) {
2182                         st = textchr(name, *seq, st);
2183                         if (st < 0)
2184                                 return 0;
2185                         slen -= 1;
2186                         seq += 1;
2187                         st += 1;
2188                 }
2189                 return 1;
2190         }
2191
2192         static int choose_sym(char **namep, int len, struct production *p)
2193         {
2194                 char *name = *namep;
2195                 char *nam = name;
2196                 int namlen;
2197                 int n = 0;
2198                 int i, s, slen;
2199                 char c;
2200
2201                 c = *name;
2202                 while (len > 0 &&
2203                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2204                         name += 1;
2205                         len -= 1;
2206                         c = *name;
2207                 }
2208                 namlen = name-nam;
2209                 while (len > 0 && (c >= '0' && c <= '9')) {
2210                         name += 1;
2211                         len -= 1;
2212                         n = n * 10 + (c - '0');
2213                         c = *name;
2214                 }
2215                 if (namlen == 0) {
2216                         if (name == *namep || n > p->body_size)
2217                                 return -1;
2218                         *namep = name;
2219                         return n;
2220                 }
2221                 slen = 0; s = -1;
2222                 for (i = 0; i < p->body_size; i++) {
2223                         if (!subseq_match(nam, namlen, p->body[i]->name))
2224                                 continue;
2225                         if (slen == 0 || p->body[i]->name.len < slen) {
2226                                 s = i;
2227                                 slen = p->body[i]->name.len;
2228                         }
2229                         if (s >= 0 && p->body[i] != p->body[s] &&
2230                             p->body[i]->name.len == p->body[s]->name.len)
2231                                 /* not unique, so s cannot be used */
2232                                 s = -1;
2233                 }
2234                 if (s < 0)
2235                         return -1;
2236                 if (n == 0)
2237                         n = 1;
2238                 for (i = 0; i < p->body_size; i++)
2239                         if (p->body[i] == p->body[s]) {
2240                                 n -= 1;
2241                                 if (n == 0)
2242                                         break;
2243                         }
2244                 if (n > 0 || i == p->body_size)
2245                         return -1;
2246                 *namep = name;
2247                 return i + 1;
2248         }
2249
2250         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2251         {
2252                 char *c;
2253                 char *used = calloc(1, p->body_size);
2254                 int i;
2255
2256                 fprintf(f, "\t\t\t");
2257                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2258                         int n;
2259                         int use = 0;
2260                         if (*c != '$') {
2261                                 fputc(*c, f);
2262                                 if (*c == '\n')
2263                                         fputs("\t\t\t", f);
2264                                 continue;
2265                         }
2266                         c++;
2267                         if (*c == '<') {
2268                                 use = 1;
2269                                 c++;
2270                         }
2271                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2272                         if (n < 0) {
2273                                 fputc('$', f);
2274                                 if (use)
2275                                         fputc('<', f);
2276                                 fputc(*c, f);
2277                                 continue;
2278                         }
2279                         if (n == 0)
2280                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2281                                         p->head->struct_name.len,
2282                                         p->head->struct_name.txt,
2283                                         p->head->isref ? "*":"");
2284                         else if (p->body[n-1]->type == Terminal)
2285                                 fprintf(f, "(*(struct token *)body[%d])",
2286                                         n-1);
2287                         else if (p->body[n-1]->struct_name.txt == NULL)
2288                                 fprintf(f, "$%d", n);
2289                         else {
2290                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2291                                         p->body[n-1]->struct_name.len,
2292                                         p->body[n-1]->struct_name.txt,
2293                                         p->body[n-1]->isref ? "*":"", n-1);
2294                                 used[n-1] = use;
2295                         }
2296                         c -= 1;
2297                 }
2298                 fputs("\n", f);
2299                 for (i = 0; i < p->body_size; i++) {
2300                         if (p->body[i]->struct_name.txt &&
2301                             used[i]) {
2302                                 // assume this has been copied out
2303                                 if (p->body[i]->isref)
2304                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2305                                 else
2306                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2307                         }
2308                 }
2309                 free(used);
2310         }
2311
2312 ###### functions
2313
2314         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2315                                struct code_node *pre_reduce)
2316         {
2317                 int i;
2318                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2319                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2320                 fprintf(f, "{\n");
2321                 fprintf(f, "\tint ret_size = 0;\n");
2322                 if (pre_reduce)
2323                         code_node_print(f, pre_reduce, file);
2324
2325                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2326                 fprintf(f, "\tswitch(prod) {\n");
2327                 for (i = 0; i < g->production_count; i++) {
2328                         struct production *p = g->productions[i];
2329                         fprintf(f, "\tcase %d:\n", i);
2330
2331                         if (p->code.txt) {
2332                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2333                                 gen_code(p, f, g);
2334                         }
2335
2336                         if (p->head->struct_name.txt)
2337                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2338                                         p->head->struct_name.len,
2339                                         p->head->struct_name.txt,
2340                                         p->head->isref ? "*":"");
2341
2342                         fprintf(f, "\t\tbreak;\n");
2343                 }
2344                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2345         }
2346
2347 ### `do_free`
2348
2349 As each non-terminal can potentially cause a different type of data
2350 structure to be allocated and filled in, we need to be able to free it when
2351 done.
2352
2353 It is particularly important to have fine control over freeing during error
2354 recovery where individual stack frames might need to be freed.
2355
2356 For this, the grammar author is required to defined a `free_XX` function for
2357 each structure that is used by a non-terminal.  `do_free` will call whichever
2358 is appropriate given a symbol number, and will call `free` (as is
2359 appropriate for tokens) on any terminal symbol.
2360
2361 ###### functions
2362
2363         static void gen_free(FILE *f, struct grammar *g)
2364         {
2365                 int i;
2366
2367                 fprintf(f, "#line 0 \"gen_free\"\n");
2368                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2369                 fprintf(f, "{\n");
2370                 fprintf(f, "\tif (!asn) return;\n");
2371                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2372                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2373                 fprintf(f, "\tswitch(sym) {\n");
2374
2375                 for (i = 0; i < g->num_syms; i++) {
2376                         struct symbol *s = g->symtab[i];
2377                         if (!s ||
2378                             s->type != Nonterminal ||
2379                             s->struct_name.txt == NULL)
2380                                 continue;
2381
2382                         fprintf(f, "\tcase %d:\n", s->num);
2383                         if (s->isref) {
2384                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2385                                         s->struct_name.len,
2386                                         s->struct_name.txt);
2387                                 fprintf(f, "\t\tfree(asn);\n");
2388                         } else
2389                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2390                                         s->struct_name.len,
2391                                         s->struct_name.txt);
2392                         fprintf(f, "\t\tbreak;\n");
2393                 }
2394                 fprintf(f, "\t}\n}\n\n");
2395         }
2396
2397 ## The main routine.
2398
2399 There are three key parts to the "main" function of parsergen: processing
2400 the arguments, loading the grammar file, and dealing with the grammar.
2401
2402 ### Argument processing.
2403
2404 Command line options allow the selection of analysis mode, name of output
2405 file, and whether or not a report should be generated.  By default we create
2406 a report only if no code output was requested.
2407
2408 The `parse_XX` function name uses the basename of the output file which
2409 should not have a suffix (`.c`).  `.c` is added to the given name for the
2410 code, and `.h` is added for the header (if header text is specifed in the
2411 grammar file).
2412
2413 ###### includes
2414         #include <getopt.h>
2415
2416 ###### declarations
2417         static const struct option long_options[] = {
2418                 { "LR0",        0, NULL, '0' },
2419                 { "LR05",       0, NULL, '5' },
2420                 { "SLR",        0, NULL, 'S' },
2421                 { "LALR",       0, NULL, 'L' },
2422                 { "LR1",        0, NULL, '1' },
2423                 { "tag",        1, NULL, 't' },
2424                 { "report",     0, NULL, 'R' },
2425                 { "output",     1, NULL, 'o' },
2426                 { NULL,         0, NULL, 0   }
2427         };
2428         const char *options = "05SL1t:Ro:";
2429
2430 ###### process arguments
2431         int opt;
2432         char *outfile = NULL;
2433         char *infile;
2434         char *name;
2435         char *tag = NULL;
2436         int report = 1;
2437         enum grammar_type type = LR05;
2438         while ((opt = getopt_long(argc, argv, options,
2439                                   long_options, NULL)) != -1) {
2440                 switch(opt) {
2441                 case '0':
2442                         type = LR0; break;
2443                 case '5':
2444                         type = LR05; break;
2445                 case 'S':
2446                         type = SLR; break;
2447                 case 'L':
2448                         type = LALR; break;
2449                 case '1':
2450                         type = LR1; break;
2451                 case 'R':
2452                         report = 2; break;
2453                 case 'o':
2454                         outfile = optarg; break;
2455                 case 't':
2456                         tag = optarg; break;
2457                 default:
2458                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2459                         exit(1);
2460                 }
2461         }
2462         if (optind < argc)
2463                 infile = argv[optind++];
2464         else {
2465                 fprintf(stderr, "No input file given\n");
2466                 exit(1);
2467         }
2468         if (outfile && report == 1)
2469                 report = 0;
2470         name = outfile;
2471         if (name && strchr(name, '/'))
2472                 name = strrchr(name, '/')+1;
2473
2474         if (optind < argc) {
2475                 fprintf(stderr, "Excess command line arguments\n");
2476                 exit(1);
2477         }
2478
2479 ### Loading the grammar file
2480
2481 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2482 map it.
2483
2484 Once we have extracted the code (with `mdcode`) we expect to find three
2485 or four sections: header, code, grammar, reduce.
2486 Anything else that is not excluded by the `--tag` option is an error.
2487
2488 "header", "code", and "reduce" are optional, though it is hard to build
2489 a working parser without the first two.  "grammar" must be provided.
2490
2491 ###### includes
2492         #include <fcntl.h>
2493         #include <sys/mman.h>
2494         #include <errno.h>
2495
2496 ###### functions
2497         static int errs;
2498         static void pr_err(char *msg)
2499         {
2500                 errs++;
2501                 fprintf(stderr, "%s\n", msg);
2502         }
2503
2504 ###### load file
2505         struct section *table;
2506         int fd;
2507         int len;
2508         char *file;
2509         fd = open(infile, O_RDONLY);
2510         if (fd < 0) {
2511                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2512                         infile, strerror(errno));
2513                 exit(1);
2514         }
2515         len = lseek(fd, 0, 2);
2516         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2517         table = code_extract(file, file+len, pr_err);
2518
2519         struct code_node *hdr = NULL;
2520         struct code_node *code = NULL;
2521         struct code_node *gram = NULL;
2522         struct code_node *pre_reduce = NULL;
2523         for (s = table; s; s = s->next) {
2524                 struct text sec = s->section;
2525                 if (tag && !strip_tag(&sec, tag))
2526                         continue;
2527                 if (text_is(sec, "header"))
2528                         hdr = s->code;
2529                 else if (text_is(sec, "code"))
2530                         code = s->code;
2531                 else if (text_is(sec, "grammar"))
2532                         gram = s->code;
2533                 else if (text_is(sec, "reduce"))
2534                         pre_reduce = s->code;
2535                 else {
2536                         fprintf(stderr, "Unknown content section: %.*s\n",
2537                                 s->section.len, s->section.txt);
2538                         rv |= 2;
2539                 }
2540         }
2541
2542 ### Processing the input
2543
2544 As we need to append an extention to a filename and then open it for
2545 writing, and we need to do this twice, it helps to have a separate function.
2546
2547 ###### functions
2548
2549         static FILE *open_ext(char *base, char *ext)
2550         {
2551                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2552                 FILE *f;
2553                 strcat(strcpy(fn, base), ext);
2554                 f = fopen(fn, "w");
2555                 free(fn);
2556                 return f;
2557         }
2558
2559 If we can read the grammar, then we analyse and optionally report on it.  If
2560 the report finds conflicts we will exit with an error status.
2561
2562 ###### process input
2563         struct grammar *g = NULL;
2564         if (gram == NULL) {
2565                 fprintf(stderr, "No grammar section provided\n");
2566                 rv |= 2;
2567         } else {
2568                 g = grammar_read(gram);
2569                 if (!g) {
2570                         fprintf(stderr, "Failure to parse grammar\n");
2571                         rv |= 2;
2572                 }
2573         }
2574         if (g) {
2575                 grammar_analyse(g, type);
2576                 if (report)
2577                         if (grammar_report(g, type))
2578                                 rv |= 1;
2579         }
2580
2581 If a "headers" section is defined, we write it out and include a declaration
2582 for the `parse_XX` function so it can be used from separate code.
2583
2584         if (rv == 0 && hdr && outfile) {
2585                 FILE *f = open_ext(outfile, ".h");
2586                 if (f) {
2587                         code_node_print(f, hdr, infile);
2588                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2589                                 name);
2590                         fclose(f);
2591                 } else {
2592                         fprintf(stderr, "Cannot create %s.h\n",
2593                                 outfile);
2594                         rv |= 4;
2595                 }
2596         }
2597
2598 And if all goes well and an output file was provided, we create the `.c`
2599 file with the code section (if any) and the parser tables and function.
2600
2601         if (rv == 0 && outfile) {
2602                 FILE *f = open_ext(outfile, ".c");
2603                 if (f) {
2604                         if (code)
2605                                 code_node_print(f, code, infile);
2606                         gen_parser(f, g, infile, name, pre_reduce);
2607                         fclose(f);
2608                 } else {
2609                         fprintf(stderr, "Cannot create %s.c\n",
2610                                 outfile);
2611                         rv |= 4;
2612                 }
2613         }
2614
2615 And that about wraps it up.  We need to set the locale so that UTF-8 is
2616 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2617
2618 ###### File: parsergen.mk
2619         parsergen : parsergen.o libscanner.o libmdcode.o
2620                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2621
2622 ###### includes
2623         #include <locale.h>
2624
2625 ###### main
2626
2627         int main(int argc, char *argv[])
2628         {
2629                 struct section *s;
2630                 int rv = 0;
2631
2632                 setlocale(LC_ALL,"");
2633
2634                 ## process arguments
2635                 ## load file
2636                 ## process input
2637
2638                 return rv;
2639         }
2640
2641 ## The SHIFT/REDUCE parser
2642
2643 Having analysed the grammar and generated all the tables, we only need
2644 the shift/reduce engine to bring it all together.
2645
2646 ### Goto table lookup
2647
2648 The parser generator has nicely provided us with goto tables sorted by
2649 symbol number.  We need a binary search function to find a symbol in the
2650 table.
2651
2652 ###### parser functions
2653
2654         static int search(const struct state *l, int sym)
2655         {
2656                 int lo = 0;
2657                 int hi = l->go_to_cnt;
2658
2659                 if (hi == 0)
2660                         return -1;
2661                 while (lo + 1 < hi) {
2662                         int mid = (lo + hi) / 2;
2663                         if (l->go_to[mid].sym <= sym)
2664                                 lo = mid;
2665                         else
2666                                 hi = mid;
2667                 }
2668                 if (l->go_to[lo].sym == sym)
2669                         return l->go_to[lo].state;
2670                 else
2671                         return -1;
2672         }
2673
2674 ### Memory allocation
2675
2676 The `scanner` returns tokens in a local variable - we want them in allocated
2677 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2678 a reduce is in a large buffer.  Both of these require some allocation and
2679 copying, hence `memdup` and `tok_copy`.
2680
2681 ###### parser includes
2682         #include <memory.h>
2683
2684 ###### parser functions
2685
2686         void *memdup(void *m, int len)
2687         {
2688                 void *ret;
2689                 if (len == 0)
2690                         return NULL;
2691                 ret = malloc(len);
2692                 memcpy(ret, m, len);
2693                 return ret;
2694         }
2695
2696         static struct token *tok_copy(struct token tk)
2697         {
2698                 struct token *new = malloc(sizeof(*new));
2699                 *new = tk;
2700                 return new;
2701         }
2702
2703 ### The state stack.
2704
2705 The core data structure for the parser is the stack.  This tracks all
2706 the symbols that have been recognised or partially recognised.
2707
2708 The stack usually won't grow very large - maybe a few tens of entries.
2709 So we dynamically grow an array as required but never bother to
2710 shrink it down again.
2711
2712 We keep the stack as two separate allocations.  One, `asn_stack` stores
2713 the "abstract syntax nodes" which are created by each reduction.  When
2714 we call `do_reduce` we need to pass an array of the `asn`s of the body
2715 of the production, and by keeping a separate `asn` stack, we can just
2716 pass a pointer into this stack.
2717
2718 The other allocation stores all other stack fields of which there are
2719 several.  The `state` is the most important one and guides the parsing
2720 process.  The `sym` is nearly unnecessary as it is implicit in the
2721 state.  However when we want to free entries from the `asn_stack`, it
2722 helps to know what type they are so we can call the right freeing
2723 function.  The symbol leads us to the right free function through
2724 `do_free`.
2725
2726 The `indents` count tracks the line indents with-in the symbol or
2727 immediately follow it.  These are used to allow indent information to
2728 guide parsing and error recovery.
2729
2730 `since_newline` tracks how many stack frames since the last
2731 start-of-line (whether indented or not).  So if `since_newline` is
2732 zero, then this symbol is at the start of a line.  Similarly
2733 `since_indent` counts the number of states since an indent, it is zero
2734 precisely when `indents` is not zero.
2735
2736 `newline_permitted` keeps track of whether newlines should be ignored
2737 or not.
2738
2739 The stack is most properly seen as alternating states and symbols -
2740 states, like the 'DOT' in items, are between symbols.  Each frame in
2741 our stack holds a state and the symbol that was before it.  The
2742 bottom of stack holds the start state but no symbol, as nothing came
2743 before the beginning.  As we need to store some value, `TK_eof` is used
2744 to mark the beginning of the file as well as the end.
2745
2746 ###### parser functions
2747
2748         struct parser {
2749                 struct frame {
2750                         short state;
2751                         short newline_permitted;
2752
2753                         short sym;
2754                         short indents;
2755                         short since_newline;
2756                         short since_indent;
2757                 } *stack;
2758                 void **asn_stack;
2759                 int stack_size;
2760                 int tos;
2761         };
2762
2763 #### Shift and pop
2764
2765 Two operations are needed on the stack - shift (which is like push) and pop.
2766
2767 Shift applies not only to terminals but also to non-terminals.  When we
2768 reduce a production we will pop off frames corresponding to the body
2769 symbols, then push on a frame for the head of the production.  This last
2770 is exactly the same process as shifting in a terminal so we use the same
2771 function for both.  In both cases we provide the symbol, the number of
2772 indents the symbol contains (which will be zero for a terminal symbol)
2773 and a flag indicating the the symbol was at (or was reduced from a
2774 symbol which was at) the start of a line.  The state is deduced from the
2775 current top-of-stack state and the new symbol.
2776
2777 To simplify other code we arrange for `shift` to fail if there is no `goto`
2778 state for the symbol.  This is useful in basic parsing due to our design
2779 that we shift when we can, and reduce when we cannot.  So the `shift`
2780 function reports if it could.
2781
2782 `shift` is also used to push state zero onto the stack, so if the
2783 stack is empty, it always chooses zero as the next state.
2784
2785 So `shift` finds the next state.  If that succeeds it extends the
2786 allocations if needed and pushes all the information onto the stacks.
2787
2788 Newlines are permitted after a `starts_line` state until an internal
2789 indent.  If the new frame has neither a `starts_line` state nor an
2790 indent, newlines are permitted if the previous stack frame permitted
2791 them.
2792
2793 ###### parser functions
2794
2795         static int shift(struct parser *p,
2796                          short sym, short indents, short start_of_line,
2797                          void *asn,
2798                          const struct state states[])
2799         {
2800                 // Push an entry onto the stack
2801                 struct frame next = {0};
2802                 int newstate = p->tos
2803                         ? search(&states[p->stack[p->tos-1].state],
2804                                  sym)
2805                         : 0;
2806                 if (newstate < 0)
2807                         return 0;
2808                 if (p->tos >= p->stack_size) {
2809                         p->stack_size += 10;
2810                         p->stack = realloc(p->stack, p->stack_size
2811                                            * sizeof(p->stack[0]));
2812                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2813                                            * sizeof(p->asn_stack[0]));
2814                 }
2815                 next.sym = sym;
2816                 next.indents = indents;
2817                 next.state = newstate;
2818                 if (states[newstate].starts_line)
2819                         next.newline_permitted = 1;
2820                 else if (indents)
2821                         next.newline_permitted = 0;
2822                 else if (p->tos)
2823                         next.newline_permitted =
2824                                 p->stack[p->tos-1].newline_permitted;
2825                 else
2826                         next.newline_permitted = 0;
2827
2828                 if (!start_of_line) {
2829                         if (p->tos)
2830                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2831                         else
2832                                 next.since_newline = 1;
2833                 }
2834                 if (indents)
2835                         next.since_indent = 0;
2836                 else if (p->tos)
2837                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2838                 else
2839                         next.since_indent = 1;
2840
2841                 p->stack[p->tos] = next;
2842                 p->asn_stack[p->tos] = asn;
2843                 p->tos++;
2844                 return 1;
2845         }
2846
2847 `pop` primarily moves the top of stack (`tos`) back down the required
2848 amount and frees any `asn` entries that need to be freed.  It also
2849 collects a summary of the indents and line starts in the symbols that
2850 are being removed. It is called _after_ we reduce a production, just
2851 before we `shift` the nonterminal in.
2852
2853 ###### parser functions
2854
2855         static int pop(struct parser *p, int num,
2856                        short *start_of_line,
2857                        void(*do_free)(short sym, void *asn))
2858         {
2859                 int i;
2860                 short indents = 0;
2861                 int sol = 0;
2862
2863                 p->tos -= num;
2864                 for (i = 0; i < num; i++) {
2865                         sol |= !p->stack[p->tos+i].since_newline;
2866                         indents += p->stack[p->tos+i].indents;
2867                         do_free(p->stack[p->tos+i].sym,
2868                                 p->asn_stack[p->tos+i]);
2869                 }
2870                 if (start_of_line)
2871                         *start_of_line = sol;
2872                 return indents;
2873         }
2874
2875 ### The heart of the parser.
2876
2877 Now we have the parser.  For each token we might shift it, trigger a
2878 reduction, or start error handling.  2D tokens (IN, OUT, EOL) also need
2879 to be handled.
2880
2881 We return whatever `asn` was returned by reducing production zero.
2882
2883 If we can neither shift nor reduce we have an error to handle.  We pop
2884 single entries off the stack until we can shift the `TK_error` symbol,
2885 then drop input tokens until we find one we can shift into the new error
2886 state.  We need to ensure that something is dropped or shifted after an
2887 error, or we could get into an infinite loop, only shifting in
2888 `TK_error`, then reducing.  So we track if there has been a shift since
2889 the last error, and if not the next error always discards one token.
2890
2891 When we find `TK_in` and `TK_out` tokens which report indents we need
2892 to handle them directly as the grammar cannot express what we want to
2893 do with them.
2894
2895 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2896 record how many indents there are following the previous token.
2897
2898 `TK_out` tokens must be canceled against an indent count
2899 within the stack.  If we can reduce some symbols that are all since
2900 the most recent indent, then we do that first.  If the minimum prefix
2901 of the current state then extends back before the most recent indent,
2902 that indent can be cancelled.  If the minimum prefix is shorter then
2903 the indent had ended prematurely and we must start error handling, which
2904 is still a work-in-progress.
2905
2906 `TK_newline` tokens are ignored unless the top stack frame records
2907 that they are permitted.  In that case they will not be considered for
2908 shifting if it is possible to reduce some symbols that are all since
2909 the most recent start of line.  This is how a newline forcibly
2910 terminates any line-like structure - we try to reduce down to at most
2911 one symbol for each line where newlines are allowed.
2912 A consequence of this is that a rule like
2913
2914 ###### Example: newlines - broken
2915
2916         Newlines ->
2917                 | NEWLINE Newlines
2918         IfStatement -> Newlines if ....
2919
2920 cannot work, as the NEWLINE will never be shifted as the empty string
2921 will be reduced first.  Optional sets of newlines need to be include
2922 in the thing that preceed:
2923
2924 ###### Example: newlines - works
2925
2926         If -> if
2927                 | NEWLINE If
2928         IfStatement -> If ....
2929
2930 Here the NEWLINE will be shifted because nothing can be reduced until
2931 the `if` is seen.
2932
2933 When during error handling we discard tokens read in, we want to keep
2934 discarding until we see one that is recognised.  If we had a full set
2935 of LR(1) grammar states, this would mean looking in the look-ahead set,
2936 but we don't keep a full look-ahead set.  We only record the subset
2937 that leads to SHIFT.  We can, however, deduce the look-ahead set by
2938 looking at the SHIFT subsets for all states that we can get to by
2939 reducing zero or more times.  So we need a little function which
2940 checks if a given token is in any of these look-ahead sets.
2941
2942 ###### parser includes
2943         #include "parser.h"
2944
2945 ###### parser_run
2946
2947         static int in_lookahead(struct token *tk, const struct state *states, int state)
2948         {
2949                 while (state >= 0) {
2950                         if (search(&states[state], tk->num) >= 0)
2951                                 return 1;
2952                         if (states[state].reduce_prod < 0)
2953                                 return 0;
2954                         state = search(&states[state], states[state].reduce_sym);
2955                 }
2956                 return 0;
2957         }
2958
2959         void *parser_run(struct token_state *tokens,
2960                          const struct state states[],
2961                          int (*do_reduce)(int, void**, struct token_config*, void*),
2962                          void (*do_free)(short, void*),
2963                          FILE *trace, const char *non_term[],
2964                          struct token_config *config)
2965         {
2966                 struct parser p = { 0 };
2967                 struct token *tk = NULL;
2968                 int accepted = 0;
2969                 int shift_since_err = 1;
2970                 void *ret = NULL;
2971
2972                 shift(&p, TK_eof, 0, 1, NULL, states);
2973                 while (!accepted && p.tos > 0) {
2974                         struct token *err_tk;
2975                         struct frame *tos = &p.stack[p.tos-1];
2976                         if (!tk)
2977                                 tk = tok_copy(token_next(tokens));
2978                         parser_trace(trace, &p,
2979                                      tk, states, non_term, config->known_count);
2980
2981                         if (tk->num == TK_in) {
2982                                 tos->indents += 1;
2983                                 tos->since_newline = 0;
2984                                 tos->since_indent = 0;
2985                                 if (!states[tos->state].starts_line)
2986                                         tos->newline_permitted = 0;
2987                                 free(tk);
2988                                 tk = NULL;
2989                                 parser_trace_action(trace, "Record");
2990                                 continue;
2991                         }
2992                         if (tk->num == TK_out) {
2993                                 if (states[tos->state].reduce_size >= 0 &&
2994                                     states[tos->state].reduce_size <= tos->since_indent)
2995                                         goto force_reduce;
2996                                 if (states[tos->state].min_prefix >= tos->since_indent) {
2997                                         // OK to cancel
2998                                         struct frame *in = tos - tos->since_indent;
2999                                         in->indents -= 1;
3000                                         if (in->indents == 0) {
3001                                                 /* Reassess since_indent and newline_permitted */
3002                                                 if (in > p.stack) {
3003                                                         in->since_indent = in[-1].since_indent + 1;
3004                                                         in->newline_permitted = in[-1].newline_permitted;
3005                                                 } else {
3006                                                         in->since_indent = 0;
3007                                                         in->newline_permitted = 0;
3008                                                 }
3009                                                 if (states[in->state].starts_line)
3010                                                         in->newline_permitted = 1;
3011                                                 while (in < tos) {
3012                                                         in += 1;
3013                                                         in->since_indent = in[-1].since_indent + 1;
3014                                                         if (states[in->state].starts_line)
3015                                                                 in->newline_permitted = 1;
3016                                                         else
3017                                                                 in->newline_permitted = in[-1].newline_permitted;
3018                                                 }
3019                                         }
3020                                         free(tk);
3021                                         tk = NULL;
3022                                         parser_trace_action(trace, "Cancel");
3023                                         continue;
3024                                 }
3025                                 // fall through to error handling as both SHIFT and REDUCE
3026                                 // will fail.
3027                         }
3028                         if (tk->num == TK_newline) {
3029                                 if (!tos->newline_permitted) {
3030                                         free(tk);
3031                                         tk = NULL;
3032                                         parser_trace_action(trace, "Discard");
3033                                         continue;
3034                                 }
3035                                 if (tos->since_newline > 1 &&
3036                                     states[tos->state].reduce_size >= 0 &&
3037                                     states[tos->state].reduce_size <= tos->since_newline)
3038                                         goto force_reduce;
3039                         }
3040                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3041                                 shift_since_err = 1;
3042                                 tk = NULL;
3043                                 parser_trace_action(trace, "Shift");
3044                                 continue;
3045                         }
3046                 force_reduce:
3047                         if (states[tos->state].reduce_prod >= 0 &&
3048                             states[tos->state].newline_only &&
3049                             !(tk->num == TK_newline ||
3050                               tk->num == TK_eof ||
3051                               tk->num == TK_out ||
3052                               (tos->indents == 0 && tos->since_newline == 0))) {
3053                                 /* Anything other than newline or out or eof
3054                                  * in an error unless we are already at start
3055                                  * of line, as this production must end at EOL.
3056                                  */
3057                         } else if (states[tos->state].reduce_prod >= 0) {
3058                                 void **body;
3059                                 void *res;
3060                                 const struct state *nextstate = &states[tos->state];
3061                                 int prod = nextstate->reduce_prod;
3062                                 int size = nextstate->reduce_size;
3063                                 int bufsize;
3064                                 static char buf[16*1024];
3065                                 short indents, start_of_line;
3066
3067                                 body = p.asn_stack + (p.tos - size);
3068
3069                                 bufsize = do_reduce(prod, body, config, buf);
3070
3071                                 indents = pop(&p, size, &start_of_line,
3072                                               do_free);
3073                                 res = memdup(buf, bufsize);
3074                                 memset(buf, 0, bufsize);
3075                                 if (!shift(&p, nextstate->reduce_sym,
3076                                            indents, start_of_line,
3077                                            res, states)) {
3078                                         if (prod != 0) abort();
3079                                         accepted = 1;
3080                                         ret = res;
3081                                 }
3082                                 parser_trace_action(trace, "Reduce");
3083                                 continue;
3084                         }
3085                         /* Error. We walk up the stack until we
3086                          * find a state which will accept TK_error.
3087                          * We then shift in TK_error and see what state
3088                          * that takes us too.
3089                          * Then we discard input tokens until
3090                          * we find one that is acceptable.
3091                          */
3092                         parser_trace_action(trace, "ERROR");
3093                         short indents = 0, start_of_line;
3094
3095                         err_tk = tok_copy(*tk);
3096                         while (p.tos > 0 &&
3097                                shift(&p, TK_error, 0, 0,
3098                                      err_tk, states) == 0)
3099                                 // discard this state
3100                                 indents += pop(&p, 1, &start_of_line, do_free);
3101                         if (p.tos == 0) {
3102                                 free(err_tk);
3103                                 // no state accepted TK_error
3104                                 break;
3105                         }
3106                         if (!shift_since_err) {
3107                                 /* must discard at least one token to avoid
3108                                  * infinite loop.
3109                                  */
3110                                 if (tk->num == TK_eof)
3111                                         break;
3112                                 free(tk);
3113                                 tk = tok_copy(token_next(tokens));
3114                         }
3115                         shift_since_err = 0;
3116                         tos = &p.stack[p.tos-1];
3117                         while (!in_lookahead(tk, states, tos->state) &&
3118                                tk->num != TK_eof) {
3119                                 free(tk);
3120                                 tk = tok_copy(token_next(tokens));
3121                                 shift_since_err = 1;
3122                                 if (tk->num == TK_in)
3123                                         indents += 1;
3124                                 if (tk->num == TK_out) {
3125                                         if (indents == 0)
3126                                                 break;
3127                                         indents -= 1;
3128                                         // FIXME update since_indent here
3129                                 }
3130                         }
3131                         tos->indents += indents;
3132                 }
3133                 free(tk);
3134                 pop(&p, p.tos, NULL, do_free);
3135                 free(p.asn_stack);
3136                 free(p.stack);
3137                 return ret;
3138         }
3139
3140 ###### exported functions
3141         void *parser_run(struct token_state *tokens,
3142                          const struct state states[],
3143                          int (*do_reduce)(int, void**, struct token_config*, void*),
3144                          void (*do_free)(short, void*),
3145                          FILE *trace, const char *non_term[],
3146                          struct token_config *config);
3147
3148 ### Tracing
3149
3150 Being able to visualize the parser in action can be invaluable when
3151 debugging the parser code, or trying to understand how the parse of a
3152 particular grammar progresses.  The stack contains all the important
3153 state, so just printing out the stack every time around the parse loop
3154 can make it possible to see exactly what is happening.
3155
3156 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3157 are easily deduced from the change between consecutive lines, and the
3158 details of each state can be found by cross referencing the states list
3159 in the stack with the "report" that parsergen can generate.
3160
3161 For terminal symbols, we just dump the token.  For non-terminals we
3162 print the name of the symbol.  The look ahead token is reported at the
3163 end inside square brackets.
3164
3165 ###### parser functions
3166
3167         static char *reserved_words[] = {
3168                 [TK_error]        = "ERROR",
3169                 [TK_in]           = "IN",
3170                 [TK_out]          = "OUT",
3171                 [TK_newline]      = "NEWLINE",
3172                 [TK_eof]          = "$eof",
3173         };
3174         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3175         {
3176                 fprintf(trace, "(%d", f->state);
3177                 if (states[f->state].starts_line)
3178                         fprintf(trace, "s");
3179                 if (f->newline_permitted)
3180                         fprintf(trace, "n%d", f->since_newline);
3181                 fprintf(trace, ") ");
3182         }
3183
3184         void parser_trace(FILE *trace, struct parser *p,
3185                           struct token *tk, const struct state states[],
3186                           const char *non_term[], int knowns)
3187         {
3188                 int i;
3189                 if (!trace)
3190                         return;
3191                 for (i = 0; i < p->tos; i++) {
3192                         struct frame *f = &p->stack[i];
3193                         if (i) {
3194                                 int sym = f->sym;
3195                                 if (sym < TK_reserved &&
3196                                     reserved_words[sym] != NULL)
3197                                         fputs(reserved_words[sym], trace);
3198                                 else if (sym < TK_reserved + knowns) {
3199                                         struct token *t = p->asn_stack[i];
3200                                         text_dump(trace, t->txt, 20);
3201                                 } else
3202                                         fputs(non_term[sym - TK_reserved - knowns],
3203                                               trace);
3204                                 if (f->indents)
3205                                         fprintf(trace, ".%d", f->indents);
3206                                 if (f->since_newline == 0)
3207                                         fputs("/", trace);
3208                                 fputs(" ", trace);
3209                         }
3210                         parser_trace_state(trace, f, states);
3211                 }
3212                 fprintf(trace, "[");
3213                 if (tk->num < TK_reserved &&
3214                     reserved_words[tk->num] != NULL)
3215                         fputs(reserved_words[tk->num], trace);
3216                 else
3217                         text_dump(trace, tk->txt, 20);
3218                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3219         }
3220
3221         void parser_trace_action(FILE *trace, char *action)
3222         {
3223                 if (trace)
3224                         fprintf(trace, " - %s\n", action);
3225         }
3226
3227 # A Worked Example
3228
3229 The obvious example for a parser is a calculator.
3230
3231 As `scanner` provides number parsing function using `libgmp` it is not much
3232 work to perform arbitrary rational number calculations.
3233
3234 This calculator takes one expression, or an equality test, per line.
3235 The results are printed and if any equality test fails, the program
3236 exits with an error.
3237
3238 ###### File: parsergen.mk
3239         calc.c calc.h : parsergen parsergen.mdc
3240                 ./parsergen --tag calc -o calc parsergen.mdc
3241         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3242                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3243         calctest : calc
3244                 ./calc parsergen.mdc
3245         demos :: calctest
3246
3247 # calc: header
3248
3249         #include "parse_number.h"
3250         // what do we use for a demo-grammar?  A calculator of course.
3251         struct number {
3252                 mpq_t val;
3253                 char tail[2];
3254                 int err;
3255         };
3256
3257 # calc: code
3258
3259         #include <stdlib.h>
3260         #include <unistd.h>
3261         #include <fcntl.h>
3262         #include <sys/mman.h>
3263         #include <stdio.h>
3264         #include <malloc.h>
3265         #include <gmp.h>
3266         #include <string.h>
3267         #include "mdcode.h"
3268         #include "scanner.h"
3269         #include "parser.h"
3270
3271         #include "calc.h"
3272
3273         static void free_number(struct number *n)
3274         {
3275                 mpq_clear(n->val);
3276                 free(n);
3277         }
3278
3279         static int text_is(struct text t, char *s)
3280         {
3281                 return (strlen(s) == t.len &&
3282                         strncmp(s, t.txt, t.len) == 0);
3283         }
3284
3285         int main(int argc, char *argv[])
3286         {
3287                 int fd = open(argv[1], O_RDONLY);
3288                 int len = lseek(fd, 0, 2);
3289                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3290                 struct section *table = code_extract(file, file+len, NULL);
3291                 struct section *s;
3292                 struct token_config config = {
3293                         .ignored = (1 << TK_line_comment)
3294                                  | (1 << TK_in)
3295                                  | (1 << TK_out),
3296                         .number_chars = ".,_+-",
3297                         .word_start = "",
3298                         .word_cont = "",
3299                 };
3300                 for (s = table; s; s = s->next)
3301                         if (text_is(s->section, "example: input"))
3302                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3303                 while (table) {
3304                         struct section *t = table->next;
3305                         code_free(table->code);
3306                         free(table);
3307                         table = t;
3308                 }
3309                 exit(0);
3310         }
3311
3312 # calc: grammar
3313
3314         $LEFT + -
3315         $LEFT * / //
3316
3317         Session -> Session Line
3318                 | Line
3319
3320         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3321                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3322                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3323                                         mpf_clear(fl);
3324                                         }
3325                                      }$
3326                 | Expression = Expression NEWLINE ${
3327                         if (mpq_equal($1.val, $3.val))
3328                                 gmp_printf("Both equal %Qd\n", $1.val);
3329                         else {
3330                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3331                                         $1.val, $3.val);
3332                                 exit(1);
3333                         }
3334                 }$
3335                 | NEWLINE ${ printf("Blank line\n"); }$
3336                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3337
3338         $number
3339         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3340                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3341                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3342                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3343                 | Expression // Expression ${ {
3344                         mpz_t z0, z1, z2;
3345                         mpq_init($0.val);
3346                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3347                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3348                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3349                         mpz_tdiv_q(z0, z1, z2);
3350                         mpq_set_z($0.val, z0);
3351                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3352                 } }$
3353                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3354                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3355
3356 # example: input
3357
3358         355/113
3359         3.1415926535 - 355/113
3360         2 + 4 * 5
3361         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3362         10 * 9 / 2
3363         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3364
3365         355//113
3366
3367         error