ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 There are several distinct sections.
   8
   9 1. `grammar_read` will read a grammar from a literate-code file and
  10    store the productions, symbols, and code fragments.
  11 2. `grammar_analyse` will take that grammar and build LR parsing
  12    states and look-ahead tables.
  13 3. `grammar_report` will present the details of the analysis
  14    in a readable format and will report any conflicts.
  15 4. `parser_generate` will write out C code files with various
  16    tables and with the code fragments arranged in useful places.
  17 5. `parser_run` is a library function intended to be linked together
  18    with the generated parser tables to complete the implementation of
  19    a parser.
  20 6. Finally `calc` is a test grammar for a simple calculator.  The
  21    `parsergen` program built from the C code in this file can extract
  22    that grammar directly from this file and process it.
  23
  24 ###### File: parsergen.c
  25         #include <unistd.h>
  26         #include <stdlib.h>
  27         #include <stdio.h>
  28         ## includes
  29         ## forward declarations
  30         ## declarations
  31         ## functions
  32         ## grammar_read
  33         ## grammar_analyse
  34         ## grammar_report
  35         ## parser_generate
  36         ## main
  37 ###### File: parser.h
  38         ## exported types
  39         ## exported functions
  40 ###### File: libparser.c
  41         #include <unistd.h>
  42         #include <stdlib.h>
  43         #include <stdio.h>
  44         ## parser includes
  45         ## parser functions
  46         ## parser_run
  47 ###### File: parsergen.mk
  48         CFLAGS += -Wall -g
  49         all :: parsergen calc
  50         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  51                 ./md2c parsergen.mdc
  52
  53 ## Reading the grammar
  54
  55 The grammar must be stored in a markdown literate code file as parsed
  56 by "[mdcode][]".  It must have three top level (i.e. unreferenced)
  57 sections: `header`, `code`, and `grammar`.  The first two will be
  58 literally copied into the generated `.c` and `.h`. files.  The last
  59 contains the grammar.  This is tokenised with "[scanner][]".
  60
  61 If the `--tag` option is given, then any top level header that doesn't
  62 start with the tag is ignored, and the tag is striped from the rest.  So
  63 `--tag Foo`
  64 means that the three needed sections must be `Foo: header`, `Foo: code`,
  65 and `Foo: grammar`.  The tag `calc` is used to extract the same calculator
  66 from this file.
  67
  68 [mdcode]: mdcode.html
  69 [scanner]: scanner.html
  70
  71 ###### includes
  72         #include "mdcode.h"
  73         #include "scanner.h"
  74
  75 ###### parser includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 The grammar contains production sets, precedence/associativity
  80 declarations, and data type declarations.  These are all parsed with
  81 _ad hoc_ parsing as we don't have a parser generator yet.
  82
  83 The precedence and associativity can be set for each production, but
  84 can be inherited from symbols.  The data type (either structure or a
  85 reference to a structure) is potentially defined for each non-terminal
  86 and describes what C structure is used to store information about each
  87 symbol.
  88
  89 ###### declarations
  90         enum assoc {Left, Right, Non};
  91         char *assoc_names[] = {"Left","Right","Non"};
  92
  93         struct symbol {
  94                 struct text struct_name;
  95                 int isref;
  96                 enum assoc assoc;
  97                 unsigned short precedence;
  98                 ## symbol fields
  99         };
 100         struct production {
 101                 unsigned short precedence;
 102                 enum assoc assoc;
 103                 char line_like;
 104                 ## production fields
 105         };
 106         struct grammar {
 107                 ## grammar fields
 108         };
 109
 110 The strings reported by `mdcode` and `scanner` are `struct text` which have
 111 length rather than being null terminated.  To help with printing and
 112 comparing we define `text_is` and `prtxt`, which should possibly go in
 113 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 114 which might contain control characters.
 115
 116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 117 because that is what we need to detect tags.
 118
 119 ###### functions
 120         static int text_is(struct text t, char *s)
 121         {
 122                 return (strlen(s) == t.len &&
 123                         strncmp(s, t.txt, t.len) == 0);
 124         }
 125         static void prtxt(struct text t)
 126         {
 127                 printf("%.*s", t.len, t.txt);
 128         }
 129
 130         static int strip_tag(struct text *t, char *tag)
 131         {
 132                 int skip = strlen(tag) + 1;
 133                 if (skip >= t->len ||
 134                     strncmp(t->txt, tag, skip-1) != 0 ||
 135                     t->txt[skip-1] != ':')
 136                         return 0;
 137                 while (skip < t->len && t->txt[skip] == ' ')
 138                         skip++;
 139                 t->len -= skip;
 140                 t->txt += skip;
 141                 return 1;
 142         }
 143
 144 ### Symbols
 145
 146 Productions are comprised primarily of symbols - terminal and
 147 non-terminal.  We do not make a syntactic distinction between the two,
 148 though non-terminals must be identifiers.  Non-terminal symbols are
 149 those which appear in the head of a production, terminal symbols are
 150 those which don't.  There are also "virtual" symbols used for precedence
 151 marking discussed later, and sometimes we won't know what type a symbol
 152 is yet.
 153
 154 ###### forward declarations
 155         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 156         char *symtypes = "UVTN";
 157 ###### symbol fields
 158         enum symtype type;
 159
 160 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 161 table of known symbols and the resulting parser will report them as
 162 `TK_reserved + N`.  A small set of identifiers are reserved for the
 163 different token types that `scanner` can report.
 164
 165 ###### declarations
 166         static char *reserved_words[] = {
 167                 [TK_error]        = "ERROR",
 168                 [TK_number]       = "NUMBER",
 169                 [TK_ident]        = "IDENTIFIER",
 170                 [TK_mark]         = "MARK",
 171                 [TK_string]       = "STRING",
 172                 [TK_multi_string] = "MULTI_STRING",
 173                 [TK_in]           = "IN",
 174                 [TK_out]          = "OUT",
 175                 [TK_newline]      = "NEWLINE",
 176                 [TK_eof]          = "$eof",
 177         };
 178 ###### symbol fields
 179         short num;
 180
 181 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 182 recognised.  The former is automatically expected at the end of the text
 183 being parsed. The latter are always ignored by the parser.
 184
 185 All of these symbols are stored in a simple symbol table.  We use the
 186 `struct text` from `mdcode` to store the name and link them together
 187 into a sorted list using an insertion sort.
 188
 189 We don't have separate `find` and `insert` functions as any symbol we
 190 find needs to be remembered.  We simply expect `find` to always return a
 191 symbol, but its type might be `Unknown`.
 192
 193 ###### includes
 194         #include <string.h>
 195
 196 ###### symbol fields
 197         struct text name;
 198         struct symbol *next;
 199
 200 ###### grammar fields
 201         struct symbol *syms;
 202         int num_syms;
 203
 204 ###### functions
 205         static struct symbol *sym_find(struct grammar *g, struct text s)
 206         {
 207                 struct symbol **l = &g->syms;
 208                 struct symbol *n;
 209                 int cmp = 1;
 210
 211                 while (*l &&
 212                         (cmp = text_cmp((*l)->name, s)) < 0)
 213                                 l = & (*l)->next;
 214                 if (cmp == 0)
 215                         return *l;
 216                 n = calloc(1, sizeof(*n));
 217                 n->name = s;
 218                 n->type = Unknown;
 219                 n->next = *l;
 220                 n->num = -1;
 221                 *l = n;
 222                 return n;
 223         }
 224
 225         static void symbols_init(struct grammar *g)
 226         {
 227                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 228                 int i;
 229                 for (i = 0; i < entries; i++) {
 230                         struct text t;
 231                         struct symbol *s;
 232                         t.txt = reserved_words[i];
 233                         if (!t.txt)
 234                                 continue;
 235                         t.len = strlen(t.txt);
 236                         s = sym_find(g, t);
 237                         s->type = Terminal;
 238                         s->num = i;
 239                 }
 240         }
 241
 242 ### Data types and precedence.
 243
 244 Data type specification and precedence specification are both
 245 introduced by a dollar sign at the start of the line.  If the next
 246 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 247 precedence, otherwise it specifies a data type.
 248
 249 The data type name is simply stored and applied to the head of all
 250 subsequent productions.  It must be the name of a structure optionally
 251 preceded by an asterisk which means a reference or "pointer".  So
 252 `$expression` maps to `struct expression` and `$*statement` maps to
 253 `struct statement *`.
 254
 255 Any productions given before the first data type declaration will have
 256 no data type associated with them and can carry no information.  In
 257 order to allow other non-terminals to have no type, the data type
 258 `$void` can be given.  This does *not* mean that `struct void` will be
 259 used, but rather than no type will be associated with future
 260 non-terminals.
 261
 262 The precedence line must contain a list of symbols - typically
 263 terminal symbols, but not necessarily.  It can only contain symbols
 264 that have not been seen yet, so precedence declaration must precede
 265 the productions that they affect.
 266
 267 A precedence line may also contain a "virtual" symbol which is an
 268 identifier preceded by `$$`.  Virtual terminals carry precedence
 269 information but are not included in the grammar.  A production can
 270 declare that it inherits the precedence of a given virtual symbol.
 271
 272 This use for `$$` precludes it from being used as a symbol in the
 273 described language.  Two other symbols: `${` and `}$` are also
 274 unavailable.
 275
 276 Each new precedence line introduces a new precedence level and
 277 declares how it associates.  This level is stored in each symbol
 278 listed and may be inherited by any production which uses the symbol.  A
 279 production inherits from the last symbol which has a precedence.
 280
 281 The symbols on the first precedence line have the lowest precedence.
 282 Subsequent lines introduce symbols with higher precedence.
 283
 284 ###### grammar fields
 285         struct text current_type;
 286         int type_isref;
 287         int prec_levels;
 288
 289 ###### declarations
 290         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 291         static const char *known[] = { "$$", "${", "}$" };
 292
 293 ###### functions
 294         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 295         {
 296                 struct token t = token_next(ts);
 297                 char *err;
 298                 enum assoc assoc;
 299                 int found;
 300
 301                 if (t.num != TK_ident) {
 302                         err = "type or assoc expected after '$'";
 303                         goto abort;
 304                 }
 305                 if (text_is(t.txt, "LEFT"))
 306                         assoc = Left;
 307                 else if (text_is(t.txt, "RIGHT"))
 308                         assoc = Right;
 309                 else if (text_is(t.txt, "NON"))
 310                         assoc = Non;
 311                 else {
 312                         g->current_type = t.txt;
 313                         g->type_isref = isref;
 314                         if (text_is(t.txt, "void"))
 315                                 g->current_type.txt = NULL;
 316                         t = token_next(ts);
 317                         if (t.num != TK_newline) {
 318                                 err = "Extra tokens after type name";
 319                                 goto abort;
 320                         }
 321                         return NULL;
 322                 }
 323
 324                 if (isref) {
 325                         err = "$* cannot be followed by a precedence";
 326                         goto abort;
 327                 }
 328
 329                 // This is a precedence line, need some symbols.
 330                 found = 0;
 331                 g->prec_levels += 1;
 332                 t = token_next(ts);
 333                 while (t.num != TK_newline) {
 334                         enum symtype type = Terminal;
 335                         struct symbol *s;
 336                         if (t.num == TK_virtual) {
 337                                 type = Virtual;
 338                                 t = token_next(ts);
 339                                 if (t.num != TK_ident) {
 340                                         err = "$$ must be followed by a word";
 341                                         goto abort;
 342                                 }
 343                         } else if (t.num != TK_ident &&
 344                                    t.num != TK_mark) {
 345                                 err = "Illegal token in precedence line";
 346                                 goto abort;
 347                         }
 348                         s = sym_find(g, t.txt);
 349                         if (s->type != Unknown) {
 350                                 err = "Symbols in precedence line must not already be known.";
 351                                 goto abort;
 352                         }
 353                         s->type = type;
 354                         s->precedence = g->prec_levels;
 355                         s->assoc = assoc;
 356                         found += 1;
 357                         t = token_next(ts);
 358                 }
 359                 if (found == 0)
 360                         err = "No symbols given on precedence line";
 361                         goto abort;
 362                 return NULL;
 363         abort:
 364                 while (t.num != TK_newline && t.num != TK_eof)
 365                         t = token_next(ts);
 366                 return err;
 367         }
 368
 369 ### Productions
 370
 371 A production either starts with an identifier which is the head
 372 non-terminal, or a vertical bar (`|`) in which case this production
 373 uses the same head as the previous one.  The identifier must be
 374 followed by a `->` mark.  All productions for a given non-terminal must
 375 be grouped together with the `nonterminal ->` given only once.
 376
 377 After this a (possibly empty) sequence of identifiers and marks form
 378 the body of the production.  A virtual symbol may be given after the
 379 body by preceding it with `$$`.  If a virtual symbol is given, the
 380 precedence of the production is that for the virtual symbol.  If none
 381 is given, the precedence is inherited from the last symbol in the
 382 production which has a precedence specified.
 383
 384 After the optional precedence may come the `${` mark.  This indicates
 385 the start of a code fragment.  If present, this must be on the same
 386 line as the start of the production.
 387
 388 All of the text from the `${` through to the matching `}$` is
 389 collected and forms the code-fragment for the production.  It must all
 390 be in one `code_node` of the literate code.  The `}$` must be
 391 at the end of a line.
 392
 393 Text in the code fragment will undergo substitutions where `$N` or
 394 `$<N`,for some numeric `N`, will be replaced with a variable holding
 395 the parse information for the particular symbol in the production.
 396 `$0` is the head of the production, `$1` is the first symbol of the
 397 body, etc.  The type of `$N` for a terminal symbol is `struct token`.
 398 For a non-terminal, it is whatever has been declared for that symbol.
 399 The `<` may be included for symbols declared as storing a reference
 400 (not a structure) and means that the reference is being moved out, so
 401 it will not automatically be freed.
 402
 403 While building productions we will need to add to an array which needs to
 404 grow dynamically.
 405
 406 ###### functions
 407         static void array_add(void *varray, int *cnt, void *new)
 408         {
 409                 void ***array = varray;
 410                 int current = 0;
 411                 const int step = 8;
 412                 current = ((*cnt-1) | (step-1))+1;
 413                 if (*cnt == current) {
 414                         /* must grow */
 415                         current += step;
 416                         *array = realloc(*array, current * sizeof(void*));
 417                 }
 418                 (*array)[*cnt] = new;
 419                 (*cnt) += 1;
 420         }
 421
 422 Collecting the code fragment simply involves reading tokens until we
 423 hit the end token `}$`, and noting the character position of the start and
 424 the end.
 425
 426 ###### functions
 427         static struct text collect_code(struct token_state *state,
 428                                         struct token start)
 429         {
 430                 struct text code;
 431                 struct token t;
 432                 code.txt = start.txt.txt + start.txt.len;
 433                 do
 434                         t = token_next(state);
 435                 while (t.node == start.node &&
 436                        t.num != TK_close && t.num != TK_error &&
 437                        t.num != TK_eof);
 438                 if (t.num == TK_close && t.node == start.node)
 439                         code.len = t.txt.txt - code.txt;
 440                 else
 441                         code.txt = NULL;
 442                 return code;
 443         }
 444
 445 Now we have all the bits we need to parse a full production.
 446
 447 ###### includes
 448         #include <memory.h>
 449
 450 ###### grammar fields
 451         struct production **productions;
 452         int production_count;
 453
 454 ###### production fields
 455         struct symbol  *head;
 456         struct symbol **body;
 457         int             body_size;
 458         struct text     code;
 459         int             code_line;
 460
 461 ###### symbol fields
 462         int first_production;
 463
 464 ###### functions
 465         static char *parse_production(struct grammar *g,
 466                                       struct symbol *head,
 467                                       struct token_state *state)
 468         {
 469                 /* Head has already been parsed. */
 470                 struct token tk;
 471                 char *err;
 472                 struct production p, *pp;
 473
 474                 memset(&p, 0, sizeof(p));
 475                 p.head = head;
 476                 tk = token_next(state);
 477                 while (tk.num == TK_ident || tk.num == TK_mark) {
 478                         struct symbol *bs = sym_find(g, tk.txt);
 479                         if (bs->type == Unknown)
 480                                 bs->type = Terminal;
 481                         if (bs->type == Virtual) {
 482                                 err = "Virtual symbol not permitted in production";
 483                                 goto abort;
 484                         }
 485                         if (bs->precedence) {
 486                                 p.precedence = bs->precedence;
 487                                 p.assoc = bs->assoc;
 488                         }
 489                         array_add(&p.body, &p.body_size, bs);
 490                         tk = token_next(state);
 491                 }
 492                 if (tk.num == TK_virtual) {
 493                         struct symbol *vs;
 494                         tk = token_next(state);
 495                         if (tk.num != TK_ident) {
 496                                 err = "word required after $$";
 497                                 goto abort;
 498                         }
 499                         vs = sym_find(g, tk.txt);
 500                         if (vs->num == TK_newline)
 501                                 p.line_like = 1;
 502                         else if (vs->num == TK_out)
 503                                 p.line_like = 2;
 504                         else if (vs->precedence == 0) {
 505                                 err = "symbol after $$ must have precedence";
 506                                 goto abort;
 507                         } else {
 508                                 p.precedence = vs->precedence;
 509                                 p.assoc = vs->assoc;
 510                         }
 511                         tk = token_next(state);
 512                 }
 513                 if (tk.num == TK_open) {
 514                         p.code_line = tk.line;
 515                         p.code = collect_code(state, tk);
 516                         if (p.code.txt == NULL) {
 517                                 err = "code fragment not closed properly";
 518                                 goto abort;
 519                         }
 520                         tk = token_next(state);
 521                 }
 522                 if (tk.num != TK_newline && tk.num != TK_eof) {
 523                         err = "stray tokens at end of line";
 524                         goto abort;
 525                 }
 526                 pp = malloc(sizeof(*pp));
 527                 *pp = p;
 528                 array_add(&g->productions, &g->production_count, pp);
 529                 return NULL;
 530         abort:
 531                 while (tk.num != TK_newline && tk.num != TK_eof)
 532                         tk = token_next(state);
 533                 return err;
 534         }
 535
 536 With the ability to parse production and dollar-lines, we have nearly all
 537 that we need to parse a grammar from a `code_node`.
 538
 539 The head of the first production will effectively be the `start` symbol of
 540 the grammar.  However it won't _actually_ be so.  Processing the grammar is
 541 greatly simplified if the real start symbol only has a single production,
 542 and expects `$eof` as the final terminal.  So when we find the first
 543 explicit production we insert an extra production as production zero which
 544 looks like
 545
 546 ###### Example: production 0
 547         $start -> START $eof
 548
 549 where `START` is the first non-terminal given.
 550
 551 ###### create production zero
 552         struct production *p = calloc(1,sizeof(*p));
 553         struct text start = {"$start",6};
 554         struct text eof = {"$eof",4};
 555         struct text code = {"$0 = $<1;", 9};
 556         p->head = sym_find(g, start);
 557         p->head->type = Nonterminal;
 558         p->head->struct_name = g->current_type;
 559         p->head->isref = g->type_isref;
 560         if (g->current_type.txt)
 561                 p->code = code;
 562         array_add(&p->body, &p->body_size, head);
 563         array_add(&p->body, &p->body_size, sym_find(g, eof));
 564         p->head->first_production = g->production_count;
 565         array_add(&g->productions, &g->production_count, p);
 566
 567 Now we are ready to read in the grammar.  We ignore comments
 568 and strings so that the marks which introduce them can be
 569 used as terminals in the grammar.  We don't ignore numbers
 570 even though we don't allow them as that causes the scanner
 571 to produce errors that the parser is better positioned to handle.
 572
 573 ###### grammar_read
 574         static struct grammar *grammar_read(struct code_node *code)
 575         {
 576                 struct token_config conf = {
 577                         .word_start = "",
 578                         .word_cont = "",
 579                         .words_marks = known,
 580                         .known_count = sizeof(known)/sizeof(known[0]),
 581                         .number_chars = "",
 582                         .ignored = (1 << TK_line_comment)
 583                                  | (1 << TK_block_comment)
 584                                  | (0 << TK_number)
 585                                  | (1 << TK_string)
 586                                  | (1 << TK_multi_string)
 587                                  | (1 << TK_in)
 588                                  | (1 << TK_out),
 589                 };
 590
 591                 struct token_state *state = token_open(code, &conf);
 592                 struct token tk;
 593                 struct symbol *head = NULL;
 594                 struct grammar *g;
 595                 char *err = NULL;
 596
 597                 g = calloc(1, sizeof(*g));
 598                 symbols_init(g);
 599
 600                 for (tk = token_next(state); tk.num != TK_eof;
 601                      tk = token_next(state)) {
 602                         if (tk.num == TK_newline)
 603                                 continue;
 604                         if (tk.num == TK_ident) {
 605                                 // new non-terminal
 606                                 head = sym_find(g, tk.txt);
 607                                 if (head->type == Nonterminal)
 608                                         err = "This non-terminal has already be used.";
 609                                 else if (head->type == Virtual)
 610                                         err = "Virtual symbol not permitted in head of production";
 611                                 else {
 612                                         head->type = Nonterminal;
 613                                         head->struct_name = g->current_type;
 614                                         head->isref = g->type_isref;
 615                                         if (g->production_count == 0) {
 616                                                 ## create production zero
 617                                         }
 618                                         head->first_production = g->production_count;
 619                                         tk = token_next(state);
 620                                         if (tk.num == TK_mark &&
 621                                             text_is(tk.txt, "->"))
 622                                                 err = parse_production(g, head, state);
 623                                         else
 624                                                 err = "'->' missing in production";
 625                                 }
 626                         } else if (tk.num == TK_mark
 627                                    && text_is(tk.txt, "|")) {
 628                                 // another production for same non-term
 629                                 if (head)
 630                                         err = parse_production(g, head, state);
 631                                 else
 632                                         err = "First production must have a head";
 633                         } else if (tk.num == TK_mark
 634                                    && text_is(tk.txt, "$")) {
 635                                 err = dollar_line(state, g, 0);
 636                         } else if (tk.num == TK_mark
 637                                    && text_is(tk.txt, "$*")) {
 638                                 err = dollar_line(state, g, 1);
 639                         } else if (tk.num == TK_mark
 640                                    && text_is(tk.txt, "//")) {
 641                                 while (tk.num != TK_newline &&
 642                                        tk.num != TK_eof)
 643                                         tk = token_next(state);
 644                         } else {
 645                                 err = "Unrecognised token at start of line.";
 646                         }
 647                         if (err)
 648                                 goto abort;
 649                 }
 650                 token_close(state);
 651                 return g;
 652         abort:
 653                 fprintf(stderr, "Error at line %d: %s\n",
 654                         tk.line, err);
 655                 token_close(state);
 656                 free(g);
 657                 return NULL;
 658         }
 659
 660 ## Analysing the grammar
 661
 662 The central task in analysing the grammar is to determine a set of
 663 states to drive the parsing process.  This will require finding
 664 various sets of symbols and of "items".  Some of these sets will need
 665 to attach information to each element in the set, so they are more
 666 like maps.
 667
 668 Each "item" may have a 'look-ahead' or `LA` set associated with
 669 it.  Many of these will be the same as each other.  To avoid memory
 670 wastage, and to simplify some comparisons of sets, these sets will be
 671 stored in a list of unique sets, each assigned a number.
 672
 673 Once we have the data structures in hand to manage these sets and
 674 lists, we can start setting the 'nullable' flag, build the 'FIRST'
 675 sets, and then create the item sets which define the various states.
 676
 677 ### Symbol sets.
 678
 679 Though we don't only store symbols in these sets, they are the main
 680 things we store, so they are called symbol sets or "symsets".
 681
 682 A symset has a size, an array of shorts, and an optional array of data
 683 values, which are also shorts.  If the array of data is not present,
 684 we store an impossible pointer, as `NULL` is used to indicate that no
 685 memory has been allocated yet;
 686
 687 ###### declarations
 688         struct symset {
 689                 short cnt;
 690                 unsigned short *syms, *data;
 691         };
 692         #define NO_DATA ((unsigned short *)1)
 693         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 694         const struct symset INIT_DATASET = { 0, NULL, NULL };
 695
 696 The arrays of shorts are allocated in blocks of 8 and are kept sorted
 697 by using an insertion sort.  We don't explicitly record the amount of
 698 allocated space as it can be derived directly from the current `cnt` using
 699 `((cnt - 1) | 7) + 1`.
 700
 701 ###### functions
 702         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 703         {
 704                 int i;
 705                 int current = ((s->cnt-1) | 7) + 1;
 706                 if (current == s->cnt) {
 707                         current += 8;
 708                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 709                         if (s->data != NO_DATA)
 710                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 711                 }
 712                 i = s->cnt;
 713                 while (i > 0 && s->syms[i-1] > key) {
 714                         s->syms[i] = s->syms[i-1];
 715                         if (s->data != NO_DATA)
 716                                 s->data[i] = s->data[i-1];
 717                         i--;
 718                 }
 719                 s->syms[i] = key;
 720                 if (s->data != NO_DATA)
 721                         s->data[i] = val;
 722                 s->cnt += 1;
 723         }
 724
 725 Finding a symbol (or item) in a `symset` uses a simple binary search.
 726 We return the index where the value was found (so data can be accessed),
 727 or `-1` to indicate failure.
 728
 729         static int symset_find(struct symset *ss, unsigned short key)
 730         {
 731                 int lo = 0;
 732                 int hi = ss->cnt;
 733
 734                 if (hi == 0)
 735                         return -1;
 736                 while (lo + 1 < hi) {
 737                         int mid = (lo + hi) / 2;
 738                         if (ss->syms[mid] <= key)
 739                                 lo = mid;
 740                         else
 741                                 hi = mid;
 742                 }
 743                 if (ss->syms[lo] == key)
 744                         return lo;
 745                 return -1;
 746         }
 747
 748 We will often want to form the union of two symsets.  When we do, we
 749 will often want to know if anything changed (as that might mean there
 750 is more work to do).  So `symset_union` reports whether anything was
 751 added to the first set.  We use a slow quadratic approach as these
 752 sets don't really get very big.  If profiles shows this to be a problem it
 753 can be optimised later.
 754
 755         static int symset_union(struct symset *a, struct symset *b)
 756         {
 757                 int i;
 758                 int added = 0;
 759                 for (i = 0; i < b->cnt; i++)
 760                         if (symset_find(a, b->syms[i]) < 0) {
 761                                 unsigned short data = 0;
 762                                 if (b->data != NO_DATA)
 763                                         data = b->data[i];
 764                                 symset_add(a, b->syms[i], data);
 765                                 added++;
 766                         }
 767                 return added;
 768         }
 769
 770 And of course we must be able to free a symset.
 771
 772         static void symset_free(struct symset ss)
 773         {
 774                 free(ss.syms);
 775                 if (ss.data != NO_DATA)
 776                         free(ss.data);
 777         }
 778
 779 ### Symset Storage
 780
 781 Some symsets are simply stored somewhere appropriate in the `grammar`
 782 data structure, others needs a bit of indirection.  This applies
 783 particularly to "LA" sets.  These will be paired with "items" in an
 784 "itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
 785 stored in the `data` field, so we need a mapping from a small (short)
 786 number to an LA `symset`.
 787
 788 As mentioned earlier this involves creating a list of unique symsets.
 789
 790 For now, we just use a linear list sorted by insertion.  A skiplist
 791 would be more efficient and may be added later.
 792
 793 ###### declarations
 794
 795         struct setlist {
 796                 struct symset ss;
 797                 int num;
 798                 struct setlist *next;
 799         };
 800
 801 ###### grammar fields
 802         struct setlist *sets;
 803         int nextset;
 804
 805 ###### functions
 806
 807         static int ss_cmp(struct symset a, struct symset b)
 808         {
 809                 int i;
 810                 int diff = a.cnt - b.cnt;
 811                 if (diff)
 812                         return diff;
 813                 for (i = 0; i < a.cnt; i++) {
 814                         diff = (int)a.syms[i] - (int)b.syms[i];
 815                         if (diff)
 816                                 return diff;
 817                 }
 818                 return 0;
 819         }
 820
 821         static int save_set(struct grammar *g, struct symset ss)
 822         {
 823                 struct setlist **sl = &g->sets;
 824                 int cmp = 1;
 825                 struct setlist *s;
 826
 827                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 828                         sl = & (*sl)->next;
 829                 if (cmp == 0) {
 830                         symset_free(ss);
 831                         return (*sl)->num;
 832                 }
 833
 834                 s = malloc(sizeof(*s));
 835                 s->ss = ss;
 836                 s->num = g->nextset;
 837                 g->nextset += 1;
 838                 s->next = *sl;
 839                 *sl = s;
 840                 return s->num;
 841         }
 842
 843 Finding a set by number is currently performed by a simple linear search.
 844 If this turns out to hurt performance, we can store the sets in a dynamic
 845 array like the productions.
 846
 847         static struct symset set_find(struct grammar *g, int num)
 848         {
 849                 struct setlist *sl = g->sets;
 850                 while (sl && sl->num != num)
 851                         sl = sl->next;
 852                 return sl->ss;
 853         }
 854
 855 ### Setting `nullable`
 856
 857 We set `nullable` on the head symbol for any production for which all
 858 the body symbols (if any) are nullable.  As this is a recursive
 859 definition, any change in the `nullable` setting means that we need to
 860 re-evaluate where it needs to be set.
 861
 862 We simply loop around performing the same calculations until no more
 863 changes happen.
 864
 865 ###### symbol fields
 866         int nullable;
 867
 868 ###### functions
 869         static void set_nullable(struct grammar *g)
 870         {
 871                 int check_again = 1;
 872                 while (check_again) {
 873                         int p;
 874                         check_again = 0;
 875                         for (p = 0; p < g->production_count; p++) {
 876                                 struct production *pr = g->productions[p];
 877                                 int s;
 878
 879                                 if (pr->head->nullable)
 880                                         continue;
 881                                 for (s = 0; s < pr->body_size; s++)
 882                                         if (! pr->body[s]->nullable)
 883                                                 break;
 884                                 if (s == pr->body_size) {
 885                                         pr->head->nullable = 1;
 886                                         check_again = 1;
 887                                 }
 888                         }
 889                 }
 890         }
 891
 892 ### Setting `line_like`
 893
 894 In order to be able to ignore newline tokens when not relevant, but
 895 still include them in the parse when needed, we will need to know
 896 which states can start a "line-like" section of code.  We ignore
 897 newlines when there is an indent since the most recent start of a
 898 line-like symbol.
 899
 900 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
 901 If a symbol cannot derive a NEWLINE, then it is only part of a line -
 902 so is word-like.  If it can derive a NEWLINE, then we consider it to
 903 be like a line.
 904
 905 Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
 906 is the head of a production that contains a line_like symbol is also a
 907 line-like symbol.  We use a new field `line_like` to record this
 908 attribute of symbols, and compute it in a repetitive manner similar to
 909 `set_nullable`.
 910
 911 ###### symbol fields
 912         int line_like;
 913
 914 ###### functions
 915         static void set_line_like(struct grammar *g)
 916         {
 917                 int check_again = 1;
 918                 g->symtab[TK_newline]->line_like = 1;
 919                 while (check_again) {
 920                         int p;
 921                         check_again = 0;
 922                         for (p = 0; p < g->production_count; p++) {
 923                                 struct production *pr = g->productions[p];
 924                                 int s;
 925
 926                                 if (pr->head->line_like)
 927                                         continue;
 928
 929                                 for (s = 0 ; s < pr->body_size; s++) {
 930                                         if (pr->body[s]->line_like) {
 931                                                 pr->head->line_like = 1;
 932                                                 check_again = 1;
 933                                                 break;
 934                                         }
 935                                 }
 936                         }
 937                 }
 938         }
 939
 940 ### Building the `first` sets
 941
 942 When calculating what can follow a particular non-terminal, we will need to
 943 know what the "first" terminal in any subsequent non-terminal might be.  So
 944 we calculate the `first` set for every non-terminal and store them in an
 945 array.  We don't bother recording the "first" set for terminals as they are
 946 trivial.
 947
 948 As the "first" for one symbol might depend on the "first" of another,
 949 we repeat the calculations until no changes happen, just like with
 950 `set_nullable`.  This makes use of the fact that `symset_union`
 951 reports if any change happens.
 952
 953 The core of this, which finds the "first" of part of a production body,
 954 will be reused for computing the "follow" sets, so we split it out
 955 into a separate function.
 956
 957 ###### grammar fields
 958         struct symset *first;
 959
 960 ###### functions
 961
 962         static int add_first(struct production *pr, int start,
 963                              struct symset *target, struct grammar *g,
 964                              int *to_end)
 965         {
 966                 int s;
 967                 int changed = 0;
 968                 for (s = start; s < pr->body_size; s++) {
 969                         struct symbol *bs = pr->body[s];
 970                         if (bs->type == Terminal) {
 971                                 if (symset_find(target, bs->num) < 0) {
 972                                         symset_add(target, bs->num, 0);
 973                                         changed = 1;
 974                                 }
 975                                 break;
 976                         } else if (symset_union(target, &g->first[bs->num]))
 977                                 changed = 1;
 978                         if (!bs->nullable)
 979                                 break;
 980                 }
 981                 if (to_end)
 982                         *to_end = (s == pr->body_size);
 983                 return changed;
 984         }
 985
 986         static void build_first(struct grammar *g)
 987         {
 988                 int check_again = 1;
 989                 int s;
 990                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 991                 for (s = 0; s < g->num_syms; s++)
 992                         g->first[s] = INIT_SYMSET;
 993
 994                 while (check_again) {
 995                         int p;
 996                         check_again = 0;
 997                         for (p = 0; p < g->production_count; p++) {
 998                                 struct production *pr = g->productions[p];
 999                                 struct symset *head = &g->first[pr->head->num];
1000
1001                                 if (add_first(pr, 0, head, g, NULL))
1002                                         check_again = 1;
1003                         }
1004                 }
1005         }
1006
1007 ### Building the `follow` sets.
1008
1009 There are two different situations when we will want to generate "follow"
1010 sets.  If we are doing an SLR analysis, we want to generate a single
1011 "follow" set for each non-terminal in the grammar.  That is what is
1012 happening here.  If we are doing an LALR or LR analysis we will want
1013 to generate a separate "LA" set for each item.  We do that later
1014 in state generation.
1015
1016 There are two parts to generating a "follow" set.  Firstly we look at
1017 every place that any non-terminal appears in the body of any
1018 production, and we find the set of possible "first" symbols after
1019 there.  This is done using `add_first` above and only needs to be done
1020 once as the "first" sets are now stable and will not change.
1021
1022 ###### follow code
1023
1024         for (p = 0; p < g->production_count; p++) {
1025                 struct production *pr = g->productions[p];
1026                 int b;
1027
1028                 for (b = 0; b < pr->body_size - 1; b++) {
1029                         struct symbol *bs = pr->body[b];
1030                         if (bs->type == Terminal)
1031                                 continue;
1032                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1033                 }
1034         }
1035
1036 The second part is to add the "follow" set of the head of a production
1037 to the "follow" sets of the final symbol in the production, and any
1038 other symbol which is followed only by `nullable` symbols.  As this
1039 depends on "follow" itself we need to repeatedly perform the process
1040 until no further changes happen.
1041
1042 ###### follow code
1043
1044         for (again = 0, p = 0;
1045              p < g->production_count;
1046              p < g->production_count-1
1047                 ? p++ : again ? (again = 0, p = 0)
1048                               : p++) {
1049                 struct production *pr = g->productions[p];
1050                 int b;
1051
1052                 for (b = pr->body_size - 1; b >= 0; b--) {
1053                         struct symbol *bs = pr->body[b];
1054                         if (bs->type == Terminal)
1055                                 break;
1056                         if (symset_union(&g->follow[bs->num],
1057                                          &g->follow[pr->head->num]))
1058                                 again = 1;
1059                         if (!bs->nullable)
1060                                 break;
1061                 }
1062         }
1063
1064 We now just need to create and initialise the `follow` list to get a
1065 complete function.
1066
1067 ###### grammar fields
1068         struct symset *follow;
1069
1070 ###### functions
1071         static void build_follow(struct grammar *g)
1072         {
1073                 int s, again, p;
1074                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1075                 for (s = 0; s < g->num_syms; s++)
1076                         g->follow[s] = INIT_SYMSET;
1077                 ## follow code
1078         }
1079
1080 ### Building itemsets and states
1081
1082 There are three different levels of detail that can be chosen for
1083 building the itemsets and states for the LR grammar.  They are:
1084
1085 1. LR(0) or SLR(1), where no look-ahead is considered.
1086 2. LALR(1) where we build look-ahead sets with each item and merge
1087    the LA sets when we find two paths to the same "kernel" set of items.
1088 3. LR(1) where different look-ahead for any item in the set means
1089    a different state must be created.
1090
1091 ###### forward declarations
1092         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1093
1094 We need to be able to look through existing states to see if a newly
1095 generated state already exists.  For now we use a simple sorted linked
1096 list.
1097
1098 An item is a pair of numbers: the production number and the position of
1099 "DOT", which is an index into the body of the production.
1100 As the numbers are not enormous we can combine them into a single "short"
1101 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1102 production number (so 4000 productions with maximum of 15 symbols in the
1103 body).
1104
1105 Comparing the itemsets will be a little different to comparing symsets
1106 as we want to do the lookup after generating the "kernel" of an
1107 itemset, so we need to ignore the offset=zero items which are added during
1108 completion.
1109
1110 To facilitate this, we modify the "DOT" number so that "0" sorts to
1111 the end of the list in the symset, and then only compare items before
1112 the first "0".
1113
1114 ###### declarations
1115         static inline unsigned short item_num(int production, int index)
1116         {
1117                 return production | ((31-index) << 11);
1118         }
1119         static inline int item_prod(unsigned short item)
1120         {
1121                 return item & 0x7ff;
1122         }
1123         static inline int item_index(unsigned short item)
1124         {
1125                 return (31-(item >> 11)) & 0x1f;
1126         }
1127
1128 For LR(1) analysis we need to compare not just the itemset in a state
1129 but also the LA sets.  As we assign each unique LA set a number, we
1130 can just compare the symset and the data values together.
1131
1132 ###### functions
1133         static int itemset_cmp(struct symset a, struct symset b,
1134                                enum grammar_type type)
1135         {
1136                 int i;
1137                 int av, bv;
1138
1139                 for (i = 0;
1140                      i < a.cnt && i < b.cnt &&
1141                      item_index(a.syms[i]) > 0 &&
1142                      item_index(b.syms[i]) > 0;
1143                      i++) {
1144                         int diff = a.syms[i] - b.syms[i];
1145                         if (diff)
1146                                 return diff;
1147                         if (type == LR1) {
1148                                 diff = a.data[i] - b.data[i];
1149                                 if (diff)
1150                                         return diff;
1151                         }
1152                 }
1153                 if (i == a.cnt || item_index(a.syms[i]) == 0)
1154                         av = -1;
1155                 else
1156                         av = a.syms[i];
1157                 if (i == b.cnt || item_index(b.syms[i]) == 0)
1158                         bv = -1;
1159                 else
1160                         bv = b.syms[i];
1161                 if (av - bv)
1162                         return av - bv;
1163                 if (type < LR1 || av == -1)
1164                         return 0;
1165                 return
1166                         a.data[i] - b.data[i];
1167         }
1168
1169 It will be helpful to know if an itemset has been "completed" or not,
1170 particularly for LALR where itemsets get merged, at which point they
1171 need to be consider for completion again.  So  a `completed` flag is needed.
1172
1173 For correct handling of `TK_newline` when parsing, we will need to
1174 know which states (itemsets) can occur at the start of a line, so we
1175 will record a `starts_line` flag too whenever DOT is at the start of a
1176 `line_like` symbol.
1177
1178 Finally, for handling `TK_out` we need to know whether productions in the
1179 current state started *before* the most recent indent.  A state
1180 doesn't usually keep details of individual productions, so we need to
1181 add one extra detail. `min_prefix` is the smallest non-zero number of
1182 symbols *before* DOT in any production in an itemset.  This will allow
1183 us to determine if the the most recent indent is sufficiently recent
1184 to cancel it against a `TK_out`.  If it was seen longer ago than the
1185 `min_prefix`, and if the current state cannot be reduced, then the
1186 indented section must have ended in the middle of a syntactic unit, so
1187 an error must be signaled.
1188
1189 And now we can build the list of itemsets.  The lookup routine returns
1190 both a success flag and a pointer to where in the list an insert
1191 should happen, so we don't need to search a second time.
1192
1193 ###### declarations
1194         struct itemset {
1195                 struct itemset *next;
1196                 short state;
1197                 struct symset items;
1198                 struct symset go_to;
1199                 enum assoc assoc;
1200                 unsigned short precedence;
1201                 char completed;
1202                 char starts_line;
1203                 int min_prefix;
1204         };
1205
1206 ###### grammar fields
1207         struct itemset *items;
1208         int states;
1209
1210 ###### functions
1211         static int itemset_find(struct grammar *g, struct itemset ***where,
1212                                 struct symset kernel, enum grammar_type type)
1213         {
1214                 struct itemset **ip;
1215
1216                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1217                         struct itemset *i = *ip;
1218                         int diff;
1219                         diff = itemset_cmp(i->items, kernel, type);
1220                         if (diff < 0)
1221                                 continue;
1222                         if (diff > 0)
1223                                 break;
1224                         /* found */
1225                         *where = ip;
1226                         return 1;
1227                 }
1228                 *where = ip;
1229                 return 0;
1230         }
1231
1232 Adding an itemset may require merging the LA sets if LALR analysis is
1233 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1234 clear the `completed` flag so that the dependants of this itemset will be
1235 recalculated and their LA sets updated.
1236
1237 `add_itemset` must consume the symsets it is passed, either by adding
1238 them to a data structure, of freeing them.
1239
1240         static int add_itemset(struct grammar *g, struct symset ss,
1241                                enum assoc assoc, unsigned short precedence,
1242                                enum grammar_type type)
1243         {
1244                 struct itemset **where, *is;
1245                 int i;
1246                 int found = itemset_find(g, &where, ss, type);
1247                 if (!found) {
1248                         is = calloc(1, sizeof(*is));
1249                         is->state = g->states;
1250                         g->states += 1;
1251                         is->items = ss;
1252                         is->assoc = assoc;
1253                         is->precedence = precedence;
1254                         is->next = *where;
1255                         is->go_to = INIT_DATASET;
1256                         *where = is;
1257                         return is->state;
1258                 }
1259                 is = *where;
1260                 if (type != LALR) {
1261                         symset_free(ss);
1262                         return is->state;
1263                 }
1264                 for (i = 0; i < ss.cnt; i++) {
1265                         struct symset temp = INIT_SYMSET, s;
1266                         if (ss.data[i] == is->items.data[i])
1267                                 continue;
1268                         s = set_find(g, is->items.data[i]);
1269                         symset_union(&temp, &s);
1270                         s = set_find(g, ss.data[i]);
1271                         if (symset_union(&temp, &s)) {
1272                                 is->items.data[i] = save_set(g, temp);
1273                                 is->completed = 0;
1274                         } else
1275                                 symset_free(temp);
1276                 }
1277                 symset_free(ss);
1278                 return is->state;
1279         }
1280
1281 #### The build
1282
1283 To build all the itemsets, we first insert the initial itemset made
1284 from production zero, complete each itemset, and then generate new
1285 itemsets from old until no new ones can be made.
1286
1287 Completing an itemset means finding all the items where "DOT" is followed by
1288 a nonterminal and adding "DOT=0" items for every production from that
1289 non-terminal - providing each item hasn't already been added.
1290
1291 If LA sets are needed, the LA set for each new item is found using
1292 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
1293 be supplemented by the LA set for the item which produce the new item.
1294
1295 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1296 is used in the next stage.
1297 If any of these symbols are flagged as `line_like`, then this
1298 state must be a `starts_line` state so now is a good time to record that.
1299
1300 When itemsets are created we assign a precedence to the itemset from
1301 the complete item, if there is one.  We ignore the possibility of
1302 there being two and don't (currently) handle precedence in such
1303 grammars.  When completing a grammar we ignore any item where DOT is
1304 followed by a terminal with a precedence lower than that for the
1305 itemset.  Unless the terminal has right associativity, we also ignore
1306 items where the terminal has the same precedence.  The result is that
1307 unwanted items are still in the itemset, but the terminal doesn't get
1308 into the go to set, so the item is ineffective.
1309
1310 ###### complete itemset
1311         for (i = 0; i < is->items.cnt; i++) {
1312                 int p = item_prod(is->items.syms[i]);
1313                 int bs = item_index(is->items.syms[i]);
1314                 struct production *pr = g->productions[p];
1315                 int p2;
1316                 struct symbol *s;
1317                 struct symset LA = INIT_SYMSET;
1318                 unsigned short sn = 0;
1319                 struct symset LAnl = INIT_SYMSET;
1320                 unsigned short snnl = 0;
1321
1322                 if (is->min_prefix == 0 ||
1323                     (bs > 0 && bs < is->min_prefix))
1324                         is->min_prefix = bs;
1325                 if (bs == pr->body_size)
1326                         continue;
1327                 s = pr->body[bs];
1328                 if (s->precedence && is->precedence &&
1329                     is->precedence > s->precedence)
1330                         /* This terminal has a low precedence and
1331                          * shouldn't be shifted
1332                          */
1333                         continue;
1334                 if (s->precedence && is->precedence &&
1335                     is->precedence == s->precedence && s->assoc != Right)
1336                         /* This terminal has a matching precedence and is
1337                          * not Right-associative, so we mustn't shift it.
1338                          */
1339                         continue;
1340                 if (symset_find(&done, s->num) < 0) {
1341                         symset_add(&done, s->num, 0);
1342                 }
1343                 if (s->type != Nonterminal)
1344                         continue;
1345                 if (s->line_like)
1346                         is->starts_line = 1;
1347                 again = 1;
1348                 if (type >= LALR) {
1349                         // Need the LA set.
1350                         int to_end;
1351                         add_first(pr, bs+1, &LA, g, &to_end);
1352                         if (to_end) {
1353                                 struct symset ss = set_find(g, is->items.data[i]);
1354                                 symset_union(&LA, &ss);
1355                         }
1356                         sn = save_set(g, LA);
1357                         LA = set_find(g, sn);
1358                         if (symset_find(&LA, TK_newline))
1359                                 symset_add(&LAnl, TK_newline, 0);
1360                         snnl = save_set(g, LAnl);
1361                         LAnl = set_find(g, snnl);
1362                 }
1363
1364                 /* Add productions for this symbol */
1365                 for (p2 = s->first_production;
1366                      p2 < g->production_count &&
1367                       g->productions[p2]->head == s;
1368                      p2++) {
1369                         int itm = item_num(p2, 0);
1370                         int pos = symset_find(&is->items, itm);
1371                         if (pos < 0) {
1372                                 if (g->productions[p2]->line_like)
1373                                         symset_add(&is->items, itm, snnl);
1374                                 else
1375                                         symset_add(&is->items, itm, sn);
1376                                 /* Will have re-ordered, so start
1377                                  * from beginning again */
1378                                 i = -1;
1379                         } else if (type >= LALR) {
1380                                 struct symset ss = set_find(g, is->items.data[pos]);
1381                                 struct symset tmp = INIT_SYMSET;
1382                                 struct symset *la = &LA;
1383
1384                                 if (g->productions[p2]->line_like)
1385                                         la = &LAnl;
1386                                 symset_union(&tmp, &ss);
1387                                 if (symset_union(&tmp, la)) {
1388                                         is->items.data[pos] = save_set(g, tmp);
1389                                         i = -1;
1390                                 } else
1391                                         symset_free(tmp);
1392                         }
1393                 }
1394         }
1395
1396 For each symbol we found (and placed in `done`) we collect all the items for
1397 which this symbol is next, and create a new itemset with "DOT" advanced over
1398 the symbol.  This is then added to the collection of itemsets (or merged
1399 with a pre-existing itemset).
1400
1401 ###### derive itemsets
1402         // Now we have a completed itemset, so we need to
1403         // compute all the 'next' itemsets and create them
1404         // if they don't exist.
1405         for (i = 0; i < done.cnt; i++) {
1406                 int j;
1407                 unsigned short state;
1408                 struct symbol *sym = g->symtab[done.syms[i]];
1409                 enum assoc assoc = Non;
1410                 unsigned short precedence = 0;
1411                 struct symset newitemset = INIT_SYMSET;
1412                 if (type >= LALR)
1413                         newitemset = INIT_DATASET;
1414
1415                 for (j = 0; j < is->items.cnt; j++) {
1416                         int itm = is->items.syms[j];
1417                         int p = item_prod(itm);
1418                         int bp = item_index(itm);
1419                         struct production *pr = g->productions[p];
1420                         unsigned short la = 0;
1421                         int pos;
1422
1423                         if (bp == pr->body_size)
1424                                 continue;
1425                         if (pr->body[bp] != sym)
1426                                 continue;
1427                         if (type >= LALR)
1428                                 la = is->items.data[j];
1429                         pos = symset_find(&newitemset, pr->head->num);
1430                         if (bp + 1 == pr->body_size &&
1431                             pr->precedence > 0 &&
1432                             pr->precedence > precedence) {
1433                                 // new itemset is reducible and has a precedence.
1434                                 precedence = pr->precedence;
1435                                 assoc = pr->assoc;
1436                         }
1437                         if (pos < 0)
1438                                 symset_add(&newitemset, item_num(p, bp+1), la);
1439                         else if (type >= LALR) {
1440                                 // Need to merge la set.
1441                                 int la2 = newitemset.data[pos];
1442                                 if (la != la2) {
1443                                         struct symset ss = set_find(g, la2);
1444                                         struct symset LA = INIT_SYMSET;
1445                                         symset_union(&LA, &ss);
1446                                         ss = set_find(g, la);
1447                                         if (symset_union(&LA, &ss))
1448                                                 newitemset.data[pos] = save_set(g, LA);
1449                                         else
1450                                                 symset_free(LA);
1451                                 }
1452                         }
1453                 }
1454                 state = add_itemset(g, newitemset, assoc, precedence, type);
1455                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1456                         symset_add(&is->go_to, done.syms[i], state);
1457         }
1458
1459 All that is left is to create the initial itemset from production zero, and
1460 with `TK_eof` as the LA set.
1461
1462 ###### functions
1463         static void build_itemsets(struct grammar *g, enum grammar_type type)
1464         {
1465                 struct symset first = INIT_SYMSET;
1466                 struct itemset *is;
1467                 int again;
1468                 unsigned short la = 0;
1469                 if (type >= LALR) {
1470                         // LA set just has eof
1471                         struct symset eof = INIT_SYMSET;
1472                         symset_add(&eof, TK_eof, 0);
1473                         la = save_set(g, eof);
1474                         first = INIT_DATASET;
1475                 }
1476                 // production 0, offset 0 (with no data)
1477                 symset_add(&first, item_num(0, 0), la);
1478                 add_itemset(g, first, Non, 0, type);
1479                 for (again = 0, is = g->items;
1480                      is;
1481                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
1482                         int i;
1483                         struct symset done = INIT_SYMSET;
1484                         if (is->completed)
1485                                 continue;
1486                         is->completed = 1;
1487                         again = 1;
1488                         ## complete itemset
1489                         ## derive itemsets
1490                         symset_free(done);
1491                 }
1492         }
1493
1494 ### Completing the analysis.
1495
1496 The exact process of analysis depends on which level was requested.  For
1497 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1498 `LR1` we need first, but not follow.  For `SLR` we need both.
1499
1500 We don't build the "action" tables that you might expect as the parser
1501 doesn't use them.  They will be calculated to some extent if needed for
1502 a report.
1503
1504 Once we have built everything we allocate arrays for the two lists:
1505 symbols and itemsets.  This allows more efficient access during reporting.
1506 The symbols are grouped as terminals and non-terminals and we record the
1507 changeover point in `first_nonterm`.
1508
1509 ###### grammar fields
1510         struct symbol **symtab;
1511         struct itemset **statetab;
1512         int first_nonterm;
1513
1514 ###### grammar_analyse
1515
1516         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1517         {
1518                 struct symbol *s;
1519                 struct itemset *is;
1520                 int snum = TK_reserved;
1521                 for (s = g->syms; s; s = s->next)
1522                         if (s->num < 0 && s->type == Terminal) {
1523                                 s->num = snum;
1524                                 snum++;
1525                         }
1526                 g->first_nonterm = snum;
1527                 for (s = g->syms; s; s = s->next)
1528                         if (s->num < 0) {
1529                                 s->num = snum;
1530                                 snum++;
1531                         }
1532                 g->num_syms = snum;
1533                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1534                 for (s = g->syms; s; s = s->next)
1535                         g->symtab[s->num] = s;
1536
1537                 set_nullable(g);
1538                 set_line_like(g);
1539                 if (type >= SLR)
1540                         build_first(g);
1541
1542                 if (type == SLR)
1543                         build_follow(g);
1544
1545                 build_itemsets(g, type);
1546
1547                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1548                 for (is = g->items; is ; is = is->next)
1549                         g->statetab[is->state] = is;
1550         }
1551
1552 ## Reporting on the Grammar
1553
1554 The purpose of the report is to give the grammar developer insight into
1555 how the grammar parser will work.  It is basically a structured dump of
1556 all the tables that have been generated, plus a description of any conflicts.
1557
1558 ###### grammar_report
1559         static int grammar_report(struct grammar *g, enum grammar_type type)
1560         {
1561                 report_symbols(g);
1562                 if (g->follow)
1563                         report_follow(g);
1564                 report_itemsets(g);
1565                 return report_conflicts(g, type);
1566         }
1567
1568 Firstly we have the complete list of symbols, together with the
1569 "FIRST" set if that was generated.  We add a mark to each symbol to
1570 show if it can end in a newline (`>`), if it is considered to be
1571 "line-like" (`<`), or if it is nullable (`.`).
1572
1573 ###### functions
1574
1575         static void report_symbols(struct grammar *g)
1576         {
1577                 int n;
1578                 if (g->first)
1579                         printf("SYMBOLS + FIRST:\n");
1580                 else
1581                         printf("SYMBOLS:\n");
1582
1583                 for (n = 0; n < g->num_syms; n++) {
1584                         struct symbol *s = g->symtab[n];
1585                         if (!s)
1586                                 continue;
1587
1588                         printf(" %c%c%3d%c: ",
1589                                s->nullable ? '.':' ',
1590                                s->line_like ? '<':' ',
1591                                s->num, symtypes[s->type]);
1592                         prtxt(s->name);
1593                         if (s->precedence)
1594                                 printf(" (%d%s)", s->precedence,
1595                                        assoc_names[s->assoc]);
1596
1597                         if (g->first && s->type == Nonterminal) {
1598                                 int j;
1599                                 char c = ':';
1600                                 for (j = 0; j < g->first[n].cnt; j++) {
1601                                         printf("%c ", c);
1602                                         c = ',';
1603                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1604                                 }
1605                         }
1606                         printf("\n");
1607                 }
1608         }
1609
1610 Then we have the follow sets if they were computed.
1611
1612         static void report_follow(struct grammar *g)
1613         {
1614                 int n;
1615                 printf("FOLLOW:\n");
1616                 for (n = 0; n < g->num_syms; n++)
1617                         if (g->follow[n].cnt) {
1618                                 int j;
1619                                 char c = ':';
1620                                 printf("  ");
1621                                 prtxt(g->symtab[n]->name);
1622                                 for (j = 0; j < g->follow[n].cnt; j++) {
1623                                         printf("%c ", c);
1624                                         c = ',';
1625                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1626                                 }
1627                                 printf("\n");
1628                         }
1629         }
1630
1631 And finally the item sets.  These include the GO TO tables and, for
1632 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1633 it up a bit.  First the items, with production number and associativity.
1634
1635         static void report_item(struct grammar *g, int itm)
1636         {
1637                 int p = item_prod(itm);
1638                 int dot = item_index(itm);
1639                 struct production *pr = g->productions[p];
1640                 int i;
1641
1642                 printf("    ");
1643                 prtxt(pr->head->name);
1644                 printf(" ->");
1645                 for (i = 0; i < pr->body_size; i++) {
1646                         printf(" %s", (dot == i ? ". ": ""));
1647                         prtxt(pr->body[i]->name);
1648                 }
1649                 if (dot == pr->body_size)
1650                         printf(" .");
1651                 printf(" [%d]", p);
1652                 if (pr->precedence && dot == pr->body_size)
1653                         printf(" (%d%s)", pr->precedence,
1654                                assoc_names[pr->assoc]);
1655                 if (dot < pr->body_size &&
1656                     pr->body[dot]->precedence) {
1657                         struct symbol *s = pr->body[dot];
1658                         printf(" [%d%s]", s->precedence,
1659                                assoc_names[s->assoc]);
1660                 }
1661                 if (pr->line_like == 1)
1662                         printf(" $$NEWLINE");
1663                 else if (pr->line_like)
1664                         printf(" $$OUT");
1665                 printf("\n");
1666         }
1667
1668 The LA sets which are (possibly) reported with each item:
1669
1670         static void report_la(struct grammar *g, int lanum)
1671         {
1672                 struct symset la = set_find(g, lanum);
1673                 int i;
1674                 char c = ':';
1675
1676                 printf("        LOOK AHEAD(%d)", lanum);
1677                 for (i = 0; i < la.cnt; i++) {
1678                         printf("%c ", c);
1679                         c = ',';
1680                         prtxt(g->symtab[la.syms[i]]->name);
1681                 }
1682                 printf("\n");
1683         }
1684
1685 Then the go to sets:
1686
1687         static void report_goto(struct grammar *g, struct symset gt)
1688         {
1689                 int i;
1690                 printf("    GOTO:\n");
1691
1692                 for (i = 0; i < gt.cnt; i++) {
1693                         printf("      ");
1694                         prtxt(g->symtab[gt.syms[i]]->name);
1695                         printf(" -> %d\n", gt.data[i]);
1696                 }
1697         }
1698
1699 Now we can report all the item sets complete with items, LA sets, and GO TO.
1700
1701         static void report_itemsets(struct grammar *g)
1702         {
1703                 int s;
1704                 printf("ITEM SETS(%d)\n", g->states);
1705                 for (s = 0; s < g->states; s++) {
1706                         int j;
1707                         struct itemset *is = g->statetab[s];
1708                         printf("  Itemset %d:%s min prefix=%d",
1709                                s, is->starts_line?" (startsline)":"", is->min_prefix);
1710                         if (is->precedence)
1711                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1712                         printf("\n");
1713                         for (j = 0; j < is->items.cnt; j++) {
1714                                 report_item(g, is->items.syms[j]);
1715                                 if (is->items.data != NO_DATA)
1716                                         report_la(g, is->items.data[j]);
1717                         }
1718                         report_goto(g, is->go_to);
1719                 }
1720         }
1721
1722 ### Reporting conflicts
1723
1724 Conflict detection varies a lot among different analysis levels.  However
1725 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1726 LR1 are also similar as they have FOLLOW or LA sets.
1727
1728 ###### functions
1729
1730         ## conflict functions
1731
1732         static int report_conflicts(struct grammar *g, enum grammar_type type)
1733         {
1734                 int cnt = 0;
1735                 printf("Conflicts:\n");
1736                 if (type < SLR)
1737                         cnt = conflicts_lr0(g, type);
1738                 else
1739                         cnt = conflicts_slr(g, type);
1740                 if (cnt == 0)
1741                         printf(" - no conflicts\n");
1742                 return cnt;
1743         }
1744
1745 LR0 conflicts are any state which have both a reducible item and
1746 a shiftable item, or two reducible items.
1747
1748 LR05 conflicts only occur if two possible reductions exist,
1749 as shifts always over-ride reductions.
1750
1751 ###### conflict functions
1752         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1753         {
1754                 int i;
1755                 int cnt = 0;
1756                 for (i = 0; i < g->states; i++) {
1757                         struct itemset *is = g->statetab[i];
1758                         int last_reduce = -1;
1759                         int prev_reduce = -1;
1760                         int last_shift = -1;
1761                         int j;
1762                         if (!is)
1763                                 continue;
1764                         for (j = 0; j < is->items.cnt; j++) {
1765                                 int itm = is->items.syms[j];
1766                                 int p = item_prod(itm);
1767                                 int bp = item_index(itm);
1768                                 struct production *pr = g->productions[p];
1769
1770                                 if (bp == pr->body_size) {
1771                                         prev_reduce = last_reduce;
1772                                         last_reduce = j;
1773                                         continue;
1774                                 }
1775                                 if (pr->body[bp]->type == Terminal)
1776                                         last_shift = j;
1777                         }
1778                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1779                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1780                                 report_item(g, is->items.syms[last_shift]);
1781                                 report_item(g, is->items.syms[last_reduce]);
1782                                 cnt++;
1783                         }
1784                         if (prev_reduce >= 0) {
1785                                 printf("  State %d has 2 (or more) reducible items\n", i);
1786                                 report_item(g, is->items.syms[prev_reduce]);
1787                                 report_item(g, is->items.syms[last_reduce]);
1788                                 cnt++;
1789                         }
1790                 }
1791                 return cnt;
1792         }
1793
1794 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1795 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1796 in the source of the look ahead set.
1797
1798 We build two datasets to reflect the "action" table: one which maps
1799 terminals to items where that terminal could be shifted and another
1800 which maps terminals to items that could be reduced when the terminal
1801 is in look-ahead.  We report when we get conflicts between the two.
1802
1803 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1804 terminal, we ignore it.  NEWLINES are handled specially with its own
1805 rules for when to shift and when to reduce.  Conflicts are expected,
1806 but handled internally.
1807
1808         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1809         {
1810                 int i;
1811                 int cnt = 0;
1812
1813                 for (i = 0; i < g->states; i++) {
1814                         struct itemset *is = g->statetab[i];
1815                         struct symset shifts = INIT_DATASET;
1816                         struct symset reduce = INIT_DATASET;
1817                         int j;
1818                         if (!is)
1819                                 continue;
1820                         /* First collect the shifts */
1821                         for (j = 0; j < is->items.cnt; j++) {
1822                                 unsigned short itm = is->items.syms[j];
1823                                 int p = item_prod(itm);
1824                                 int bp = item_index(itm);
1825                                 struct production *pr = g->productions[p];
1826                                 struct symbol *s;
1827
1828                                 if (bp >= pr->body_size ||
1829                                     pr->body[bp]->type != Terminal)
1830                                         /* not shiftable */
1831                                         continue;
1832
1833                                 s = pr->body[bp];
1834                                 if (s->precedence && is->precedence)
1835                                         /* Precedence resolves this, so no conflict */
1836                                         continue;
1837
1838                                 if (symset_find(&shifts, s->num) < 0)
1839                                         symset_add(&shifts, s->num, itm);
1840                         }
1841                         /* Now look for reductions and conflicts */
1842                         for (j = 0; j < is->items.cnt; j++) {
1843                                 unsigned short itm = is->items.syms[j];
1844                                 int p = item_prod(itm);
1845                                 int bp = item_index(itm);
1846                                 struct production *pr = g->productions[p];
1847
1848                                 if (bp < pr->body_size)
1849                                         continue;
1850                                 /* reducible */
1851                                 struct symset la;
1852                                 if (type == SLR)
1853                                         la = g->follow[pr->head->num];
1854                                 else
1855                                         la = set_find(g, is->items.data[j]);
1856                                 int k;
1857                                 for (k = 0; k < la.cnt; k++) {
1858                                         int pos = symset_find(&shifts, la.syms[k]);
1859                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1860                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1861                                                 cnt++;
1862                                                         prtxt(g->symtab[la.syms[k]]->name);
1863                                                 printf(":\n");
1864                                                 report_item(g, shifts.data[pos]);
1865                                                 report_item(g, itm);
1866                                         }
1867                                         pos = symset_find(&reduce, la.syms[k]);
1868                                         if (pos < 0) {
1869                                                 symset_add(&reduce, la.syms[k], itm);
1870                                                 continue;
1871                                         }
1872                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1873                                         prtxt(g->symtab[la.syms[k]]->name);
1874                                         printf(":\n");
1875                                         report_item(g, itm);
1876                                         report_item(g, reduce.data[pos]);
1877                                         cnt++;
1878                                 }
1879                         }
1880                         symset_free(shifts);
1881                         symset_free(reduce);
1882                 }
1883                 return cnt;
1884         }
1885
1886 ## Generating the parser
1887
1888 The exported part of the parser is the `parse_XX` function, where the name
1889 `XX` is based on the name of the parser files.
1890
1891 This takes a `code_node`, a partially initialized `token_config`, and an
1892 optional `FILE` to send tracing to.  The `token_config` gets the list of
1893 known words added and then is used with the `code_node` to initialize the
1894 scanner.
1895
1896 `parse_XX` then calls the library function `parser_run` to actually complete
1897 the parse.  This needs the `states` table and function to call the various
1898 pieces of code provided in the grammar file, so they are generated first.
1899
1900 ###### parser_generate
1901
1902         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1903                                struct code_node *pre_reduce)
1904         {
1905                 gen_known(f, g);
1906                 gen_non_term(f, g);
1907                 gen_goto(f, g);
1908                 gen_states(f, g);
1909                 gen_reduce(f, g, file, pre_reduce);
1910                 gen_free(f, g);
1911
1912                 fprintf(f, "#line 0 \"gen_parser\"\n");
1913                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1914                         name);
1915                 fprintf(f, "{\n");
1916                 fprintf(f, "\tstruct token_state *tokens;\n");
1917                 fprintf(f, "\tconfig->words_marks = known;\n");
1918                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1919                 fprintf(f, "\ttokens = token_open(code, config);\n");
1920                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1921                 fprintf(f, "\ttoken_close(tokens);\n");
1922                 fprintf(f, "\treturn rv;\n");
1923                 fprintf(f, "}\n\n");
1924         }
1925
1926 ### Known words table
1927
1928 The known words table is simply an array of terminal symbols.
1929 The table of nonterminals used for tracing is a similar array.  We
1930 include virtual symbols in the table of non_terminals to keep the
1931 numbers right.
1932
1933 ###### functions
1934
1935         static void gen_known(FILE *f, struct grammar *g)
1936         {
1937                 int i;
1938                 fprintf(f, "#line 0 \"gen_known\"\n");
1939                 fprintf(f, "static const char *known[] = {\n");
1940                 for (i = TK_reserved;
1941                      i < g->num_syms && g->symtab[i]->type == Terminal;
1942                      i++)
1943                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1944                                 g->symtab[i]->name.txt);
1945                 fprintf(f, "};\n\n");
1946         }
1947
1948         static void gen_non_term(FILE *f, struct grammar *g)
1949         {
1950                 int i;
1951                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1952                 fprintf(f, "static const char *non_term[] = {\n");
1953                 for (i = TK_reserved;
1954                      i < g->num_syms;
1955                      i++)
1956                         if (g->symtab[i]->type != Terminal)
1957                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1958                                         g->symtab[i]->name.txt);
1959                 fprintf(f, "};\n\n");
1960         }
1961
1962 ### States and the goto tables.
1963
1964 For each state we record the goto table, the reducible production if
1965 there is one, or a symbol to shift for error recovery.
1966 Some of the details of the reducible production are stored in the
1967 `do_reduce` function to come later.  Here we store the production number,
1968 the body size (useful for stack management) and the resulting symbol (useful
1969 for knowing how to free data later).
1970
1971 The go to table is stored in a simple array of `sym` and corresponding
1972 `state`.
1973
1974 ###### exported types
1975
1976         struct lookup {
1977                 short sym;
1978                 short state;
1979         };
1980         struct state {
1981                 short go_to_cnt;
1982                 const struct lookup * go_to;
1983                 short reduce_prod;
1984                 short reduce_size;
1985                 short reduce_sym;
1986                 char starts_line;
1987                 char newline_only;
1988                 short min_prefix;
1989         };
1990
1991 ###### functions
1992
1993         static void gen_goto(FILE *f, struct grammar *g)
1994         {
1995                 int i;
1996                 fprintf(f, "#line 0 \"gen_goto\"\n");
1997                 for (i = 0; i < g->states; i++) {
1998                         int j;
1999                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2000                                 i);
2001                         struct symset gt = g->statetab[i]->go_to;
2002                         for (j = 0; j < gt.cnt; j++)
2003                                 fprintf(f, "\t{ %d, %d },\n",
2004                                         gt.syms[j], gt.data[j]);
2005                         fprintf(f, "};\n");
2006                 }
2007         }
2008
2009 ###### functions
2010
2011         static void gen_states(FILE *f, struct grammar *g)
2012         {
2013                 int i;
2014                 fprintf(f, "#line 0 \"gen_states\"\n");
2015                 fprintf(f, "static const struct state states[] = {\n");
2016                 for (i = 0; i < g->states; i++) {
2017                         struct itemset *is = g->statetab[i];
2018                         int j, prod = -1, prod_len;
2019
2020                         for (j = 0; j < is->items.cnt; j++) {
2021                                 int itm = is->items.syms[j];
2022                                 int p = item_prod(itm);
2023                                 int bp = item_index(itm);
2024                                 struct production *pr = g->productions[p];
2025
2026                                 if (bp < pr->body_size)
2027                                         continue;
2028                                 /* This is what we reduce */
2029                                 if (prod < 0 || prod_len < pr->body_size) {
2030                                         prod = p;
2031                                         prod_len = pr->body_size;
2032                                 }
2033                         }
2034
2035                         if (prod >= 0)
2036                                 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2037                                         i, is->go_to.cnt, i, prod,
2038                                         g->productions[prod]->body_size,
2039                                         g->productions[prod]->head->num,
2040                                         is->starts_line,
2041                                         g->productions[prod]->line_like,
2042                                         is->min_prefix);
2043                         else
2044                                 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2045                                         i, is->go_to.cnt, i,
2046                                         is->starts_line, is->min_prefix);
2047                 }
2048                 fprintf(f, "};\n\n");
2049         }
2050
2051 ### The `do_reduce` function and the code
2052
2053 When the parser engine decides to reduce a production, it calls `do_reduce`.
2054 This has two functions.
2055
2056 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
2057 production being reduced.  Secondly it runs the code that was included with
2058 the production if any.
2059
2060 This code needs to be able to store data somewhere.  Rather than requiring
2061 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
2062 `do_reduce` return the size to be saved.
2063
2064 In order for the code to access "global" context, we pass in the
2065 "config" pointer that was passed to parser function.  If the `struct
2066 token_config` is embedded in some larger structure, the reducing code
2067 can access the larger structure using pointer manipulation.
2068
2069 The code fragment requires translation when written out.  Any `$N` needs to
2070 be converted to a reference either to that buffer (if `$0`) or to the
2071 structure returned by a previous reduction.  These pointers need to be cast
2072 to the appropriate type for each access.  All this is handled in
2073 `gen_code`.
2074
2075 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
2076 This applied only to symbols with references (or pointers), not those with structures.
2077 The `<` implies that the reference it being moved out, so the object will not be
2078 automatically freed.  This is equivalent to assigning `NULL` to the pointer.
2079
2080 ###### functions
2081
2082         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2083         {
2084                 char *c;
2085                 char *used = calloc(1, p->body_size);
2086                 int i;
2087
2088                 fprintf(f, "\t\t\t");
2089                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2090                         int n;
2091                         int use = 0;
2092                         if (*c != '$') {
2093                                 fputc(*c, f);
2094                                 if (*c == '\n')
2095                                         fputs("\t\t\t", f);
2096                                 continue;
2097                         }
2098                         c++;
2099                         if (*c == '<') {
2100                                 use = 1;
2101                                 c++;
2102                         }
2103                         if (*c < '0' || *c > '9') {
2104                                 if (use)
2105                                         fputc('<', f);
2106                                 fputc(*c, f);
2107                                 continue;
2108                         }
2109                         n = *c - '0';
2110                         while (c[1] >= '0' && c[1] <= '9') {
2111                                 c += 1;
2112                                 n = n * 10 + *c - '0';
2113                         }
2114                         if (n == 0)
2115                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2116                                         p->head->struct_name.len,
2117                                         p->head->struct_name.txt,
2118                                         p->head->isref ? "*":"");
2119                         else if (n > p->body_size)
2120                                 fprintf(f, "$%d", n);
2121                         else if (p->body[n-1]->type == Terminal)
2122                                 fprintf(f, "(*(struct token *)body[%d])",
2123                                         n-1);
2124                         else if (p->body[n-1]->struct_name.txt == NULL)
2125                                 fprintf(f, "$%d", n);
2126                         else {
2127                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2128                                         p->body[n-1]->struct_name.len,
2129                                         p->body[n-1]->struct_name.txt,
2130                                         p->body[n-1]->isref ? "*":"", n-1);
2131                                 used[n-1] = use;
2132                         }
2133                 }
2134                 fputs("\n", f);
2135                 for (i = 0; i < p->body_size; i++) {
2136                         if (p->body[i]->struct_name.txt &&
2137                             used[i]) {
2138                                 // assume this has been copied out
2139                                 if (p->body[i]->isref)
2140                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2141                                 else
2142                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2143                         }
2144                 }
2145                 free(used);
2146         }
2147
2148 ###### functions
2149
2150         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2151                                struct code_node *code)
2152         {
2153                 int i;
2154                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2155                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2156                 fprintf(f, "{\n");
2157                 fprintf(f, "\tint ret_size = 0;\n");
2158                 if (code)
2159                         code_node_print(f, code, file);
2160
2161                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2162                 fprintf(f, "\tswitch(prod) {\n");
2163                 for (i = 0; i < g->production_count; i++) {
2164                         struct production *p = g->productions[i];
2165                         fprintf(f, "\tcase %d:\n", i);
2166
2167                         if (p->code.txt) {
2168                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2169                                 gen_code(p, f, g);
2170                         }
2171
2172                         if (p->head->struct_name.txt)
2173                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2174                                         p->head->struct_name.len,
2175                                         p->head->struct_name.txt,
2176                                         p->head->isref ? "*":"");
2177
2178                         fprintf(f, "\t\tbreak;\n");
2179                 }
2180                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2181         }
2182
2183 ### `do_free`
2184
2185 As each non-terminal can potentially cause a different type of data
2186 structure to be allocated and filled in, we need to be able to free it when
2187 done.
2188
2189 It is particularly important to have fine control over freeing during error
2190 recovery where individual stack frames might need to be freed.
2191
2192 For this, the grammar author is required to defined a `free_XX` function for
2193 each structure that is used by a non-terminal.  `do_free` will call whichever
2194 is appropriate given a symbol number, and will call `free` (as is
2195 appropriate for tokens) on any terminal symbol.
2196
2197 ###### functions
2198
2199         static void gen_free(FILE *f, struct grammar *g)
2200         {
2201                 int i;
2202
2203                 fprintf(f, "#line 0 \"gen_free\"\n");
2204                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2205                 fprintf(f, "{\n");
2206                 fprintf(f, "\tif (!asn) return;\n");
2207                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2208                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2209                 fprintf(f, "\tswitch(sym) {\n");
2210
2211                 for (i = 0; i < g->num_syms; i++) {
2212                         struct symbol *s = g->symtab[i];
2213                         if (!s ||
2214                             s->type != Nonterminal ||
2215                             s->struct_name.txt == NULL)
2216                                 continue;
2217
2218                         fprintf(f, "\tcase %d:\n", s->num);
2219                         if (s->isref) {
2220                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2221                                         s->struct_name.len,
2222                                         s->struct_name.txt);
2223                                 fprintf(f, "\t\tfree(asn);\n");
2224                         } else
2225                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2226                                         s->struct_name.len,
2227                                         s->struct_name.txt);
2228                         fprintf(f, "\t\tbreak;\n");
2229                 }
2230                 fprintf(f, "\t}\n}\n\n");
2231         }
2232
2233 ## The main routine.
2234
2235 There are three key parts to the "main" function of parsergen: processing
2236 the arguments, loading the grammar file, and dealing with the grammar.
2237
2238 ### Argument processing.
2239
2240 Command line options allow the selection of analysis mode, name of output
2241 file, and whether or not a report should be generated.  By default we create
2242 a report only if no code output was requested.
2243
2244 The `parse_XX` function name uses the basename of the output file which
2245 should not have a suffix (`.c`).  `.c` is added to the given name for the
2246 code, and `.h` is added for the header (if header text is specifed in the
2247 grammar file).
2248
2249 ###### includes
2250         #include <getopt.h>
2251
2252 ###### declarations
2253         static const struct option long_options[] = {
2254                 { "LR0",        0, NULL, '0' },
2255                 { "LR05",       0, NULL, '5' },
2256                 { "SLR",        0, NULL, 'S' },
2257                 { "LALR",       0, NULL, 'L' },
2258                 { "LR1",        0, NULL, '1' },
2259                 { "tag",        1, NULL, 't' },
2260                 { "report",     0, NULL, 'R' },
2261                 { "output",     1, NULL, 'o' },
2262                 { NULL,         0, NULL, 0   }
2263         };
2264         const char *options = "05SL1t:Ro:";
2265
2266 ###### process arguments
2267         int opt;
2268         char *outfile = NULL;
2269         char *infile;
2270         char *name;
2271         char *tag = NULL;
2272         int report = 1;
2273         enum grammar_type type = LR05;
2274         while ((opt = getopt_long(argc, argv, options,
2275                                   long_options, NULL)) != -1) {
2276                 switch(opt) {
2277                 case '0':
2278                         type = LR0; break;
2279                 case '5':
2280                         type = LR05; break;
2281                 case 'S':
2282                         type = SLR; break;
2283                 case 'L':
2284                         type = LALR; break;
2285                 case '1':
2286                         type = LR1; break;
2287                 case 'R':
2288                         report = 2; break;
2289                 case 'o':
2290                         outfile = optarg; break;
2291                 case 't':
2292                         tag = optarg; break;
2293                 default:
2294                         fprintf(stderr, "Usage: parsergen ...\n");
2295                         exit(1);
2296                 }
2297         }
2298         if (optind < argc)
2299                 infile = argv[optind++];
2300         else {
2301                 fprintf(stderr, "No input file given\n");
2302                 exit(1);
2303         }
2304         if (outfile && report == 1)
2305                 report = 0;
2306         name = outfile;
2307         if (name && strchr(name, '/'))
2308                 name = strrchr(name, '/')+1;
2309
2310         if (optind < argc) {
2311                 fprintf(stderr, "Excess command line arguments\n");
2312                 exit(1);
2313         }
2314
2315 ### Loading the grammar file
2316
2317 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2318 map it.
2319
2320 Once we have extracted the code (with `mdcode`) we expect to find three
2321 sections: header, code, and grammar.  Anything else that is not
2322 excluded by the `--tag` option is an error.
2323
2324 "header" and "code" are optional, though it is hard to build a working
2325 parser with neither. "grammar" must be provided.
2326
2327 ###### includes
2328         #include <fcntl.h>
2329         #include <sys/mman.h>
2330         #include <errno.h>
2331
2332 ###### functions
2333         static int errs;
2334         static void pr_err(char *msg)
2335         {
2336                 errs++;
2337                 fprintf(stderr, "%s\n", msg);
2338         }
2339
2340 ###### load file
2341         struct section *table;
2342         int fd;
2343         int len;
2344         char *file;
2345         fd = open(infile, O_RDONLY);
2346         if (fd < 0) {
2347                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2348                         infile, strerror(errno));
2349                 exit(1);
2350         }
2351         len = lseek(fd, 0, 2);
2352         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2353         table = code_extract(file, file+len, pr_err);
2354
2355         struct code_node *hdr = NULL;
2356         struct code_node *code = NULL;
2357         struct code_node *gram = NULL;
2358         struct code_node *pre_reduce = NULL;
2359         for (s = table; s; s = s->next) {
2360                 struct text sec = s->section;
2361                 if (tag && !strip_tag(&sec, tag))
2362                         continue;
2363                 if (text_is(sec, "header"))
2364                         hdr = s->code;
2365                 else if (text_is(sec, "code"))
2366                         code = s->code;
2367                 else if (text_is(sec, "grammar"))
2368                         gram = s->code;
2369                 else if (text_is(sec, "reduce"))
2370                         pre_reduce = s->code;
2371                 else {
2372                         fprintf(stderr, "Unknown content section: %.*s\n",
2373                                 s->section.len, s->section.txt);
2374                         rv |= 2;
2375                 }
2376         }
2377
2378 ### Processing the input
2379
2380 As we need to append an extention to a filename and then open it for
2381 writing, and we need to do this twice, it helps to have a separate function.
2382
2383 ###### functions
2384
2385         static FILE *open_ext(char *base, char *ext)
2386         {
2387                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2388                 FILE *f;
2389                 strcat(strcpy(fn, base), ext);
2390                 f = fopen(fn, "w");
2391                 free(fn);
2392                 return f;
2393         }
2394
2395 If we can read the grammar, then we analyse and optionally report on it.  If
2396 the report finds conflicts we will exit with an error status.
2397
2398 ###### process input
2399         struct grammar *g = NULL;
2400         if (gram == NULL) {
2401                 fprintf(stderr, "No grammar section provided\n");
2402                 rv |= 2;
2403         } else {
2404                 g = grammar_read(gram);
2405                 if (!g) {
2406                         fprintf(stderr, "Failure to parse grammar\n");
2407                         rv |= 2;
2408                 }
2409         }
2410         if (g) {
2411                 grammar_analyse(g, type);
2412                 if (report)
2413                         if (grammar_report(g, type))
2414                                 rv |= 1;
2415         }
2416
2417 If a "headers" section is defined, we write it out and include a declaration
2418 for the `parse_XX` function so it can be used from separate code.
2419
2420         if (rv == 0 && hdr && outfile) {
2421                 FILE *f = open_ext(outfile, ".h");
2422                 if (f) {
2423                         code_node_print(f, hdr, infile);
2424                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2425                                 name);
2426                         fclose(f);
2427                 } else {
2428                         fprintf(stderr, "Cannot create %s.h\n",
2429                                 outfile);
2430                         rv |= 4;
2431                 }
2432         }
2433
2434 And if all goes well and an output file was provided, we create the `.c`
2435 file with the code section (if any) and the parser tables and function.
2436
2437         if (rv == 0 && outfile) {
2438                 FILE *f = open_ext(outfile, ".c");
2439                 if (f) {
2440                         if (code)
2441                                 code_node_print(f, code, infile);
2442                         gen_parser(f, g, infile, name, pre_reduce);
2443                         fclose(f);
2444                 } else {
2445                         fprintf(stderr, "Cannot create %s.c\n",
2446                                 outfile);
2447                         rv |= 4;
2448                 }
2449         }
2450
2451 And that about wraps it up.  We need to set the locale so that UTF-8 is
2452 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2453
2454 ###### File: parsergen.mk
2455         parsergen : parsergen.o libscanner.o libmdcode.o
2456                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2457
2458 ###### includes
2459         #include <locale.h>
2460
2461 ###### main
2462
2463         int main(int argc, char *argv[])
2464         {
2465                 struct section *s;
2466                 int rv = 0;
2467
2468                 setlocale(LC_ALL,"");
2469
2470                 ## process arguments
2471                 ## load file
2472                 ## process input
2473
2474                 return rv;
2475         }
2476
2477 ## The SHIFT/REDUCE parser
2478
2479 Having analysed the grammar and generated all the tables, we only need the
2480 shift/reduce engine to bring it all together.
2481
2482 ### Goto table lookup
2483
2484 The parser generator has nicely provided us with goto tables sorted by
2485 symbol number.  We need a binary search function to find a symbol in the
2486 table.
2487
2488 ###### parser functions
2489
2490         static int search(const struct state *l, int sym)
2491         {
2492                 int lo = 0;
2493                 int hi = l->go_to_cnt;
2494
2495                 if (hi == 0)
2496                         return -1;
2497                 while (lo + 1 < hi) {
2498                         int mid = (lo + hi) / 2;
2499                         if (l->go_to[mid].sym <= sym)
2500                                 lo = mid;
2501                         else
2502                                 hi = mid;
2503                 }
2504                 if (l->go_to[lo].sym == sym)
2505                         return l->go_to[lo].state;
2506                 else
2507                         return -1;
2508         }
2509
2510 ### The state stack.
2511
2512 The core data structure for the parser is the stack.  This tracks all the
2513 symbols that have been recognised or partially recognised.
2514
2515 The stack usually won't grow very large - maybe a few tens of entries.  So
2516 we dynamically resize and array as required but never bother to shrink it
2517 down again.
2518
2519 We keep the stack as two separate allocations.  One, `asn_stack` stores the
2520 "abstract syntax nodes" which are created by each reduction.  When we call
2521 `do_reduce` we need to pass an array of the `asn`s of the body of the
2522 production, and by keeping a separate `asn` stack, we can just pass a
2523 pointer into this stack.
2524
2525 The other allocation stores all other stack fields of which there are six.
2526 The `state` is the most important one and guides the parsing process.  The
2527 `sym` is nearly unnecessary.  However when we want to free entries from the
2528 `asn_stack`, it helps to know what type they are so we can call the right
2529 freeing function.  The symbol leads us to the right free function through
2530 `do_free`.
2531
2532 The `indents` count tracks the line indents with-in the symbol or
2533 immediately follow it.  These are used to allow indent information to
2534 guide parsing and error recovery.
2535
2536 `since_newline` tracks how many stack frames since the last
2537 start-of-line (whether indented or not).  So if `since_newline` is
2538 zero, then this symbol is at the start of a line.  Similarly
2539 `since_indent` counts the number of states since an indent, it is zero
2540 precisely when `indents` is not zero.
2541
2542 `newline_permitted` keeps track of whether newlines should be ignored
2543 or not.
2544
2545 The stack is most properly seen as alternating states and symbols -
2546 states, like the 'DOT' in items, are between symbols.  Each frame in
2547 our stack holds a state and the symbol that was before it.  The
2548 bottom of stack holds the start state but no symbol, as nothing came
2549 before the beginning.
2550
2551 ###### parser functions
2552
2553         struct parser {
2554                 struct frame {
2555                         short state;
2556                         short newline_permitted;
2557
2558                         short sym;
2559                         short indents;
2560                         short since_newline;
2561                         short since_indent;
2562                 } *stack;
2563                 void **asn_stack;
2564                 int stack_size;
2565                 int tos;
2566         };
2567
2568 #### Shift and pop
2569
2570 Two operations are needed on the stack - shift (which is like push) and pop.
2571
2572 Shift applies not only to terminals but also to non-terminals.  When
2573 we reduce a production we will pop off entries corresponding to the
2574 body symbols, then push on an item for the head of the production.
2575 This last is exactly the same process as shifting in a terminal so we
2576 use the same function for both.  In both cases we provide the symbol,
2577 the number of indents the symbol contains (which will be zero for a
2578 terminal symbol) and a flag indicating the the symbol was at (or was
2579 reduced from a symbol which was at) the start of a line.  The state is
2580 deduced from the current top-of-stack state and the new symbol.
2581
2582 To simplify other code we arrange for `shift` to fail if there is no `goto`
2583 state for the symbol.  This is useful in basic parsing due to our design
2584 that we shift when we can, and reduce when we cannot.  So the `shift`
2585 function reports if it could.
2586
2587 `shift` is also used to push state zero onto the stack, so if the
2588 stack is empty, it always chooses zero as the next state.
2589
2590 So `shift` finds the next state.  If that succeeds it extends the
2591 allocations if needed and pushes all the information onto the stacks.
2592
2593 Newlines are permitted after a `starts_line` state until an internal
2594 indent.  If the new frame has neither a `starts_line` state nor an
2595 indent, newlines are permitted if the previous stack frame permitted
2596 them.
2597
2598 ###### parser functions
2599
2600         static int shift(struct parser *p,
2601                          short sym, short indents, short start_of_line,
2602                          void *asn,
2603                          const struct state states[])
2604         {
2605                 // Push an entry onto the stack
2606                 struct frame next = {0};
2607                 int newstate = p->tos
2608                         ? search(&states[p->stack[p->tos-1].state],
2609                                  sym)
2610                         : 0;
2611                 if (newstate < 0)
2612                         return 0;
2613                 if (p->tos >= p->stack_size) {
2614                         p->stack_size += 10;
2615                         p->stack = realloc(p->stack, p->stack_size
2616                                            * sizeof(p->stack[0]));
2617                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2618                                            * sizeof(p->asn_stack[0]));
2619                 }
2620                 next.sym = sym;
2621                 next.indents = indents;
2622                 next.state = newstate;
2623                 if (states[newstate].starts_line)
2624                         next.newline_permitted = 1;
2625                 else if (indents)
2626                         next.newline_permitted = 0;
2627                 else if (p->tos)
2628                         next.newline_permitted =
2629                                 p->stack[p->tos-1].newline_permitted;
2630                 else
2631                         next.newline_permitted = 0;
2632
2633                 if (!start_of_line) {
2634                         if (p->tos)
2635                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2636                         else
2637                                 next.since_newline = 1;
2638                 }
2639                 if (indents)
2640                         next.since_indent = 0;
2641                 else if (p->tos)
2642                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2643                 else
2644                         next.since_indent = 1;
2645
2646                 p->stack[p->tos] = next;
2647                 p->asn_stack[p->tos] = asn;
2648                 p->tos++;
2649                 return 1;
2650         }
2651
2652 `pop` primarily moves the top of stack (`tos`) back down the required
2653 amount and frees any `asn` entries that need to be freed.  It also
2654 collects a summary of the indents and line starts in the symbols that
2655 are being removed. It is called _after_ we reduce a production, just
2656 before we `shift` the nonterminal in.
2657
2658 ###### parser functions
2659
2660         static int pop(struct parser *p, int num,
2661                        short *start_of_line,
2662                        void(*do_free)(short sym, void *asn))
2663         {
2664                 int i;
2665                 short indents = 0;
2666                 int sol = 0;
2667
2668                 p->tos -= num;
2669                 for (i = 0; i < num; i++) {
2670                         sol |= !p->stack[p->tos+i].since_newline;
2671                         indents += p->stack[p->tos+i].indents;
2672                         do_free(p->stack[p->tos+i].sym,
2673                                 p->asn_stack[p->tos+i]);
2674                 }
2675                 if (start_of_line)
2676                         *start_of_line = sol;
2677                 return indents;
2678         }
2679
2680 ### Memory allocation
2681
2682 The `scanner` returns tokens in a local variable - we want them in allocated
2683 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2684 a reduce is in a large buffer.  Both of these require some allocation and
2685 copying, hence `memdup` and `tokcopy`.
2686
2687 ###### parser includes
2688         #include <memory.h>
2689
2690 ###### parser functions
2691
2692         void *memdup(void *m, int len)
2693         {
2694                 void *ret;
2695                 if (len == 0)
2696                         return NULL;
2697                 ret = malloc(len);
2698                 memcpy(ret, m, len);
2699                 return ret;
2700         }
2701
2702         static struct token *tok_copy(struct token tk)
2703         {
2704                 struct token *new = malloc(sizeof(*new));
2705                 *new = tk;
2706                 return new;
2707         }
2708
2709 ### The heart of the parser.
2710
2711 Now we have the parser.  If we can shift we do, though newlines and
2712 reducing indenting may block that.  If not and we can reduce we do
2713 that.  If the production we reduced was production zero, then we have
2714 accepted the input and can finish.
2715
2716 We return whatever `asn` was returned by reducing production zero.
2717
2718 If we can neither shift nor reduce we have an error to handle.  We pop
2719 single entries off the stack until we can shift the `TK_error` symbol, then
2720 drop input tokens until we find one we can shift into the new error state.
2721
2722 When we find `TK_in` and `TK_out` tokens which report indents we need
2723 to handle them directly as the grammar cannot express what we want to
2724 do with them.
2725
2726 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2727 record how many indents there are following the previous token.
2728
2729 `TK_out` tokens must be canceled against an indent count
2730 within the stack.  If we can reduce some symbols that are all since
2731 the most recent indent, then we do that first.  If the minimum prefix
2732 of the current state then extends back before the most recent indent,
2733 that indent can be cancelled.  If the minimum prefix is shorter then
2734 the indent had ended prematurely and we must start error handling, which
2735 is still a work-in-progress.
2736
2737 `TK_newline` tokens are ignored unless the top stack frame records
2738 that they are permitted.  In that case they will not be considered for
2739 shifting if it is possible to reduce some symbols that are all since
2740 the most recent start of line.  This is how a newline forcibly
2741 terminates any line-like structure - we try to reduce down to at most
2742 one symbol for each line where newlines are allowed.
2743 A consequence of this is that a rule like
2744
2745 ###### Example: newlines - broken
2746
2747         Newlines ->
2748                 | NEWLINE Newlines
2749         IfStatement -> Newlines if ....
2750
2751 cannot work, as the NEWLINE will never be shifted as the empty string
2752 will be reduced first.  Optional sets of newlines need to be include
2753 in the thing that preceed:
2754
2755 ###### Example: newlines - works
2756
2757         If -> if
2758                 | NEWLINE If
2759         IfStatement -> If ....
2760
2761 Here the NEWLINE will be shifted because nothing can be reduced until
2762 the `if` is seen.
2763
2764 When, during error handling, we discard token read in, we want to keep
2765 discarding until we see one that is recognised.  If we had a full set
2766 of LR(1) grammar states, this will mean looking in the look-ahead set,
2767 but we don't keep a full look-ahead set.  We only record the subset
2768 that leads to SHIFT.  We can, however, deduce the look-ahead set but
2769 looking at the SHIFT subsets for all states that we can get to by
2770 reducing zero or more times.  So we need a little function which
2771 checks if a given token is in any of these look-ahead sets.
2772
2773 ###### parser includes
2774         #include "parser.h"
2775
2776 ###### parser_run
2777
2778         static int in_lookahead(struct token *tk, const struct state *states, int state)
2779         {
2780                 while (state >= 0) {
2781                         if (search(&states[state], tk->num) >= 0)
2782                                 return 1;
2783                         if (states[state].reduce_prod < 0)
2784                                 return 0;
2785                         state = search(&states[state], states[state].reduce_sym);
2786                 }
2787                 return 0;
2788         }
2789
2790         void *parser_run(struct token_state *tokens,
2791                          const struct state states[],
2792                          int (*do_reduce)(int, void**, struct token_config*, void*),
2793                          void (*do_free)(short, void*),
2794                          FILE *trace, const char *non_term[],
2795                          struct token_config *config)
2796         {
2797                 struct parser p = { 0 };
2798                 struct token *tk = NULL;
2799                 int accepted = 0;
2800                 void *ret = NULL;
2801
2802                 shift(&p, TK_eof, 0, 1, NULL, states);
2803                 while (!accepted) {
2804                         struct token *err_tk;
2805                         struct frame *tos = &p.stack[p.tos-1];
2806                         if (!tk)
2807                                 tk = tok_copy(token_next(tokens));
2808                         parser_trace(trace, &p,
2809                                      tk, states, non_term, config->known_count);
2810
2811                         if (tk->num == TK_in) {
2812                                 tos->indents += 1;
2813                                 tos->since_newline = 0;
2814                                 tos->since_indent = 0;
2815                                 if (!states[tos->state].starts_line)
2816                                         tos->newline_permitted = 0;
2817                                 free(tk);
2818                                 tk = NULL;
2819                                 parser_trace_action(trace, "Record");
2820                                 continue;
2821                         }
2822                         if (tk->num == TK_out) {
2823                                 if (states[tos->state].reduce_size >= 0 &&
2824                                     states[tos->state].reduce_size <= tos->since_indent)
2825                                         goto force_reduce;
2826                                 if (states[tos->state].min_prefix >= tos->since_indent) {
2827                                         // OK to cancel
2828                                         struct frame *in = tos - tos->since_indent;
2829                                         in->indents -= 1;
2830                                         if (in->indents == 0) {
2831                                                 /* Reassess since_indent and newline_permitted */
2832                                                 if (in > p.stack) {
2833                                                         in->since_indent = in[-1].since_indent + 1;
2834                                                         in->newline_permitted = in[-1].newline_permitted;
2835                                                 } else {
2836                                                         in->since_indent = 0;
2837                                                         in->newline_permitted = 0;
2838                                                 }
2839                                                 if (states[in->state].starts_line)
2840                                                         in->newline_permitted = 1;
2841                                                 while (in < tos) {
2842                                                         in += 1;
2843                                                         in->since_indent = in[-1].since_indent + 1;
2844                                                         if (states[in->state].starts_line)
2845                                                                 in->newline_permitted = 1;
2846                                                         else
2847                                                                 in->newline_permitted = in[-1].newline_permitted;
2848                                                 }
2849                                         }
2850                                         free(tk);
2851                                         tk = NULL;
2852                                         parser_trace_action(trace, "Cancel");
2853                                         continue;
2854                                 }
2855                                 // fall through to error handling as both SHIFT and REDUCE
2856                                 // will fail.
2857                         }
2858                         if (tk->num == TK_newline) {
2859                                 if (!tos->newline_permitted) {
2860                                         free(tk);
2861                                         tk = NULL;
2862                                         parser_trace_action(trace, "Discard");
2863                                         continue;
2864                                 }
2865                                 if (tos->since_newline > 1 &&
2866                                     states[tos->state].reduce_size >= 0 &&
2867                                     states[tos->state].reduce_size <= tos->since_newline)
2868                                         goto force_reduce;
2869                         }
2870                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2871                                 tk = NULL;
2872                                 parser_trace_action(trace, "Shift");
2873                                 continue;
2874                         }
2875                 force_reduce:
2876                         if (states[tos->state].reduce_prod >= 0 &&
2877                             states[tos->state].newline_only &&
2878                             !(tk->num == TK_newline ||
2879                               tk->num == TK_eof ||
2880                               tk->num == TK_out ||
2881                               (tos->indents == 0 && tos->since_newline == 0))) {
2882                                 /* Anything other than newline or out or eof
2883                                  * in an error unless we are already at start
2884                                  * of line, as this production must end at EOL.
2885                                  */
2886                         } else if (states[tos->state].reduce_prod >= 0) {
2887                                 void **body;
2888                                 void *res;
2889                                 const struct state *nextstate = &states[tos->state];
2890                                 int prod = nextstate->reduce_prod;
2891                                 int size = nextstate->reduce_size;
2892                                 int bufsize;
2893                                 static char buf[16*1024];
2894                                 short indents, start_of_line;
2895
2896                                 body = p.asn_stack + (p.tos - size);
2897
2898                                 bufsize = do_reduce(prod, body, config, buf);
2899
2900                                 indents = pop(&p, size, &start_of_line,
2901                                               do_free);
2902                                 res = memdup(buf, bufsize);
2903                                 memset(buf, 0, bufsize);
2904                                 if (!shift(&p, nextstate->reduce_sym,
2905                                            indents, start_of_line,
2906                                            res, states)) {
2907                                         if (prod != 0) abort();
2908                                         accepted = 1;
2909                                         ret = res;
2910                                 }
2911                                 parser_trace_action(trace, "Reduce");
2912                                 continue;
2913                         }
2914                         /* Error. We walk up the stack until we
2915                          * find a state which will accept TK_error.
2916                          * We then shift in TK_error and see what state
2917                          * that takes us too.
2918                          * Then we discard input tokens until
2919                          * we find one that is acceptable.
2920                          */
2921                         parser_trace_action(trace, "ERROR");
2922                         short indents = 0, start_of_line;
2923
2924                         err_tk = tok_copy(*tk);
2925                         while (p.tos > 0 &&
2926                                shift(&p, TK_error, 0, 0,
2927                                      err_tk, states) == 0)
2928                                 // discard this state
2929                                 indents += pop(&p, 1, &start_of_line, do_free);
2930                         if (p.tos == 0) {
2931                                 free(err_tk);
2932                                 // no state accepted TK_error
2933                                 break;
2934                         }
2935                         tos = &p.stack[p.tos-1];
2936                         while (!in_lookahead(tk, states, tos->state) &&
2937                                tk->num != TK_eof) {
2938                                 free(tk);
2939                                 tk = tok_copy(token_next(tokens));
2940                                 if (tk->num == TK_in)
2941                                         indents += 1;
2942                                 if (tk->num == TK_out) {
2943                                         if (indents == 0)
2944                                                 break;
2945                                         indents -= 1;
2946                                         // FIXME update since_indent here
2947                                 }
2948                         }
2949                         tos->indents += indents;
2950                 }
2951                 free(tk);
2952                 pop(&p, p.tos, NULL, do_free);
2953                 free(p.asn_stack);
2954                 free(p.stack);
2955                 return ret;
2956         }
2957
2958 ###### exported functions
2959         void *parser_run(struct token_state *tokens,
2960                          const struct state states[],
2961                          int (*do_reduce)(int, void**, struct token_config*, void*),
2962                          void (*do_free)(short, void*),
2963                          FILE *trace, const char *non_term[],
2964                          struct token_config *config);
2965
2966 ### Tracing
2967
2968 Being able to visualize the parser in action can be invaluable when
2969 debugging the parser code, or trying to understand how the parse of a
2970 particular grammar progresses.  The stack contains all the important
2971 state, so just printing out the stack every time around the parse loop
2972 can make it possible to see exactly what is happening.
2973
2974 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2975 are easily deduced from the change between consecutive lines, and the
2976 details of each state can be found by cross referencing the states list
2977 in the stack with the "report" that parsergen can generate.
2978
2979 For terminal symbols, we just dump the token.  For non-terminals we
2980 print the name of the symbol.  The look ahead token is reported at the
2981 end inside square brackets.
2982
2983 ###### parser functions
2984
2985         static char *reserved_words[] = {
2986                 [TK_error]        = "ERROR",
2987                 [TK_in]           = "IN",
2988                 [TK_out]          = "OUT",
2989                 [TK_newline]      = "NEWLINE",
2990                 [TK_eof]          = "$eof",
2991         };
2992         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2993         {
2994                 fprintf(trace, "(%d", f->state);
2995                 if (states[f->state].starts_line)
2996                         fprintf(trace, "s");
2997                 if (f->newline_permitted)
2998                         fprintf(trace, "n%d", f->since_newline);
2999                 fprintf(trace, ") ");
3000         }
3001
3002         void parser_trace(FILE *trace, struct parser *p,
3003                           struct token *tk, const struct state states[],
3004                           const char *non_term[], int knowns)
3005         {
3006                 int i;
3007                 if (!trace)
3008                         return;
3009                 for (i = 0; i < p->tos; i++) {
3010                         struct frame *f = &p->stack[i];
3011                         if (i) {
3012                                 int sym = f->sym;
3013                                 if (sym < TK_reserved &&
3014                                     reserved_words[sym] != NULL)
3015                                         fputs(reserved_words[sym], trace);
3016                                 else if (sym < TK_reserved + knowns) {
3017                                         struct token *t = p->asn_stack[i];
3018                                         text_dump(trace, t->txt, 20);
3019                                 } else
3020                                         fputs(non_term[sym - TK_reserved - knowns],
3021                                               trace);
3022                                 if (f->indents)
3023                                         fprintf(trace, ".%d", f->indents);
3024                                 if (f->since_newline == 0)
3025                                         fputs("/", trace);
3026                                 fputs(" ", trace);
3027                         }
3028                         parser_trace_state(trace, f, states);
3029                 }
3030                 fprintf(trace, "[");
3031                 if (tk->num < TK_reserved &&
3032                     reserved_words[tk->num] != NULL)
3033                         fputs(reserved_words[tk->num], trace);
3034                 else
3035                         text_dump(trace, tk->txt, 20);
3036                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3037         }
3038
3039         void parser_trace_action(FILE *trace, char *action)
3040         {
3041                 if (trace)
3042                         fprintf(trace, " - %s\n", action);
3043         }
3044
3045 # A Worked Example
3046
3047 The obvious example for a parser is a calculator.
3048
3049 As `scanner` provides number parsing function using `libgmp` is it not much
3050 work to perform arbitrary rational number calculations.
3051
3052 This calculator takes one expression, or an equality test, per line.  The
3053 results are printed and if any equality test fails, the program exits with
3054 an error.
3055
3056 ###### File: parsergen.mk
3057         calc.c calc.h : parsergen parsergen.mdc
3058                 ./parsergen --tag calc -o calc parsergen.mdc
3059         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3060                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3061         calctest : calc
3062                 ./calc parsergen.mdc
3063         demos :: calctest
3064
3065 # calc: header
3066
3067         #include "parse_number.h"
3068         // what do we use for a demo-grammar?  A calculator of course.
3069         struct number {
3070                 mpq_t val;
3071                 char tail[2];
3072                 int err;
3073         };
3074
3075 # calc: code
3076
3077         #include <stdlib.h>
3078         #include <unistd.h>
3079         #include <fcntl.h>
3080         #include <sys/mman.h>
3081         #include <stdio.h>
3082         #include <malloc.h>
3083         #include <gmp.h>
3084         #include <string.h>
3085         #include "mdcode.h"
3086         #include "scanner.h"
3087         #include "parser.h"
3088
3089         #include "calc.h"
3090
3091         static void free_number(struct number *n)
3092         {
3093                 mpq_clear(n->val);
3094                 free(n);
3095         }
3096
3097         static int text_is(struct text t, char *s)
3098         {
3099                 return (strlen(s) == t.len &&
3100                         strncmp(s, t.txt, t.len) == 0);
3101         }
3102
3103         int main(int argc, char *argv[])
3104         {
3105                 int fd = open(argv[1], O_RDONLY);
3106                 int len = lseek(fd, 0, 2);
3107                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3108                 struct section *table = code_extract(file, file+len, NULL);
3109                 struct section *s;
3110                 struct token_config config = {
3111                         .ignored = (1 << TK_line_comment)
3112                                  | (1 << TK_in)
3113                                  | (1 << TK_out),
3114                         .number_chars = ".,_+-",
3115                         .word_start = "",
3116                         .word_cont = "",
3117                 };
3118                 for (s = table; s; s = s->next)
3119                         if (text_is(s->section, "example: input"))
3120                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3121                 while (table) {
3122                         struct section *t = table->next;
3123                         code_free(table->code);
3124                         free(table);
3125                         table = t;
3126                 }
3127                 exit(0);
3128         }
3129
3130 # calc: grammar
3131
3132         $LEFT + -
3133         $LEFT * / //
3134
3135         Session -> Session Line
3136                 | Line
3137
3138         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3139                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3140                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3141                                         mpf_clear(fl);
3142                                         }
3143                                      }$
3144                 | Expression = Expression NEWLINE ${
3145                         if (mpq_equal($1.val, $3.val))
3146                                 gmp_printf("Both equal %Qd\n", $1.val);
3147                         else {
3148                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3149                                         $1.val, $3.val);
3150                                 exit(1);
3151                         }
3152                 }$
3153                 | NEWLINE ${ printf("Blank line\n"); }$
3154                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3155
3156         $number
3157         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3158                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3159                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3160                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3161                 | Expression // Expression ${ {
3162                         mpz_t z0, z1, z2;
3163                         mpq_init($0.val);
3164                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3165                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3166                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3167                         mpz_tdiv_q(z0, z1, z2);
3168                         mpq_set_z($0.val, z0);
3169                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3170                 } }$
3171                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3172                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3173
3174 # example: input
3175
3176         355/113
3177         3.1415926535 - 355/113
3178         2 + 4 * 5
3179         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3180         10 * 9 / 2
3181         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3182
3183         355//113
3184
3185         error