ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 There are several distinct sections.
   8
   9 1. `grammar_read` will read a grammar from a literate-code file and
  10    store the productions, symbols, and code fragments.
  11 2. `grammar_analyse` will take that grammar and build LR parsing
  12    states and look-ahead tables.
  13 3. `grammar_report` will present the details of the analysis
  14    in a readable format and will report any conflicts.
  15 4. `parser_generate` will write out C code files with various
  16    tables and with the code fragments arranged in useful places.
  17 5. `parser_run` is a library function intended to be linked together
  18    with the generated parser tables to complete the implementation of
  19    a parser.
  20 6. Finally `calc` is a test grammar for a simple calculator.  The
  21    `parsergen` program built from the C code in this file can extract
  22    that grammar directly from this file and process it.
  23
  24 ###### File: parsergen.c
  25         #include <unistd.h>
  26         #include <stdlib.h>
  27         #include <stdio.h>
  28         ## includes
  29         ## forward declarations
  30         ## declarations
  31         ## functions
  32         ## grammar_read
  33         ## grammar_analyse
  34         ## grammar_report
  35         ## parser_generate
  36         ## main
  37 ###### File: parser.h
  38         ## exported types
  39         ## exported functions
  40 ###### File: libparser.c
  41         #include <unistd.h>
  42         #include <stdlib.h>
  43         #include <stdio.h>
  44         ## parser includes
  45         ## parser functions
  46         ## parser_run
  47 ###### File: parsergen.mk
  48         CFLAGS += -Wall -g
  49         all :: parsergen calc
  50         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  51                 ./md2c parsergen.mdc
  52
  53 ## Reading the grammar
  54
  55 The grammar must be stored in a markdown literate code file as parsed
  56 by "[mdcode][]".  It must have three top level (i.e. unreferenced)
  57 sections: `header`, `code`, and `grammar`.  The first two will be
  58 literally copied into the generated `.c` and `.h`. files.  The last
  59 contains the grammar.  This is tokenised with "[scanner][]".
  60
  61 If the `--tag` option is given, then any top level header that doesn't
  62 start with the tag is ignored, and the tag is striped from the rest.  So
  63 `--tag Foo`
  64 means that the three needed sections must be `Foo: header`, `Foo: code`,
  65 and `Foo: grammar`.  The tag `calc` is used to extract the same calculator
  66 from this file.
  67
  68 [mdcode]: mdcode.html
  69 [scanner]: scanner.html
  70
  71 ###### includes
  72         #include "mdcode.h"
  73         #include "scanner.h"
  74
  75 ###### parser includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 The grammar contains production sets, precedence/associativity
  80 declarations, and data type declarations.  These are all parsed with
  81 _ad hoc_ parsing as we don't have a parser generator yet.
  82
  83 The precedence and associativity can be set for each production, but
  84 can be inherited from symbols.  The data type (either structure or a
  85 reference to a structure) is potentially defined for each non-terminal
  86 and describes what C structure is used to store information about each
  87 symbol.
  88
  89 ###### declarations
  90         enum assoc {Left, Right, Non};
  91         char *assoc_names[] = {"Left","Right","Non"};
  92
  93         struct symbol {
  94                 struct text struct_name;
  95                 int isref;
  96                 enum assoc assoc;
  97                 unsigned short precedence;
  98                 ## symbol fields
  99         };
 100         struct production {
 101                 unsigned short precedence;
 102                 enum assoc assoc;
 103                 char line_like;
 104                 ## production fields
 105         };
 106         struct grammar {
 107                 ## grammar fields
 108         };
 109
 110 The strings reported by `mdcode` and `scanner` are `struct text` which have
 111 length rather than being null terminated.  To help with printing and
 112 comparing we define `text_is` and `prtxt`, which should possibly go in
 113 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 114 which might contain control characters.
 115
 116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 117 because that is what we need to detect tags.
 118
 119 ###### functions
 120         static int text_is(struct text t, char *s)
 121         {
 122                 return (strlen(s) == t.len &&
 123                         strncmp(s, t.txt, t.len) == 0);
 124         }
 125         static void prtxt(struct text t)
 126         {
 127                 printf("%.*s", t.len, t.txt);
 128         }
 129
 130         static int strip_tag(struct text *t, char *tag)
 131         {
 132                 int skip = strlen(tag) + 1;
 133                 if (skip >= t->len ||
 134                     strncmp(t->txt, tag, skip-1) != 0 ||
 135                     t->txt[skip-1] != ':')
 136                         return 0;
 137                 while (skip < t->len && t->txt[skip] == ' ')
 138                         skip++;
 139                 t->len -= skip;
 140                 t->txt += skip;
 141                 return 1;
 142         }
 143
 144 ### Symbols
 145
 146 Productions are comprised primarily of symbols - terminal and
 147 non-terminal.  We do not make a syntactic distinction between the two,
 148 though non-terminals must be identifiers.  Non-terminal symbols are
 149 those which appear in the head of a production, terminal symbols are
 150 those which don't.  There are also "virtual" symbols used for precedence
 151 marking discussed later, and sometimes we won't know what type a symbol
 152 is yet.
 153
 154 To help with code safety it is possible to declare the terminal symbols.
 155 If this is done, then any symbol used in a production that does not
 156 appear in a head and is not declared is treated as an error.
 157
 158 ###### forward declarations
 159         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 160         char *symtypes = "UVTN";
 161 ###### symbol fields
 162         enum symtype type;
 163 ###### grammar fields
 164         int terminals_declared;
 165
 166 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 167 table of known symbols and the resulting parser will report them as
 168 `TK_reserved + N`.  A small set of identifiers are reserved for the
 169 different token types that `scanner` can report.
 170
 171 ###### declarations
 172
 173         static struct { int num; char *name; } reserved_words[] = {
 174                 { TK_error,        "ERROR" },
 175                 { TK_number,       "NUMBER" },
 176                 { TK_ident,        "IDENTIFIER" },
 177                 { TK_mark,         "MARK" },
 178                 { TK_string,       "STRING" },
 179                 { TK_multi_string, "MULTI_STRING" },
 180                 { TK_in,           "IN" },
 181                 { TK_out,          "OUT" },
 182                 { TK_newline,      "NEWLINE" },
 183                 { TK_eof,          "$eof" },
 184         };
 185 ###### symbol fields
 186         short num;
 187
 188 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 189 recognised.  The former is automatically expected at the end of the text
 190 being parsed. The latter are always ignored by the parser.
 191
 192 All of these symbols are stored in a simple symbol table.  We use the
 193 `struct text` from `mdcode` to store the name and link them together
 194 into a sorted list using an insertion sort.
 195
 196 We don't have separate `find` and `insert` functions as any symbol we
 197 find needs to be remembered.  We simply expect `find` to always return a
 198 symbol, but its type might be `Unknown`.
 199
 200 ###### includes
 201         #include <string.h>
 202
 203 ###### symbol fields
 204         struct text name;
 205         struct symbol *next;
 206
 207 ###### grammar fields
 208         struct symbol *syms;
 209         int num_syms;
 210
 211 ###### functions
 212         static struct symbol *sym_find(struct grammar *g, struct text s)
 213         {
 214                 struct symbol **l = &g->syms;
 215                 struct symbol *n;
 216                 int cmp = 1;
 217
 218                 while (*l &&
 219                         (cmp = text_cmp((*l)->name, s)) < 0)
 220                                 l = & (*l)->next;
 221                 if (cmp == 0)
 222                         return *l;
 223                 n = calloc(1, sizeof(*n));
 224                 n->name = s;
 225                 n->type = Unknown;
 226                 n->next = *l;
 227                 n->num = -1;
 228                 *l = n;
 229                 return n;
 230         }
 231
 232         static void symbols_init(struct grammar *g)
 233         {
 234                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 235                 int i;
 236                 for (i = 0; i < entries; i++) {
 237                         struct text t;
 238                         struct symbol *s;
 239                         t.txt = reserved_words[i].name;
 240                         t.len = strlen(t.txt);
 241                         s = sym_find(g, t);
 242                         s->type = Terminal;
 243                         s->num = reserved_words[i].num;
 244                 }
 245         }
 246
 247 ### Data types and precedence.
 248
 249 Data type specification, precedence specification, and declaration of
 250 terminals are all introduced by a dollar sign at the start of the line.
 251 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 252 precedence, if it is `TERM` the the line declares terminals without
 253 precedence, otherwise it specifies a data type.
 254
 255 The data type name is simply stored and applied to the head of all
 256 subsequent productions.  It must be the name of a structure optionally
 257 preceded by an asterisk which means a reference or "pointer".  So
 258 `$expression` maps to `struct expression` and `$*statement` maps to
 259 `struct statement *`.
 260
 261 Any productions given before the first data type declaration will have
 262 no data type associated with them and can carry no information.  In
 263 order to allow other non-terminals to have no type, the data type
 264 `$void` can be given.  This does *not* mean that `struct void` will be
 265 used, but rather than no type will be associated with future
 266 non-terminals.
 267
 268 The precedence line must contain a list of symbols - typically
 269 terminal symbols, but not necessarily.  It can only contain symbols
 270 that have not been seen yet, so precedence declaration must precede
 271 the productions that they affect.
 272
 273 A precedence line may also contain a "virtual" symbol which is an
 274 identifier preceded by `$$`.  Virtual terminals carry precedence
 275 information but are not included in the grammar.  A production can
 276 declare that it inherits the precedence of a given virtual symbol.
 277
 278 This use for `$$` precludes it from being used as a symbol in the
 279 described language.  Two other symbols: `${` and `}$` are also
 280 unavailable.
 281
 282 Each new precedence line introduces a new precedence level and
 283 declares how it associates.  This level is stored in each symbol
 284 listed and may be inherited by any production which uses the symbol.  A
 285 production inherits from the last symbol which has a precedence.
 286
 287 The symbols on the first precedence line have the lowest precedence.
 288 Subsequent lines introduce symbols with higher precedence.
 289
 290 ###### grammar fields
 291         struct text current_type;
 292         int type_isref;
 293         int prec_levels;
 294
 295 ###### declarations
 296         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 297         static const char *known[] = { "$$", "${", "}$" };
 298
 299 ###### functions
 300         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 301         {
 302                 struct token t = token_next(ts);
 303                 char *err;
 304                 enum assoc assoc;
 305                 int term = 0;
 306                 int found;
 307
 308                 if (t.num != TK_ident) {
 309                         err = "type or assoc expected after '$'";
 310                         goto abort;
 311                 }
 312                 if (text_is(t.txt, "LEFT"))
 313                         assoc = Left;
 314                 else if (text_is(t.txt, "RIGHT"))
 315                         assoc = Right;
 316                 else if (text_is(t.txt, "NON"))
 317                         assoc = Non;
 318                 else if (text_is(t.txt, "TERM")) {
 319                         term = 1;
 320                         g->terminals_declared = 1;
 321                 } else {
 322                         g->current_type = t.txt;
 323                         g->type_isref = isref;
 324                         if (text_is(t.txt, "void"))
 325                                 g->current_type.txt = NULL;
 326                         t = token_next(ts);
 327                         if (t.num != TK_newline) {
 328                                 err = "Extra tokens after type name";
 329                                 goto abort;
 330                         }
 331                         return NULL;
 332                 }
 333
 334                 if (isref) {
 335                         err = "$* cannot be followed by a precedence";
 336                         goto abort;
 337                 }
 338
 339                 // This is a precedence or TERM line, need some symbols.
 340                 found = 0;
 341                 g->prec_levels += 1;
 342                 t = token_next(ts);
 343                 while (t.num != TK_newline) {
 344                         enum symtype type = Terminal;
 345                         struct symbol *s;
 346                         if (t.num == TK_virtual) {
 347                                 type = Virtual;
 348                                 t = token_next(ts);
 349                                 if (t.num != TK_ident) {
 350                                         err = "$$ must be followed by a word";
 351                                         goto abort;
 352                                 }
 353                                 if (term) {
 354                                         err = "Virtual symbols not permitted on $TERM line";
 355                                         goto abort;
 356                                 }
 357                         } else if (t.num != TK_ident &&
 358                                    t.num != TK_mark) {
 359                                 err = "Illegal token in precedence line";
 360                                 goto abort;
 361                         }
 362                         s = sym_find(g, t.txt);
 363                         if (s->type != Unknown) {
 364                                 err = "Symbols in precedence/TERM line must not already be known.";
 365                                 goto abort;
 366                         }
 367                         s->type = type;
 368                         if (!term) {
 369                                 s->precedence = g->prec_levels;
 370                                 s->assoc = assoc;
 371                         }
 372                         found += 1;
 373                         t = token_next(ts);
 374                 }
 375                 if (found == 0)
 376                         err = "No symbols given on precedence/TERM line";
 377                         goto abort;
 378                 return NULL;
 379         abort:
 380                 while (t.num != TK_newline && t.num != TK_eof)
 381                         t = token_next(ts);
 382                 return err;
 383         }
 384
 385 ### Productions
 386
 387 A production either starts with an identifier which is the head
 388 non-terminal, or a vertical bar (`|`) in which case this production
 389 uses the same head as the previous one.  The identifier must be
 390 followed by a `->` mark.  All productions for a given non-terminal must
 391 be grouped together with the `nonterminal ->` given only once.
 392
 393 After this a (possibly empty) sequence of identifiers and marks form
 394 the body of the production.  A virtual symbol may be given after the
 395 body by preceding it with `$$`.  If a virtual symbol is given, the
 396 precedence of the production is that for the virtual symbol.  If none
 397 is given, the precedence is inherited from the last symbol in the
 398 production which has a precedence specified.
 399
 400 After the optional precedence may come the `${` mark.  This indicates
 401 the start of a code fragment.  If present, this must be on the same
 402 line as the start of the production.
 403
 404 All of the text from the `${` through to the matching `}$` is
 405 collected and forms the code-fragment for the production.  It must all
 406 be in one `code_node` of the literate code.  The `}$` must be
 407 at the end of a line.
 408
 409 Text in the code fragment will undergo substitutions where `$N` or
 410 `$<N`,for some numeric `N` (or non-numeric indicator as described
 411 later), will be replaced with a variable holding the parse information
 412 for the particular symbol in the production.  `$0` is the head of the
 413 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 414 for a terminal symbol is `struct token`.  For a non-terminal, it is
 415 whatever has been declared for that symbol.  The `<` may be included and
 416 means that the value (usually a reference) is being moved out, so it
 417 will not automatically be freed.  The effect of using '<' is that the
 418 variable is cleareed to all-zeros.
 419
 420 While building productions we will need to add to an array which needs to
 421 grow dynamically.
 422
 423 ###### functions
 424         static void array_add(void *varray, int *cnt, void *new)
 425         {
 426                 void ***array = varray;
 427                 int current = 0;
 428                 const int step = 8;
 429                 current = ((*cnt-1) | (step-1))+1;
 430                 if (*cnt == current) {
 431                         /* must grow */
 432                         current += step;
 433                         *array = realloc(*array, current * sizeof(void*));
 434                 }
 435                 (*array)[*cnt] = new;
 436                 (*cnt) += 1;
 437         }
 438
 439 Collecting the code fragment simply involves reading tokens until we
 440 hit the end token `}$`, and noting the character position of the start and
 441 the end.
 442
 443 ###### functions
 444         static struct text collect_code(struct token_state *state,
 445                                         struct token start)
 446         {
 447                 struct text code;
 448                 struct token t;
 449                 code.txt = start.txt.txt + start.txt.len;
 450                 do
 451                         t = token_next(state);
 452                 while (t.node == start.node &&
 453                        t.num != TK_close && t.num != TK_error &&
 454                        t.num != TK_eof);
 455                 if (t.num == TK_close && t.node == start.node)
 456                         code.len = t.txt.txt - code.txt;
 457                 else
 458                         code.txt = NULL;
 459                 return code;
 460         }
 461
 462 Now we have all the bits we need to parse a full production.
 463
 464 ###### includes
 465         #include <memory.h>
 466
 467 ###### grammar fields
 468         struct production **productions;
 469         int production_count;
 470
 471 ###### production fields
 472         struct symbol  *head;
 473         struct symbol **body;
 474         int             body_size;
 475         struct text     code;
 476         int             code_line;
 477
 478 ###### symbol fields
 479         int first_production;
 480
 481 ###### functions
 482         static char *parse_production(struct grammar *g,
 483                                       struct symbol *head,
 484                                       struct token_state *state)
 485         {
 486                 /* Head has already been parsed. */
 487                 struct token tk;
 488                 char *err;
 489                 struct production p, *pp;
 490
 491                 memset(&p, 0, sizeof(p));
 492                 p.head = head;
 493                 tk = token_next(state);
 494                 while (tk.num == TK_ident || tk.num == TK_mark) {
 495                         struct symbol *bs = sym_find(g, tk.txt);
 496                         if (bs->type == Unknown) {
 497                                 if (!g->terminals_declared)
 498                                         bs->type = Terminal;
 499                         }
 500                         if (bs->type == Virtual) {
 501                                 err = "Virtual symbol not permitted in production";
 502                                 goto abort;
 503                         }
 504                         if (bs->precedence) {
 505                                 p.precedence = bs->precedence;
 506                                 p.assoc = bs->assoc;
 507                         }
 508                         array_add(&p.body, &p.body_size, bs);
 509                         tk = token_next(state);
 510                 }
 511                 if (tk.num == TK_virtual) {
 512                         struct symbol *vs;
 513                         tk = token_next(state);
 514                         if (tk.num != TK_ident) {
 515                                 err = "word required after $$";
 516                                 goto abort;
 517                         }
 518                         vs = sym_find(g, tk.txt);
 519                         if (vs->num == TK_newline)
 520                                 p.line_like = 1;
 521                         else if (vs->num == TK_out)
 522                                 p.line_like = 2;
 523                         else if (vs->precedence == 0) {
 524                                 err = "symbol after $$ must have precedence";
 525                                 goto abort;
 526                         } else {
 527                                 p.precedence = vs->precedence;
 528                                 p.assoc = vs->assoc;
 529                         }
 530                         tk = token_next(state);
 531                 }
 532                 if (tk.num == TK_open) {
 533                         p.code_line = tk.line;
 534                         p.code = collect_code(state, tk);
 535                         if (p.code.txt == NULL) {
 536                                 err = "code fragment not closed properly";
 537                                 goto abort;
 538                         }
 539                         tk = token_next(state);
 540                 }
 541
 542                 if (tk.num != TK_newline && tk.num != TK_eof) {
 543                         err = "stray tokens at end of line";
 544                         goto abort;
 545                 }
 546                 pp = malloc(sizeof(*pp));
 547                 *pp = p;
 548                 array_add(&g->productions, &g->production_count, pp);
 549                 return NULL;
 550         abort:
 551                 while (tk.num != TK_newline && tk.num != TK_eof)
 552                         tk = token_next(state);
 553                 return err;
 554         }
 555
 556 With the ability to parse production and dollar-lines, we have nearly all
 557 that we need to parse a grammar from a `code_node`.
 558
 559 The head of the first production will effectively be the `start` symbol of
 560 the grammar.  However it won't _actually_ be so.  Processing the grammar is
 561 greatly simplified if the real start symbol only has a single production,
 562 and expects `$eof` as the final terminal.  So when we find the first
 563 explicit production we insert an extra production as production zero which
 564 looks like
 565
 566 ###### Example: production 0
 567         $start -> START $eof
 568
 569 where `START` is the first non-terminal given.
 570
 571 ###### create production zero
 572         struct production *p = calloc(1,sizeof(*p));
 573         struct text start = {"$start",6};
 574         struct text eof = {"$eof",4};
 575         struct text code = {"$0 = $<1;", 9};
 576         p->head = sym_find(g, start);
 577         p->head->type = Nonterminal;
 578         p->head->struct_name = g->current_type;
 579         p->head->isref = g->type_isref;
 580         if (g->current_type.txt)
 581                 p->code = code;
 582         array_add(&p->body, &p->body_size, head);
 583         array_add(&p->body, &p->body_size, sym_find(g, eof));
 584         p->head->first_production = g->production_count;
 585         array_add(&g->productions, &g->production_count, p);
 586
 587 Now we are ready to read in the grammar.  We ignore comments
 588 and strings so that the marks which introduce them can be
 589 used as terminals in the grammar.  We don't ignore numbers
 590 even though we don't allow them as that causes the scanner
 591 to produce errors that the parser is better positioned to handle.
 592
 593 ###### grammar_read
 594         static struct grammar *grammar_read(struct code_node *code)
 595         {
 596                 struct token_config conf = {
 597                         .word_start = "",
 598                         .word_cont = "",
 599                         .words_marks = known,
 600                         .known_count = sizeof(known)/sizeof(known[0]),
 601                         .number_chars = "",
 602                         .ignored = (1 << TK_line_comment)
 603                                  | (1 << TK_block_comment)
 604                                  | (0 << TK_number)
 605                                  | (1 << TK_string)
 606                                  | (1 << TK_multi_string)
 607                                  | (1 << TK_in)
 608                                  | (1 << TK_out),
 609                 };
 610
 611                 struct token_state *state = token_open(code, &conf);
 612                 struct token tk;
 613                 struct symbol *head = NULL;
 614                 struct grammar *g;
 615                 char *err = NULL;
 616
 617                 g = calloc(1, sizeof(*g));
 618                 symbols_init(g);
 619
 620                 for (tk = token_next(state); tk.num != TK_eof;
 621                      tk = token_next(state)) {
 622                         if (tk.num == TK_newline)
 623                                 continue;
 624                         if (tk.num == TK_ident) {
 625                                 // new non-terminal
 626                                 head = sym_find(g, tk.txt);
 627                                 if (head->type == Nonterminal)
 628                                         err = "This non-terminal has already be used.";
 629                                 else if (head->type == Virtual)
 630                                         err = "Virtual symbol not permitted in head of production";
 631                                 else {
 632                                         head->type = Nonterminal;
 633                                         head->struct_name = g->current_type;
 634                                         head->isref = g->type_isref;
 635                                         if (g->production_count == 0) {
 636                                                 ## create production zero
 637                                         }
 638                                         head->first_production = g->production_count;
 639                                         tk = token_next(state);
 640                                         if (tk.num == TK_mark &&
 641                                             text_is(tk.txt, "->"))
 642                                                 err = parse_production(g, head, state);
 643                                         else
 644                                                 err = "'->' missing in production";
 645                                 }
 646                         } else if (tk.num == TK_mark
 647                                    && text_is(tk.txt, "|")) {
 648                                 // another production for same non-term
 649                                 if (head)
 650                                         err = parse_production(g, head, state);
 651                                 else
 652                                         err = "First production must have a head";
 653                         } else if (tk.num == TK_mark
 654                                    && text_is(tk.txt, "$")) {
 655                                 err = dollar_line(state, g, 0);
 656                         } else if (tk.num == TK_mark
 657                                    && text_is(tk.txt, "$*")) {
 658                                 err = dollar_line(state, g, 1);
 659                         } else if (tk.num == TK_mark
 660                                    && text_is(tk.txt, "//")) {
 661                                 while (tk.num != TK_newline &&
 662                                        tk.num != TK_eof)
 663                                         tk = token_next(state);
 664                         } else {
 665                                 err = "Unrecognised token at start of line.";
 666                         }
 667                         if (err)
 668                                 goto abort;
 669                 }
 670                 token_close(state);
 671                 if (g->terminals_declared) {
 672                         struct symbol *s;
 673                         int errs = 0;
 674                         for (s = g->syms; s; s = s->next) {
 675                                 if (s->type != Unknown)
 676                                         continue;
 677                                 errs += 1;
 678                                 fprintf(stderr, "Token %.*s not declared\n",
 679                                         s->name.len, s->name.txt);
 680                         }
 681                         if (errs) {
 682                                 free(g);
 683                                 g = NULL;
 684                         }
 685                 }
 686                 return g;
 687         abort:
 688                 fprintf(stderr, "Error at line %d: %s\n",
 689                         tk.line, err);
 690                 token_close(state);
 691                 free(g);
 692                 return NULL;
 693         }
 694
 695 ## Analysing the grammar
 696
 697 The central task in analysing the grammar is to determine a set of
 698 states to drive the parsing process.  This will require finding
 699 various sets of symbols and of "items".  Some of these sets will need
 700 to attach information to each element in the set, so they are more
 701 like maps.
 702
 703 Each "item" may have a 'look-ahead' or `LA` set associated with
 704 it.  Many of these will be the same as each other.  To avoid memory
 705 wastage, and to simplify some comparisons of sets, these sets will be
 706 stored in a list of unique sets, each assigned a number.
 707
 708 Once we have the data structures in hand to manage these sets and
 709 lists, we can start setting the 'nullable' flag, build the 'FIRST'
 710 sets, and then create the item sets which define the various states.
 711
 712 ### Symbol sets.
 713
 714 Though we don't only store symbols in these sets, they are the main
 715 things we store, so they are called symbol sets or "symsets".
 716
 717 A symset has a size, an array of shorts, and an optional array of data
 718 values, which are also shorts.  If the array of data is not present,
 719 we store an impossible pointer, as `NULL` is used to indicate that no
 720 memory has been allocated yet;
 721
 722 ###### declarations
 723         struct symset {
 724                 short cnt;
 725                 unsigned short *syms, *data;
 726         };
 727         #define NO_DATA ((unsigned short *)1)
 728         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 729         const struct symset INIT_DATASET = { 0, NULL, NULL };
 730
 731 The arrays of shorts are allocated in blocks of 8 and are kept sorted
 732 by using an insertion sort.  We don't explicitly record the amount of
 733 allocated space as it can be derived directly from the current `cnt` using
 734 `((cnt - 1) | 7) + 1`.
 735
 736 ###### functions
 737         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 738         {
 739                 int i;
 740                 int current = ((s->cnt-1) | 7) + 1;
 741                 if (current == s->cnt) {
 742                         current += 8;
 743                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 744                         if (s->data != NO_DATA)
 745                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 746                 }
 747                 i = s->cnt;
 748                 while (i > 0 && s->syms[i-1] > key) {
 749                         s->syms[i] = s->syms[i-1];
 750                         if (s->data != NO_DATA)
 751                                 s->data[i] = s->data[i-1];
 752                         i--;
 753                 }
 754                 s->syms[i] = key;
 755                 if (s->data != NO_DATA)
 756                         s->data[i] = val;
 757                 s->cnt += 1;
 758         }
 759
 760 Finding a symbol (or item) in a `symset` uses a simple binary search.
 761 We return the index where the value was found (so data can be accessed),
 762 or `-1` to indicate failure.
 763
 764         static int symset_find(struct symset *ss, unsigned short key)
 765         {
 766                 int lo = 0;
 767                 int hi = ss->cnt;
 768
 769                 if (hi == 0)
 770                         return -1;
 771                 while (lo + 1 < hi) {
 772                         int mid = (lo + hi) / 2;
 773                         if (ss->syms[mid] <= key)
 774                                 lo = mid;
 775                         else
 776                                 hi = mid;
 777                 }
 778                 if (ss->syms[lo] == key)
 779                         return lo;
 780                 return -1;
 781         }
 782
 783 We will often want to form the union of two symsets.  When we do, we
 784 will often want to know if anything changed (as that might mean there
 785 is more work to do).  So `symset_union` reports whether anything was
 786 added to the first set.  We use a slow quadratic approach as these
 787 sets don't really get very big.  If profiles shows this to be a problem it
 788 can be optimised later.
 789
 790         static int symset_union(struct symset *a, struct symset *b)
 791         {
 792                 int i;
 793                 int added = 0;
 794                 for (i = 0; i < b->cnt; i++)
 795                         if (symset_find(a, b->syms[i]) < 0) {
 796                                 unsigned short data = 0;
 797                                 if (b->data != NO_DATA)
 798                                         data = b->data[i];
 799                                 symset_add(a, b->syms[i], data);
 800                                 added++;
 801                         }
 802                 return added;
 803         }
 804
 805 And of course we must be able to free a symset.
 806
 807         static void symset_free(struct symset ss)
 808         {
 809                 free(ss.syms);
 810                 if (ss.data != NO_DATA)
 811                         free(ss.data);
 812         }
 813
 814 ### Symset Storage
 815
 816 Some symsets are simply stored somewhere appropriate in the `grammar`
 817 data structure, others needs a bit of indirection.  This applies
 818 particularly to "LA" sets.  These will be paired with "items" in an
 819 "itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
 820 stored in the `data` field, so we need a mapping from a small (short)
 821 number to an LA `symset`.
 822
 823 As mentioned earlier this involves creating a list of unique symsets.
 824
 825 For now, we just use a linear list sorted by insertion.  A skiplist
 826 would be more efficient and may be added later.
 827
 828 ###### declarations
 829
 830         struct setlist {
 831                 struct symset ss;
 832                 int num;
 833                 struct setlist *next;
 834         };
 835
 836 ###### grammar fields
 837         struct setlist *sets;
 838         int nextset;
 839
 840 ###### functions
 841
 842         static int ss_cmp(struct symset a, struct symset b)
 843         {
 844                 int i;
 845                 int diff = a.cnt - b.cnt;
 846                 if (diff)
 847                         return diff;
 848                 for (i = 0; i < a.cnt; i++) {
 849                         diff = (int)a.syms[i] - (int)b.syms[i];
 850                         if (diff)
 851                                 return diff;
 852                 }
 853                 return 0;
 854         }
 855
 856         static int save_set(struct grammar *g, struct symset ss)
 857         {
 858                 struct setlist **sl = &g->sets;
 859                 int cmp = 1;
 860                 struct setlist *s;
 861
 862                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 863                         sl = & (*sl)->next;
 864                 if (cmp == 0) {
 865                         symset_free(ss);
 866                         return (*sl)->num;
 867                 }
 868
 869                 s = malloc(sizeof(*s));
 870                 s->ss = ss;
 871                 s->num = g->nextset;
 872                 g->nextset += 1;
 873                 s->next = *sl;
 874                 *sl = s;
 875                 return s->num;
 876         }
 877
 878 Finding a set by number is currently performed by a simple linear search.
 879 If this turns out to hurt performance, we can store the sets in a dynamic
 880 array like the productions.
 881
 882         static struct symset set_find(struct grammar *g, int num)
 883         {
 884                 struct setlist *sl = g->sets;
 885                 while (sl && sl->num != num)
 886                         sl = sl->next;
 887                 return sl->ss;
 888         }
 889
 890 ### Setting `nullable`
 891
 892 We set `nullable` on the head symbol for any production for which all
 893 the body symbols (if any) are nullable.  As this is a recursive
 894 definition, any change in the `nullable` setting means that we need to
 895 re-evaluate where it needs to be set.
 896
 897 We simply loop around performing the same calculations until no more
 898 changes happen.
 899
 900 ###### symbol fields
 901         int nullable;
 902
 903 ###### functions
 904         static void set_nullable(struct grammar *g)
 905         {
 906                 int check_again = 1;
 907                 while (check_again) {
 908                         int p;
 909                         check_again = 0;
 910                         for (p = 0; p < g->production_count; p++) {
 911                                 struct production *pr = g->productions[p];
 912                                 int s;
 913
 914                                 if (pr->head->nullable)
 915                                         continue;
 916                                 for (s = 0; s < pr->body_size; s++)
 917                                         if (! pr->body[s]->nullable)
 918                                                 break;
 919                                 if (s == pr->body_size) {
 920                                         pr->head->nullable = 1;
 921                                         check_again = 1;
 922                                 }
 923                         }
 924                 }
 925         }
 926
 927 ### Setting `line_like`
 928
 929 In order to be able to ignore newline tokens when not relevant, but
 930 still include them in the parse when needed, we will need to know
 931 which states can start a "line-like" section of code.  We ignore
 932 newlines when there is an indent since the most recent start of a
 933 line-like symbol.
 934
 935 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
 936 If a symbol cannot derive a NEWLINE, then it is only part of a line -
 937 so is word-like.  If it can derive a NEWLINE, then we consider it to
 938 be like a line.
 939
 940 Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
 941 is the head of a production that contains a line_like symbol is also a
 942 line-like symbol.  We use a new field `line_like` to record this
 943 attribute of symbols, and compute it in a repetitive manner similar to
 944 `set_nullable`.
 945
 946 ###### symbol fields
 947         int line_like;
 948
 949 ###### functions
 950         static void set_line_like(struct grammar *g)
 951         {
 952                 int check_again = 1;
 953                 g->symtab[TK_newline]->line_like = 1;
 954                 while (check_again) {
 955                         int p;
 956                         check_again = 0;
 957                         for (p = 0; p < g->production_count; p++) {
 958                                 struct production *pr = g->productions[p];
 959                                 int s;
 960
 961                                 if (pr->head->line_like)
 962                                         continue;
 963
 964                                 for (s = 0 ; s < pr->body_size; s++) {
 965                                         if (pr->body[s]->line_like) {
 966                                                 pr->head->line_like = 1;
 967                                                 check_again = 1;
 968                                                 break;
 969                                         }
 970                                 }
 971                         }
 972                 }
 973         }
 974
 975 ### Building the `first` sets
 976
 977 When calculating what can follow a particular non-terminal, we will need to
 978 know what the "first" terminal in any subsequent non-terminal might be.  So
 979 we calculate the `first` set for every non-terminal and store them in an
 980 array.  We don't bother recording the "first" set for terminals as they are
 981 trivial.
 982
 983 As the "first" for one symbol might depend on the "first" of another,
 984 we repeat the calculations until no changes happen, just like with
 985 `set_nullable`.  This makes use of the fact that `symset_union`
 986 reports if any change happens.
 987
 988 The core of this, which finds the "first" of part of a production body,
 989 will be reused for computing the "follow" sets, so we split it out
 990 into a separate function.
 991
 992 ###### grammar fields
 993         struct symset *first;
 994
 995 ###### functions
 996
 997         static int add_first(struct production *pr, int start,
 998                              struct symset *target, struct grammar *g,
 999                              int *to_end)
1000         {
1001                 int s;
1002                 int changed = 0;
1003                 for (s = start; s < pr->body_size; s++) {
1004                         struct symbol *bs = pr->body[s];
1005                         if (bs->type == Terminal) {
1006                                 if (symset_find(target, bs->num) < 0) {
1007                                         symset_add(target, bs->num, 0);
1008                                         changed = 1;
1009                                 }
1010                                 break;
1011                         } else if (symset_union(target, &g->first[bs->num]))
1012                                 changed = 1;
1013                         if (!bs->nullable)
1014                                 break;
1015                 }
1016                 if (to_end)
1017                         *to_end = (s == pr->body_size);
1018                 return changed;
1019         }
1020
1021         static void build_first(struct grammar *g)
1022         {
1023                 int check_again = 1;
1024                 int s;
1025                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1026                 for (s = 0; s < g->num_syms; s++)
1027                         g->first[s] = INIT_SYMSET;
1028
1029                 while (check_again) {
1030                         int p;
1031                         check_again = 0;
1032                         for (p = 0; p < g->production_count; p++) {
1033                                 struct production *pr = g->productions[p];
1034                                 struct symset *head = &g->first[pr->head->num];
1035
1036                                 if (add_first(pr, 0, head, g, NULL))
1037                                         check_again = 1;
1038                         }
1039                 }
1040         }
1041
1042 ### Building the `follow` sets.
1043
1044 There are two different situations when we will want to generate "follow"
1045 sets.  If we are doing an SLR analysis, we want to generate a single
1046 "follow" set for each non-terminal in the grammar.  That is what is
1047 happening here.  If we are doing an LALR or LR analysis we will want
1048 to generate a separate "LA" set for each item.  We do that later
1049 in state generation.
1050
1051 There are two parts to generating a "follow" set.  Firstly we look at
1052 every place that any non-terminal appears in the body of any
1053 production, and we find the set of possible "first" symbols after
1054 there.  This is done using `add_first` above and only needs to be done
1055 once as the "first" sets are now stable and will not change.
1056
1057 ###### follow code
1058
1059         for (p = 0; p < g->production_count; p++) {
1060                 struct production *pr = g->productions[p];
1061                 int b;
1062
1063                 for (b = 0; b < pr->body_size - 1; b++) {
1064                         struct symbol *bs = pr->body[b];
1065                         if (bs->type == Terminal)
1066                                 continue;
1067                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1068                 }
1069         }
1070
1071 The second part is to add the "follow" set of the head of a production
1072 to the "follow" sets of the final symbol in the production, and any
1073 other symbol which is followed only by `nullable` symbols.  As this
1074 depends on "follow" itself we need to repeatedly perform the process
1075 until no further changes happen.
1076
1077 ###### follow code
1078
1079         for (again = 0, p = 0;
1080              p < g->production_count;
1081              p < g->production_count-1
1082                 ? p++ : again ? (again = 0, p = 0)
1083                               : p++) {
1084                 struct production *pr = g->productions[p];
1085                 int b;
1086
1087                 for (b = pr->body_size - 1; b >= 0; b--) {
1088                         struct symbol *bs = pr->body[b];
1089                         if (bs->type == Terminal)
1090                                 break;
1091                         if (symset_union(&g->follow[bs->num],
1092                                          &g->follow[pr->head->num]))
1093                                 again = 1;
1094                         if (!bs->nullable)
1095                                 break;
1096                 }
1097         }
1098
1099 We now just need to create and initialise the `follow` list to get a
1100 complete function.
1101
1102 ###### grammar fields
1103         struct symset *follow;
1104
1105 ###### functions
1106         static void build_follow(struct grammar *g)
1107         {
1108                 int s, again, p;
1109                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1110                 for (s = 0; s < g->num_syms; s++)
1111                         g->follow[s] = INIT_SYMSET;
1112                 ## follow code
1113         }
1114
1115 ### Building itemsets and states
1116
1117 There are three different levels of detail that can be chosen for
1118 building the itemsets and states for the LR grammar.  They are:
1119
1120 1. LR(0) or SLR(1), where no look-ahead is considered.
1121 2. LALR(1) where we build look-ahead sets with each item and merge
1122    the LA sets when we find two paths to the same "kernel" set of items.
1123 3. LR(1) where different look-ahead for any item in the set means
1124    a different state must be created.
1125
1126 ###### forward declarations
1127         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1128
1129 We need to be able to look through existing states to see if a newly
1130 generated state already exists.  For now we use a simple sorted linked
1131 list.
1132
1133 An item is a pair of numbers: the production number and the position of
1134 "DOT", which is an index into the body of the production.
1135 As the numbers are not enormous we can combine them into a single "short"
1136 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1137 production number (so 4000 productions with maximum of 15 symbols in the
1138 body).
1139
1140 Comparing the itemsets will be a little different to comparing symsets
1141 as we want to do the lookup after generating the "kernel" of an
1142 itemset, so we need to ignore the offset=zero items which are added during
1143 completion.
1144
1145 To facilitate this, we modify the "DOT" number so that "0" sorts to
1146 the end of the list in the symset, and then only compare items before
1147 the first "0".
1148
1149 ###### declarations
1150         static inline unsigned short item_num(int production, int dot)
1151         {
1152                 return production | ((31-dot) << 11);
1153         }
1154         static inline int item_prod(unsigned short item)
1155         {
1156                 return item & 0x7ff;
1157         }
1158         static inline int item_dot(unsigned short item)
1159         {
1160                 return (31-(item >> 11)) & 0x1f;
1161         }
1162
1163 For LR(1) analysis we need to compare not just the itemset in a state
1164 but also the LA sets.  As we assign each unique LA set a number, we
1165 can just compare the symset and the data values together.
1166
1167 ###### functions
1168         static int itemset_cmp(struct symset a, struct symset b,
1169                                enum grammar_type type)
1170         {
1171                 int i;
1172                 int av, bv;
1173
1174                 for (i = 0;
1175                      i < a.cnt && i < b.cnt &&
1176                      item_dot(a.syms[i]) > 0 &&
1177                      item_dot(b.syms[i]) > 0;
1178                      i++) {
1179                         int diff = a.syms[i] - b.syms[i];
1180                         if (diff)
1181                                 return diff;
1182                         if (type == LR1) {
1183                                 diff = a.data[i] - b.data[i];
1184                                 if (diff)
1185                                         return diff;
1186                         }
1187                 }
1188                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1189                         av = -1;
1190                 else
1191                         av = a.syms[i];
1192                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1193                         bv = -1;
1194                 else
1195                         bv = b.syms[i];
1196                 if (av - bv)
1197                         return av - bv;
1198                 if (type < LR1 || av == -1)
1199                         return 0;
1200                 return
1201                         a.data[i] - b.data[i];
1202         }
1203
1204 It will be helpful to know if an itemset has been "completed" or not,
1205 particularly for LALR where itemsets get merged, at which point they
1206 need to be consider for completion again.  So  a `completed` flag is needed.
1207
1208 For correct handling of `TK_newline` when parsing, we will need to
1209 know which states (itemsets) can occur at the start of a line, so we
1210 will record a `starts_line` flag too whenever DOT is at the start of a
1211 `line_like` symbol.
1212
1213 Finally, for handling `TK_out` we need to know whether productions in the
1214 current state started *before* the most recent indent.  A state
1215 doesn't usually keep details of individual productions, so we need to
1216 add one extra detail. `min_prefix` is the smallest non-zero number of
1217 symbols *before* DOT in any production in an itemset.  This will allow
1218 us to determine if the the most recent indent is sufficiently recent
1219 to cancel it against a `TK_out`.  If it was seen longer ago than the
1220 `min_prefix`, and if the current state cannot be reduced, then the
1221 indented section must have ended in the middle of a syntactic unit, so
1222 an error must be signaled.
1223
1224 And now we can build the list of itemsets.  The lookup routine returns
1225 both a success flag and a pointer to where in the list an insert
1226 should happen, so we don't need to search a second time.
1227
1228 ###### declarations
1229         struct itemset {
1230                 struct itemset *next;
1231                 short state;
1232                 struct symset items;
1233                 struct symset go_to;
1234                 enum assoc assoc;
1235                 unsigned short precedence;
1236                 char completed;
1237                 char starts_line;
1238                 int min_prefix;
1239         };
1240
1241 ###### grammar fields
1242         struct itemset *items;
1243         int states;
1244
1245 ###### functions
1246         static int itemset_find(struct grammar *g, struct itemset ***where,
1247                                 struct symset kernel, enum grammar_type type)
1248         {
1249                 struct itemset **ip;
1250
1251                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1252                         struct itemset *i = *ip;
1253                         int diff;
1254                         diff = itemset_cmp(i->items, kernel, type);
1255                         if (diff < 0)
1256                                 continue;
1257                         if (diff > 0)
1258                                 break;
1259                         /* found */
1260                         *where = ip;
1261                         return 1;
1262                 }
1263                 *where = ip;
1264                 return 0;
1265         }
1266
1267 Adding an itemset may require merging the LA sets if LALR analysis is
1268 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1269 clear the `completed` flag so that the dependants of this itemset will be
1270 recalculated and their LA sets updated.
1271
1272 `add_itemset` must consume the symsets it is passed, either by adding
1273 them to a data structure, of freeing them.
1274
1275         static int add_itemset(struct grammar *g, struct symset ss,
1276                                enum assoc assoc, unsigned short precedence,
1277                                enum grammar_type type)
1278         {
1279                 struct itemset **where, *is;
1280                 int i;
1281                 int found = itemset_find(g, &where, ss, type);
1282                 if (!found) {
1283                         is = calloc(1, sizeof(*is));
1284                         is->state = g->states;
1285                         g->states += 1;
1286                         is->items = ss;
1287                         is->assoc = assoc;
1288                         is->precedence = precedence;
1289                         is->next = *where;
1290                         is->go_to = INIT_DATASET;
1291                         *where = is;
1292                         return is->state;
1293                 }
1294                 is = *where;
1295                 if (type != LALR) {
1296                         symset_free(ss);
1297                         return is->state;
1298                 }
1299                 for (i = 0; i < ss.cnt; i++) {
1300                         struct symset temp = INIT_SYMSET, s;
1301                         if (ss.data[i] == is->items.data[i])
1302                                 continue;
1303                         s = set_find(g, is->items.data[i]);
1304                         symset_union(&temp, &s);
1305                         s = set_find(g, ss.data[i]);
1306                         if (symset_union(&temp, &s)) {
1307                                 is->items.data[i] = save_set(g, temp);
1308                                 is->completed = 0;
1309                         } else
1310                                 symset_free(temp);
1311                 }
1312                 symset_free(ss);
1313                 return is->state;
1314         }
1315
1316 #### The build
1317
1318 To build all the itemsets, we first insert the initial itemset made
1319 from production zero, complete each itemset, and then generate new
1320 itemsets from old until no new ones can be made.
1321
1322 Completing an itemset means finding all the items where "DOT" is followed by
1323 a nonterminal and adding "DOT=0" items for every production from that
1324 non-terminal - providing each item hasn't already been added.
1325
1326 If LA sets are needed, the LA set for each new item is found using
1327 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
1328 be supplemented by the LA set for the item which produce the new item.
1329
1330 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1331 is used in the next stage.
1332 If any of these symbols are flagged as `line_like`, then this
1333 state must be a `starts_line` state so now is a good time to record that.
1334
1335 When itemsets are created we assign a precedence to the itemset from
1336 the complete item, if there is one.  We ignore the possibility of
1337 there being two and don't (currently) handle precedence in such
1338 grammars.  When completing a grammar we ignore any item where DOT is
1339 followed by a terminal with a precedence lower than that for the
1340 itemset.  Unless the terminal has right associativity, we also ignore
1341 items where the terminal has the same precedence.  The result is that
1342 unwanted items are still in the itemset, but the terminal doesn't get
1343 into the go to set, so the item is ineffective.
1344
1345 ###### complete itemset
1346         for (i = 0; i < is->items.cnt; i++) {
1347                 int p = item_prod(is->items.syms[i]);
1348                 int bs = item_dot(is->items.syms[i]);
1349                 struct production *pr = g->productions[p];
1350                 int p2;
1351                 struct symbol *s;
1352                 struct symset LA = INIT_SYMSET;
1353                 unsigned short sn = 0;
1354                 struct symset LAnl = INIT_SYMSET;
1355                 unsigned short snnl = 0;
1356
1357                 if (is->min_prefix == 0 ||
1358                     (bs > 0 && bs < is->min_prefix))
1359                         is->min_prefix = bs;
1360                 if (bs == pr->body_size)
1361                         continue;
1362                 s = pr->body[bs];
1363                 if (s->precedence && is->precedence &&
1364                     is->precedence > s->precedence)
1365                         /* This terminal has a low precedence and
1366                          * shouldn't be shifted
1367                          */
1368                         continue;
1369                 if (s->precedence && is->precedence &&
1370                     is->precedence == s->precedence && s->assoc != Right)
1371                         /* This terminal has a matching precedence and is
1372                          * not Right-associative, so we mustn't shift it.
1373                          */
1374                         continue;
1375                 if (symset_find(&done, s->num) < 0) {
1376                         symset_add(&done, s->num, 0);
1377                 }
1378                 if (s->type != Nonterminal)
1379                         continue;
1380                 if (s->line_like)
1381                         is->starts_line = 1;
1382                 again = 1;
1383                 if (type >= LALR) {
1384                         // Need the LA set.
1385                         int to_end;
1386                         add_first(pr, bs+1, &LA, g, &to_end);
1387                         if (to_end) {
1388                                 struct symset ss = set_find(g, is->items.data[i]);
1389                                 symset_union(&LA, &ss);
1390                         }
1391                         sn = save_set(g, LA);
1392                         LA = set_find(g, sn);
1393                         if (symset_find(&LA, TK_newline))
1394                                 symset_add(&LAnl, TK_newline, 0);
1395                         snnl = save_set(g, LAnl);
1396                         LAnl = set_find(g, snnl);
1397                 }
1398
1399                 /* Add productions for this symbol */
1400                 for (p2 = s->first_production;
1401                      p2 < g->production_count &&
1402                       g->productions[p2]->head == s;
1403                      p2++) {
1404                         int itm = item_num(p2, 0);
1405                         int pos = symset_find(&is->items, itm);
1406                         if (pos < 0) {
1407                                 if (g->productions[p2]->line_like)
1408                                         symset_add(&is->items, itm, snnl);
1409                                 else
1410                                         symset_add(&is->items, itm, sn);
1411                                 /* Will have re-ordered, so start
1412                                  * from beginning again */
1413                                 i = -1;
1414                         } else if (type >= LALR) {
1415                                 struct symset ss = set_find(g, is->items.data[pos]);
1416                                 struct symset tmp = INIT_SYMSET;
1417                                 struct symset *la = &LA;
1418
1419                                 if (g->productions[p2]->line_like)
1420                                         la = &LAnl;
1421                                 symset_union(&tmp, &ss);
1422                                 if (symset_union(&tmp, la)) {
1423                                         is->items.data[pos] = save_set(g, tmp);
1424                                         i = -1;
1425                                 } else
1426                                         symset_free(tmp);
1427                         }
1428                 }
1429         }
1430
1431 For each symbol we found (and placed in `done`) we collect all the items for
1432 which this symbol is next, and create a new itemset with "DOT" advanced over
1433 the symbol.  This is then added to the collection of itemsets (or merged
1434 with a pre-existing itemset).
1435
1436 ###### derive itemsets
1437         // Now we have a completed itemset, so we need to
1438         // compute all the 'next' itemsets and create them
1439         // if they don't exist.
1440         for (i = 0; i < done.cnt; i++) {
1441                 int j;
1442                 unsigned short state;
1443                 struct symbol *sym = g->symtab[done.syms[i]];
1444                 enum assoc assoc = Non;
1445                 unsigned short precedence = 0;
1446                 struct symset newitemset = INIT_SYMSET;
1447                 if (type >= LALR)
1448                         newitemset = INIT_DATASET;
1449
1450                 for (j = 0; j < is->items.cnt; j++) {
1451                         int itm = is->items.syms[j];
1452                         int p = item_prod(itm);
1453                         int bp = item_dot(itm);
1454                         struct production *pr = g->productions[p];
1455                         unsigned short la = 0;
1456                         int pos;
1457
1458                         if (bp == pr->body_size)
1459                                 continue;
1460                         if (pr->body[bp] != sym)
1461                                 continue;
1462                         if (type >= LALR)
1463                                 la = is->items.data[j];
1464                         pos = symset_find(&newitemset, pr->head->num);
1465                         if (bp + 1 == pr->body_size &&
1466                             pr->precedence > 0 &&
1467                             pr->precedence > precedence) {
1468                                 // new itemset is reducible and has a precedence.
1469                                 precedence = pr->precedence;
1470                                 assoc = pr->assoc;
1471                         }
1472                         if (pos < 0)
1473                                 symset_add(&newitemset, item_num(p, bp+1), la);
1474                         else if (type >= LALR) {
1475                                 // Need to merge la set.
1476                                 int la2 = newitemset.data[pos];
1477                                 if (la != la2) {
1478                                         struct symset ss = set_find(g, la2);
1479                                         struct symset LA = INIT_SYMSET;
1480                                         symset_union(&LA, &ss);
1481                                         ss = set_find(g, la);
1482                                         if (symset_union(&LA, &ss))
1483                                                 newitemset.data[pos] = save_set(g, LA);
1484                                         else
1485                                                 symset_free(LA);
1486                                 }
1487                         }
1488                 }
1489                 state = add_itemset(g, newitemset, assoc, precedence, type);
1490                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1491                         symset_add(&is->go_to, done.syms[i], state);
1492         }
1493
1494 All that is left is to create the initial itemset from production zero, and
1495 with `TK_eof` as the LA set.
1496
1497 ###### functions
1498         static void build_itemsets(struct grammar *g, enum grammar_type type)
1499         {
1500                 struct symset first = INIT_SYMSET;
1501                 struct itemset *is;
1502                 int again;
1503                 unsigned short la = 0;
1504                 if (type >= LALR) {
1505                         // LA set just has eof
1506                         struct symset eof = INIT_SYMSET;
1507                         symset_add(&eof, TK_eof, 0);
1508                         la = save_set(g, eof);
1509                         first = INIT_DATASET;
1510                 }
1511                 // production 0, offset 0 (with no data)
1512                 symset_add(&first, item_num(0, 0), la);
1513                 add_itemset(g, first, Non, 0, type);
1514                 for (again = 0, is = g->items;
1515                      is;
1516                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
1517                         int i;
1518                         struct symset done = INIT_SYMSET;
1519                         if (is->completed)
1520                                 continue;
1521                         is->completed = 1;
1522                         again = 1;
1523                         ## complete itemset
1524                         ## derive itemsets
1525                         symset_free(done);
1526                 }
1527         }
1528
1529 ### Completing the analysis.
1530
1531 The exact process of analysis depends on which level was requested.  For
1532 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1533 `LR1` we need first, but not follow.  For `SLR` we need both.
1534
1535 We don't build the "action" tables that you might expect as the parser
1536 doesn't use them.  They will be calculated to some extent if needed for
1537 a report.
1538
1539 Once we have built everything we allocate arrays for the two lists:
1540 symbols and itemsets.  This allows more efficient access during reporting.
1541 The symbols are grouped as terminals and non-terminals and we record the
1542 changeover point in `first_nonterm`.
1543
1544 ###### grammar fields
1545         struct symbol **symtab;
1546         struct itemset **statetab;
1547         int first_nonterm;
1548
1549 ###### grammar_analyse
1550
1551         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1552         {
1553                 struct symbol *s;
1554                 struct itemset *is;
1555                 int snum = TK_reserved;
1556                 for (s = g->syms; s; s = s->next)
1557                         if (s->num < 0 && s->type == Terminal) {
1558                                 s->num = snum;
1559                                 snum++;
1560                         }
1561                 g->first_nonterm = snum;
1562                 for (s = g->syms; s; s = s->next)
1563                         if (s->num < 0 && s->type != Virtual) {
1564                                 s->num = snum;
1565                                 snum++;
1566                         }
1567                 for (s = g->syms; s; s = s->next)
1568                         if (s->num < 0) {
1569                                 s->num = snum;
1570                                 snum++;
1571                         }
1572                 g->num_syms = snum;
1573                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1574                 for (s = g->syms; s; s = s->next)
1575                         g->symtab[s->num] = s;
1576
1577                 set_nullable(g);
1578                 set_line_like(g);
1579                 if (type >= SLR)
1580                         build_first(g);
1581
1582                 if (type == SLR)
1583                         build_follow(g);
1584
1585                 build_itemsets(g, type);
1586
1587                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1588                 for (is = g->items; is ; is = is->next)
1589                         g->statetab[is->state] = is;
1590         }
1591
1592 ## Reporting on the Grammar
1593
1594 The purpose of the report is to give the grammar developer insight into
1595 how the grammar parser will work.  It is basically a structured dump of
1596 all the tables that have been generated, plus a description of any conflicts.
1597
1598 ###### grammar_report
1599         static int grammar_report(struct grammar *g, enum grammar_type type)
1600         {
1601                 report_symbols(g);
1602                 if (g->follow)
1603                         report_follow(g);
1604                 report_itemsets(g);
1605                 return report_conflicts(g, type);
1606         }
1607
1608 Firstly we have the complete list of symbols, together with the
1609 "FIRST" set if that was generated.  We add a mark to each symbol to
1610 show if it can end in a newline (`>`), if it is considered to be
1611 "line-like" (`<`), or if it is nullable (`.`).
1612
1613 ###### functions
1614
1615         static void report_symbols(struct grammar *g)
1616         {
1617                 int n;
1618                 if (g->first)
1619                         printf("SYMBOLS + FIRST:\n");
1620                 else
1621                         printf("SYMBOLS:\n");
1622
1623                 for (n = 0; n < g->num_syms; n++) {
1624                         struct symbol *s = g->symtab[n];
1625                         if (!s)
1626                                 continue;
1627
1628                         printf(" %c%c%3d%c: ",
1629                                s->nullable ? '.':' ',
1630                                s->line_like ? '<':' ',
1631                                s->num, symtypes[s->type]);
1632                         prtxt(s->name);
1633                         if (s->precedence)
1634                                 printf(" (%d%s)", s->precedence,
1635                                        assoc_names[s->assoc]);
1636
1637                         if (g->first && s->type == Nonterminal) {
1638                                 int j;
1639                                 char c = ':';
1640                                 for (j = 0; j < g->first[n].cnt; j++) {
1641                                         printf("%c ", c);
1642                                         c = ',';
1643                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1644                                 }
1645                         }
1646                         printf("\n");
1647                 }
1648         }
1649
1650 Then we have the follow sets if they were computed.
1651
1652         static void report_follow(struct grammar *g)
1653         {
1654                 int n;
1655                 printf("FOLLOW:\n");
1656                 for (n = 0; n < g->num_syms; n++)
1657                         if (g->follow[n].cnt) {
1658                                 int j;
1659                                 char c = ':';
1660                                 printf("  ");
1661                                 prtxt(g->symtab[n]->name);
1662                                 for (j = 0; j < g->follow[n].cnt; j++) {
1663                                         printf("%c ", c);
1664                                         c = ',';
1665                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1666                                 }
1667                                 printf("\n");
1668                         }
1669         }
1670
1671 And finally the item sets.  These include the GO TO tables and, for
1672 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1673 it up a bit.  First the items, with production number and associativity.
1674
1675         static void report_item(struct grammar *g, int itm)
1676         {
1677                 int p = item_prod(itm);
1678                 int dot = item_dot(itm);
1679                 struct production *pr = g->productions[p];
1680                 int i;
1681
1682                 printf("    ");
1683                 prtxt(pr->head->name);
1684                 printf(" ->");
1685                 for (i = 0; i < pr->body_size; i++) {
1686                         printf(" %s", (dot == i ? ". ": ""));
1687                         prtxt(pr->body[i]->name);
1688                 }
1689                 if (dot == pr->body_size)
1690                         printf(" .");
1691                 printf(" [%d]", p);
1692                 if (pr->precedence && dot == pr->body_size)
1693                         printf(" (%d%s)", pr->precedence,
1694                                assoc_names[pr->assoc]);
1695                 if (dot < pr->body_size &&
1696                     pr->body[dot]->precedence) {
1697                         struct symbol *s = pr->body[dot];
1698                         printf(" [%d%s]", s->precedence,
1699                                assoc_names[s->assoc]);
1700                 }
1701                 if (pr->line_like == 1)
1702                         printf(" $$NEWLINE");
1703                 else if (pr->line_like)
1704                         printf(" $$OUT");
1705                 printf("\n");
1706         }
1707
1708 The LA sets which are (possibly) reported with each item:
1709
1710         static void report_la(struct grammar *g, int lanum)
1711         {
1712                 struct symset la = set_find(g, lanum);
1713                 int i;
1714                 char c = ':';
1715
1716                 printf("        LOOK AHEAD(%d)", lanum);
1717                 for (i = 0; i < la.cnt; i++) {
1718                         printf("%c ", c);
1719                         c = ',';
1720                         prtxt(g->symtab[la.syms[i]]->name);
1721                 }
1722                 printf("\n");
1723         }
1724
1725 Then the go to sets:
1726
1727         static void report_goto(struct grammar *g, struct symset gt)
1728         {
1729                 int i;
1730                 printf("    GOTO:\n");
1731
1732                 for (i = 0; i < gt.cnt; i++) {
1733                         printf("      ");
1734                         prtxt(g->symtab[gt.syms[i]]->name);
1735                         printf(" -> %d\n", gt.data[i]);
1736                 }
1737         }
1738
1739 Now we can report all the item sets complete with items, LA sets, and GO TO.
1740
1741         static void report_itemsets(struct grammar *g)
1742         {
1743                 int s;
1744                 printf("ITEM SETS(%d)\n", g->states);
1745                 for (s = 0; s < g->states; s++) {
1746                         int j;
1747                         struct itemset *is = g->statetab[s];
1748                         printf("  Itemset %d:%s min prefix=%d",
1749                                s, is->starts_line?" (startsline)":"", is->min_prefix);
1750                         if (is->precedence)
1751                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1752                         printf("\n");
1753                         for (j = 0; j < is->items.cnt; j++) {
1754                                 report_item(g, is->items.syms[j]);
1755                                 if (is->items.data != NO_DATA)
1756                                         report_la(g, is->items.data[j]);
1757                         }
1758                         report_goto(g, is->go_to);
1759                 }
1760         }
1761
1762 ### Reporting conflicts
1763
1764 Conflict detection varies a lot among different analysis levels.  However
1765 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1766 LR1 are also similar as they have FOLLOW or LA sets.
1767
1768 ###### functions
1769
1770         ## conflict functions
1771
1772         static int report_conflicts(struct grammar *g, enum grammar_type type)
1773         {
1774                 int cnt = 0;
1775                 printf("Conflicts:\n");
1776                 if (type < SLR)
1777                         cnt = conflicts_lr0(g, type);
1778                 else
1779                         cnt = conflicts_slr(g, type);
1780                 if (cnt == 0)
1781                         printf(" - no conflicts\n");
1782                 return cnt;
1783         }
1784
1785 LR0 conflicts are any state which have both a reducible item and
1786 a shiftable item, or two reducible items.
1787
1788 LR05 conflicts only occur if two possible reductions exist,
1789 as shifts always over-ride reductions.
1790
1791 ###### conflict functions
1792         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1793         {
1794                 int i;
1795                 int cnt = 0;
1796                 for (i = 0; i < g->states; i++) {
1797                         struct itemset *is = g->statetab[i];
1798                         int last_reduce = -1;
1799                         int prev_reduce = -1;
1800                         int last_shift = -1;
1801                         int j;
1802                         if (!is)
1803                                 continue;
1804                         for (j = 0; j < is->items.cnt; j++) {
1805                                 int itm = is->items.syms[j];
1806                                 int p = item_prod(itm);
1807                                 int bp = item_dot(itm);
1808                                 struct production *pr = g->productions[p];
1809
1810                                 if (bp == pr->body_size) {
1811                                         prev_reduce = last_reduce;
1812                                         last_reduce = j;
1813                                         continue;
1814                                 }
1815                                 if (pr->body[bp]->type == Terminal)
1816                                         last_shift = j;
1817                         }
1818                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1819                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1820                                 report_item(g, is->items.syms[last_shift]);
1821                                 report_item(g, is->items.syms[last_reduce]);
1822                                 cnt++;
1823                         }
1824                         if (prev_reduce >= 0) {
1825                                 printf("  State %d has 2 (or more) reducible items\n", i);
1826                                 report_item(g, is->items.syms[prev_reduce]);
1827                                 report_item(g, is->items.syms[last_reduce]);
1828                                 cnt++;
1829                         }
1830                 }
1831                 return cnt;
1832         }
1833
1834 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1835 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1836 in the source of the look ahead set.
1837
1838 We build two datasets to reflect the "action" table: one which maps
1839 terminals to items where that terminal could be shifted and another
1840 which maps terminals to items that could be reduced when the terminal
1841 is in look-ahead.  We report when we get conflicts between the two.
1842
1843 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1844 terminal, we ignore it.  NEWLINES are handled specially with its own
1845 rules for when to shift and when to reduce.  Conflicts are expected,
1846 but handled internally.
1847
1848         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1849         {
1850                 int i;
1851                 int cnt = 0;
1852
1853                 for (i = 0; i < g->states; i++) {
1854                         struct itemset *is = g->statetab[i];
1855                         struct symset shifts = INIT_DATASET;
1856                         struct symset reduce = INIT_DATASET;
1857                         int j;
1858                         if (!is)
1859                                 continue;
1860                         /* First collect the shifts */
1861                         for (j = 0; j < is->items.cnt; j++) {
1862                                 unsigned short itm = is->items.syms[j];
1863                                 int p = item_prod(itm);
1864                                 int bp = item_dot(itm);
1865                                 struct production *pr = g->productions[p];
1866                                 struct symbol *s;
1867
1868                                 if (bp >= pr->body_size ||
1869                                     pr->body[bp]->type != Terminal)
1870                                         /* not shiftable */
1871                                         continue;
1872
1873                                 s = pr->body[bp];
1874                                 if (s->precedence && is->precedence)
1875                                         /* Precedence resolves this, so no conflict */
1876                                         continue;
1877
1878                                 if (symset_find(&shifts, s->num) < 0)
1879                                         symset_add(&shifts, s->num, itm);
1880                         }
1881                         /* Now look for reductions and conflicts */
1882                         for (j = 0; j < is->items.cnt; j++) {
1883                                 unsigned short itm = is->items.syms[j];
1884                                 int p = item_prod(itm);
1885                                 int bp = item_dot(itm);
1886                                 struct production *pr = g->productions[p];
1887
1888                                 if (bp < pr->body_size)
1889                                         continue;
1890                                 /* reducible */
1891                                 struct symset la;
1892                                 if (type == SLR)
1893                                         la = g->follow[pr->head->num];
1894                                 else
1895                                         la = set_find(g, is->items.data[j]);
1896                                 int k;
1897                                 for (k = 0; k < la.cnt; k++) {
1898                                         int pos = symset_find(&shifts, la.syms[k]);
1899                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1900                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1901                                                 cnt++;
1902                                                         prtxt(g->symtab[la.syms[k]]->name);
1903                                                 printf(":\n");
1904                                                 report_item(g, shifts.data[pos]);
1905                                                 report_item(g, itm);
1906                                         }
1907                                         pos = symset_find(&reduce, la.syms[k]);
1908                                         if (pos < 0) {
1909                                                 symset_add(&reduce, la.syms[k], itm);
1910                                                 continue;
1911                                         }
1912                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1913                                         prtxt(g->symtab[la.syms[k]]->name);
1914                                         printf(":\n");
1915                                         report_item(g, itm);
1916                                         report_item(g, reduce.data[pos]);
1917                                         cnt++;
1918                                 }
1919                         }
1920                         symset_free(shifts);
1921                         symset_free(reduce);
1922                 }
1923                 return cnt;
1924         }
1925
1926
1927 ## Generating the parser
1928
1929 The exported part of the parser is the `parse_XX` function, where the name
1930 `XX` is based on the name of the parser files.
1931
1932 This takes a `code_node`, a partially initialized `token_config`, and an
1933 optional `FILE` to send tracing to.  The `token_config` gets the list of
1934 known words added and then is used with the `code_node` to initialize the
1935 scanner.
1936
1937 `parse_XX` then calls the library function `parser_run` to actually complete
1938 the parse.  This needs the `states` table and function to call the various
1939 pieces of code provided in the grammar file, so they are generated first.
1940
1941 ###### parser_generate
1942
1943         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1944                                struct code_node *pre_reduce)
1945         {
1946                 gen_known(f, g);
1947                 gen_non_term(f, g);
1948                 gen_goto(f, g);
1949                 gen_states(f, g);
1950                 gen_reduce(f, g, file, pre_reduce);
1951                 gen_free(f, g);
1952
1953                 fprintf(f, "#line 0 \"gen_parser\"\n");
1954                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1955                         name);
1956                 fprintf(f, "{\n");
1957                 fprintf(f, "\tstruct token_state *tokens;\n");
1958                 fprintf(f, "\tconfig->words_marks = known;\n");
1959                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1960                 fprintf(f, "\ttokens = token_open(code, config);\n");
1961                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1962                 fprintf(f, "\ttoken_close(tokens);\n");
1963                 fprintf(f, "\treturn rv;\n");
1964                 fprintf(f, "}\n\n");
1965         }
1966
1967 ### Known words table
1968
1969 The known words table is simply an array of terminal symbols.
1970 The table of nonterminals used for tracing is a similar array.  We
1971 include virtual symbols in the table of non_terminals to keep the
1972 numbers right.
1973
1974 ###### functions
1975
1976         static void gen_known(FILE *f, struct grammar *g)
1977         {
1978                 int i;
1979                 fprintf(f, "#line 0 \"gen_known\"\n");
1980                 fprintf(f, "static const char *known[] = {\n");
1981                 for (i = TK_reserved;
1982                      i < g->num_syms && g->symtab[i]->type == Terminal;
1983                      i++)
1984                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1985                                 g->symtab[i]->name.txt);
1986                 fprintf(f, "};\n\n");
1987         }
1988
1989         static void gen_non_term(FILE *f, struct grammar *g)
1990         {
1991                 int i;
1992                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1993                 fprintf(f, "static const char *non_term[] = {\n");
1994                 for (i = TK_reserved;
1995                      i < g->num_syms;
1996                      i++)
1997                         if (g->symtab[i]->type == Nonterminal)
1998                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1999                                         g->symtab[i]->name.txt);
2000                 fprintf(f, "};\n\n");
2001         }
2002
2003 ### States and the goto tables.
2004
2005 For each state we record the goto table, the reducible production if
2006 there is one, or a symbol to shift for error recovery.
2007 Some of the details of the reducible production are stored in the
2008 `do_reduce` function to come later.  Here we store the production number,
2009 the body size (useful for stack management) and the resulting symbol (useful
2010 for knowing how to free data later).
2011
2012 The go to table is stored in a simple array of `sym` and corresponding
2013 `state`.
2014
2015 ###### exported types
2016
2017         struct lookup {
2018                 short sym;
2019                 short state;
2020         };
2021         struct state {
2022                 short go_to_cnt;
2023                 const struct lookup * go_to;
2024                 short reduce_prod;
2025                 short reduce_size;
2026                 short reduce_sym;
2027                 char starts_line;
2028                 char newline_only;
2029                 short min_prefix;
2030         };
2031
2032 ###### functions
2033
2034         static void gen_goto(FILE *f, struct grammar *g)
2035         {
2036                 int i;
2037                 fprintf(f, "#line 0 \"gen_goto\"\n");
2038                 for (i = 0; i < g->states; i++) {
2039                         int j;
2040                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
2041                                 i);
2042                         struct symset gt = g->statetab[i]->go_to;
2043                         for (j = 0; j < gt.cnt; j++)
2044                                 fprintf(f, "\t{ %d, %d },\n",
2045                                         gt.syms[j], gt.data[j]);
2046                         fprintf(f, "};\n");
2047                 }
2048         }
2049
2050 ###### functions
2051
2052         static void gen_states(FILE *f, struct grammar *g)
2053         {
2054                 int i;
2055                 fprintf(f, "#line 0 \"gen_states\"\n");
2056                 fprintf(f, "static const struct state states[] = {\n");
2057                 for (i = 0; i < g->states; i++) {
2058                         struct itemset *is = g->statetab[i];
2059                         int j, prod = -1, prod_len;
2060
2061                         for (j = 0; j < is->items.cnt; j++) {
2062                                 int itm = is->items.syms[j];
2063                                 int p = item_prod(itm);
2064                                 int bp = item_dot(itm);
2065                                 struct production *pr = g->productions[p];
2066
2067                                 if (bp < pr->body_size)
2068                                         continue;
2069                                 /* This is what we reduce */
2070                                 if (prod < 0 || prod_len < pr->body_size) {
2071                                         prod = p;
2072                                         prod_len = pr->body_size;
2073                                 }
2074                         }
2075
2076                         if (prod >= 0)
2077                                 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2078                                         i, is->go_to.cnt, i, prod,
2079                                         g->productions[prod]->body_size,
2080                                         g->productions[prod]->head->num,
2081                                         is->starts_line,
2082                                         g->productions[prod]->line_like,
2083                                         is->min_prefix);
2084                         else
2085                                 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2086                                         i, is->go_to.cnt, i,
2087                                         is->starts_line, is->min_prefix);
2088                 }
2089                 fprintf(f, "};\n\n");
2090         }
2091
2092 ### The `do_reduce` function and the code
2093
2094 When the parser engine decides to reduce a production, it calls `do_reduce`.
2095 This has two functions.
2096
2097 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
2098 production being reduced.  Secondly it runs the code that was included with
2099 the production if any.
2100
2101 This code needs to be able to store data somewhere.  Rather than requiring
2102 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
2103 `do_reduce` return the size to be saved.
2104
2105 In order for the code to access "global" context, we pass in the
2106 "config" pointer that was passed to parser function.  If the `struct
2107 token_config` is embedded in some larger structure, the reducing code
2108 can access the larger structure using pointer manipulation.
2109
2110 The code fragment requires translation when written out.  Any `$N` needs to
2111 be converted to a reference either to that buffer (if `$0`) or to the
2112 structure returned by a previous reduction.  These pointers need to be cast
2113 to the appropriate type for each access.  All this is handled in
2114 `gen_code`.
2115
2116 `gen_code` also allows symbol references to contain a '`<`' as in
2117 '`$<2`'.  This is particularly useful for references (or pointers), but
2118 can be used with structures too.  The `<` implies that the value it
2119 being moved out, so the object will not be automatically freed.  It is
2120 equivalent to assigning `NULL` to the pointer or filling a structure
2121 with zeros.
2122
2123 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2124 and possibly a number.  A number by itself (other than zero) selects a
2125 symbol from the body of the production.  A sequence of letters selects
2126 the shortest symbol in the body which contains those letters in the given
2127 order.  If a number follows the letters, then a later occurrence of
2128 that symbol is chosen.  So "`$AB2`" will refer to the structure attached
2129 to the second occurrence of the shortest symbol which contains an `A`
2130 followed by a `B`.  If there is no unique shortest system, or if the
2131 number given is too large, then the symbol reference is not transformed,
2132 and will cause an error when the code is compiled.
2133
2134 ###### functions
2135
2136         static int textchr(struct text t, char c, int s)
2137         {
2138                 int i;
2139                 for (i = s; i < t.len; i++)
2140                         if (t.txt[i] == c)
2141                                 return i;
2142                 return -1;
2143         }
2144
2145         static int subseq_match(char *seq, int slen, struct text name)
2146         {
2147                 int st = 0;
2148                 while (slen > 0) {
2149                         st = textchr(name, *seq, st);
2150                         if (st < 0)
2151                                 return 0;
2152                         slen -= 1;
2153                         seq += 1;
2154                         st += 1;
2155                 }
2156                 return 1;
2157         }
2158
2159         static int choose_sym(char **namep, int len, struct production *p)
2160         {
2161                 char *name = *namep;
2162                 char *nam = name;
2163                 int namlen;
2164                 int n = 0;
2165                 int i, s, slen;
2166                 char c;
2167
2168                 c = *name;
2169                 while (len > 0 &&
2170                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2171                         name += 1;
2172                         len -= 1;
2173                         c = *name;
2174                 }
2175                 namlen = name-nam;
2176                 while (len > 0 && (c >= '0' && c <= '9')) {
2177                         name += 1;
2178                         len -= 1;
2179                         n = n * 10 + (c - '0');
2180                         c = *name;
2181                 }
2182                 if (namlen == 0) {
2183                         if (name == *namep)
2184                                 return -1;
2185                         *namep = name;
2186                         return n;
2187                 }
2188                 slen = 0; s = -1;
2189                 for (i = 0; i < p->body_size; i++) {
2190                         if (!subseq_match(nam, namlen, p->body[i]->name))
2191                                 continue;
2192                         if (slen == 0 || p->body[i]->name.len < slen)
2193                                 s = i;
2194                         if (s >= 0 && p->body[i] != p->body[s] &&
2195                             p->body[i]->name.len == p->body[s]->name.len)
2196                                 /* not unique, so s cannot be used */
2197                                 s = -1;
2198                 }
2199                 if (s < 0)
2200                         return -1;
2201                 if (n == 0);
2202                         n = 1;
2203                 for (i = 0; i < p->body_size; i++)
2204                         if (p->body[i] == p->body[s]) {
2205                                 n -= 1;
2206                                 if (n == 0)
2207                                         break;
2208                         }
2209                 if (n > 1)
2210                         return -1;
2211                 *namep = name;
2212                 return i + 1;
2213         }
2214
2215         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2216         {
2217                 char *c;
2218                 char *used = calloc(1, p->body_size);
2219                 int i;
2220
2221                 fprintf(f, "\t\t\t");
2222                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2223                         int n;
2224                         int use = 0;
2225                         if (*c != '$') {
2226                                 fputc(*c, f);
2227                                 if (*c == '\n')
2228                                         fputs("\t\t\t", f);
2229                                 continue;
2230                         }
2231                         c++;
2232                         if (*c == '<') {
2233                                 use = 1;
2234                                 c++;
2235                         }
2236                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2237                         if (n < 0) {
2238                                 fputc('$', f);
2239                                 if (use)
2240                                         fputc('<', f);
2241                                 fputc(*c, f);
2242                                 continue;
2243                         }
2244                         if (n == 0)
2245                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2246                                         p->head->struct_name.len,
2247                                         p->head->struct_name.txt,
2248                                         p->head->isref ? "*":"");
2249                         else if (p->body[n-1]->type == Terminal)
2250                                 fprintf(f, "(*(struct token *)body[%d])",
2251                                         n-1);
2252                         else if (p->body[n-1]->struct_name.txt == NULL)
2253                                 fprintf(f, "$%d", n);
2254                         else {
2255                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2256                                         p->body[n-1]->struct_name.len,
2257                                         p->body[n-1]->struct_name.txt,
2258                                         p->body[n-1]->isref ? "*":"", n-1);
2259                                 used[n-1] = use;
2260                         }
2261                         c -= 1;
2262                 }
2263                 fputs("\n", f);
2264                 for (i = 0; i < p->body_size; i++) {
2265                         if (p->body[i]->struct_name.txt &&
2266                             used[i]) {
2267                                 // assume this has been copied out
2268                                 if (p->body[i]->isref)
2269                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2270                                 else
2271                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2272                         }
2273                 }
2274                 free(used);
2275         }
2276
2277 ###### functions
2278
2279         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2280                                struct code_node *code)
2281         {
2282                 int i;
2283                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2284                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2285                 fprintf(f, "{\n");
2286                 fprintf(f, "\tint ret_size = 0;\n");
2287                 if (code)
2288                         code_node_print(f, code, file);
2289
2290                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2291                 fprintf(f, "\tswitch(prod) {\n");
2292                 for (i = 0; i < g->production_count; i++) {
2293                         struct production *p = g->productions[i];
2294                         fprintf(f, "\tcase %d:\n", i);
2295
2296                         if (p->code.txt) {
2297                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2298                                 gen_code(p, f, g);
2299                         }
2300
2301                         if (p->head->struct_name.txt)
2302                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2303                                         p->head->struct_name.len,
2304                                         p->head->struct_name.txt,
2305                                         p->head->isref ? "*":"");
2306
2307                         fprintf(f, "\t\tbreak;\n");
2308                 }
2309                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2310         }
2311
2312 ### `do_free`
2313
2314 As each non-terminal can potentially cause a different type of data
2315 structure to be allocated and filled in, we need to be able to free it when
2316 done.
2317
2318 It is particularly important to have fine control over freeing during error
2319 recovery where individual stack frames might need to be freed.
2320
2321 For this, the grammar author is required to defined a `free_XX` function for
2322 each structure that is used by a non-terminal.  `do_free` will call whichever
2323 is appropriate given a symbol number, and will call `free` (as is
2324 appropriate for tokens) on any terminal symbol.
2325
2326 ###### functions
2327
2328         static void gen_free(FILE *f, struct grammar *g)
2329         {
2330                 int i;
2331
2332                 fprintf(f, "#line 0 \"gen_free\"\n");
2333                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2334                 fprintf(f, "{\n");
2335                 fprintf(f, "\tif (!asn) return;\n");
2336                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2337                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2338                 fprintf(f, "\tswitch(sym) {\n");
2339
2340                 for (i = 0; i < g->num_syms; i++) {
2341                         struct symbol *s = g->symtab[i];
2342                         if (!s ||
2343                             s->type != Nonterminal ||
2344                             s->struct_name.txt == NULL)
2345                                 continue;
2346
2347                         fprintf(f, "\tcase %d:\n", s->num);
2348                         if (s->isref) {
2349                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2350                                         s->struct_name.len,
2351                                         s->struct_name.txt);
2352                                 fprintf(f, "\t\tfree(asn);\n");
2353                         } else
2354                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2355                                         s->struct_name.len,
2356                                         s->struct_name.txt);
2357                         fprintf(f, "\t\tbreak;\n");
2358                 }
2359                 fprintf(f, "\t}\n}\n\n");
2360         }
2361
2362 ## The main routine.
2363
2364 There are three key parts to the "main" function of parsergen: processing
2365 the arguments, loading the grammar file, and dealing with the grammar.
2366
2367 ### Argument processing.
2368
2369 Command line options allow the selection of analysis mode, name of output
2370 file, and whether or not a report should be generated.  By default we create
2371 a report only if no code output was requested.
2372
2373 The `parse_XX` function name uses the basename of the output file which
2374 should not have a suffix (`.c`).  `.c` is added to the given name for the
2375 code, and `.h` is added for the header (if header text is specifed in the
2376 grammar file).
2377
2378 ###### includes
2379         #include <getopt.h>
2380
2381 ###### declarations
2382         static const struct option long_options[] = {
2383                 { "LR0",        0, NULL, '0' },
2384                 { "LR05",       0, NULL, '5' },
2385                 { "SLR",        0, NULL, 'S' },
2386                 { "LALR",       0, NULL, 'L' },
2387                 { "LR1",        0, NULL, '1' },
2388                 { "tag",        1, NULL, 't' },
2389                 { "report",     0, NULL, 'R' },
2390                 { "output",     1, NULL, 'o' },
2391                 { NULL,         0, NULL, 0   }
2392         };
2393         const char *options = "05SL1t:Ro:";
2394
2395 ###### process arguments
2396         int opt;
2397         char *outfile = NULL;
2398         char *infile;
2399         char *name;
2400         char *tag = NULL;
2401         int report = 1;
2402         enum grammar_type type = LR05;
2403         while ((opt = getopt_long(argc, argv, options,
2404                                   long_options, NULL)) != -1) {
2405                 switch(opt) {
2406                 case '0':
2407                         type = LR0; break;
2408                 case '5':
2409                         type = LR05; break;
2410                 case 'S':
2411                         type = SLR; break;
2412                 case 'L':
2413                         type = LALR; break;
2414                 case '1':
2415                         type = LR1; break;
2416                 case 'R':
2417                         report = 2; break;
2418                 case 'o':
2419                         outfile = optarg; break;
2420                 case 't':
2421                         tag = optarg; break;
2422                 default:
2423                         fprintf(stderr, "Usage: parsergen ...\n");
2424                         exit(1);
2425                 }
2426         }
2427         if (optind < argc)
2428                 infile = argv[optind++];
2429         else {
2430                 fprintf(stderr, "No input file given\n");
2431                 exit(1);
2432         }
2433         if (outfile && report == 1)
2434                 report = 0;
2435         name = outfile;
2436         if (name && strchr(name, '/'))
2437                 name = strrchr(name, '/')+1;
2438
2439         if (optind < argc) {
2440                 fprintf(stderr, "Excess command line arguments\n");
2441                 exit(1);
2442         }
2443
2444 ### Loading the grammar file
2445
2446 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2447 map it.
2448
2449 Once we have extracted the code (with `mdcode`) we expect to find three
2450 sections: header, code, and grammar.  Anything else that is not
2451 excluded by the `--tag` option is an error.
2452
2453 "header" and "code" are optional, though it is hard to build a working
2454 parser with neither. "grammar" must be provided.
2455
2456 ###### includes
2457         #include <fcntl.h>
2458         #include <sys/mman.h>
2459         #include <errno.h>
2460
2461 ###### functions
2462         static int errs;
2463         static void pr_err(char *msg)
2464         {
2465                 errs++;
2466                 fprintf(stderr, "%s\n", msg);
2467         }
2468
2469 ###### load file
2470         struct section *table;
2471         int fd;
2472         int len;
2473         char *file;
2474         fd = open(infile, O_RDONLY);
2475         if (fd < 0) {
2476                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2477                         infile, strerror(errno));
2478                 exit(1);
2479         }
2480         len = lseek(fd, 0, 2);
2481         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2482         table = code_extract(file, file+len, pr_err);
2483
2484         struct code_node *hdr = NULL;
2485         struct code_node *code = NULL;
2486         struct code_node *gram = NULL;
2487         struct code_node *pre_reduce = NULL;
2488         for (s = table; s; s = s->next) {
2489                 struct text sec = s->section;
2490                 if (tag && !strip_tag(&sec, tag))
2491                         continue;
2492                 if (text_is(sec, "header"))
2493                         hdr = s->code;
2494                 else if (text_is(sec, "code"))
2495                         code = s->code;
2496                 else if (text_is(sec, "grammar"))
2497                         gram = s->code;
2498                 else if (text_is(sec, "reduce"))
2499                         pre_reduce = s->code;
2500                 else {
2501                         fprintf(stderr, "Unknown content section: %.*s\n",
2502                                 s->section.len, s->section.txt);
2503                         rv |= 2;
2504                 }
2505         }
2506
2507 ### Processing the input
2508
2509 As we need to append an extention to a filename and then open it for
2510 writing, and we need to do this twice, it helps to have a separate function.
2511
2512 ###### functions
2513
2514         static FILE *open_ext(char *base, char *ext)
2515         {
2516                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2517                 FILE *f;
2518                 strcat(strcpy(fn, base), ext);
2519                 f = fopen(fn, "w");
2520                 free(fn);
2521                 return f;
2522         }
2523
2524 If we can read the grammar, then we analyse and optionally report on it.  If
2525 the report finds conflicts we will exit with an error status.
2526
2527 ###### process input
2528         struct grammar *g = NULL;
2529         if (gram == NULL) {
2530                 fprintf(stderr, "No grammar section provided\n");
2531                 rv |= 2;
2532         } else {
2533                 g = grammar_read(gram);
2534                 if (!g) {
2535                         fprintf(stderr, "Failure to parse grammar\n");
2536                         rv |= 2;
2537                 }
2538         }
2539         if (g) {
2540                 grammar_analyse(g, type);
2541                 if (report)
2542                         if (grammar_report(g, type))
2543                                 rv |= 1;
2544         }
2545
2546 If a "headers" section is defined, we write it out and include a declaration
2547 for the `parse_XX` function so it can be used from separate code.
2548
2549         if (rv == 0 && hdr && outfile) {
2550                 FILE *f = open_ext(outfile, ".h");
2551                 if (f) {
2552                         code_node_print(f, hdr, infile);
2553                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2554                                 name);
2555                         fclose(f);
2556                 } else {
2557                         fprintf(stderr, "Cannot create %s.h\n",
2558                                 outfile);
2559                         rv |= 4;
2560                 }
2561         }
2562
2563 And if all goes well and an output file was provided, we create the `.c`
2564 file with the code section (if any) and the parser tables and function.
2565
2566         if (rv == 0 && outfile) {
2567                 FILE *f = open_ext(outfile, ".c");
2568                 if (f) {
2569                         if (code)
2570                                 code_node_print(f, code, infile);
2571                         gen_parser(f, g, infile, name, pre_reduce);
2572                         fclose(f);
2573                 } else {
2574                         fprintf(stderr, "Cannot create %s.c\n",
2575                                 outfile);
2576                         rv |= 4;
2577                 }
2578         }
2579
2580 And that about wraps it up.  We need to set the locale so that UTF-8 is
2581 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2582
2583 ###### File: parsergen.mk
2584         parsergen : parsergen.o libscanner.o libmdcode.o
2585                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2586
2587 ###### includes
2588         #include <locale.h>
2589
2590 ###### main
2591
2592         int main(int argc, char *argv[])
2593         {
2594                 struct section *s;
2595                 int rv = 0;
2596
2597                 setlocale(LC_ALL,"");
2598
2599                 ## process arguments
2600                 ## load file
2601                 ## process input
2602
2603                 return rv;
2604         }
2605
2606 ## The SHIFT/REDUCE parser
2607
2608 Having analysed the grammar and generated all the tables, we only need the
2609 shift/reduce engine to bring it all together.
2610
2611 ### Goto table lookup
2612
2613 The parser generator has nicely provided us with goto tables sorted by
2614 symbol number.  We need a binary search function to find a symbol in the
2615 table.
2616
2617 ###### parser functions
2618
2619         static int search(const struct state *l, int sym)
2620         {
2621                 int lo = 0;
2622                 int hi = l->go_to_cnt;
2623
2624                 if (hi == 0)
2625                         return -1;
2626                 while (lo + 1 < hi) {
2627                         int mid = (lo + hi) / 2;
2628                         if (l->go_to[mid].sym <= sym)
2629                                 lo = mid;
2630                         else
2631                                 hi = mid;
2632                 }
2633                 if (l->go_to[lo].sym == sym)
2634                         return l->go_to[lo].state;
2635                 else
2636                         return -1;
2637         }
2638
2639 ### The state stack.
2640
2641 The core data structure for the parser is the stack.  This tracks all the
2642 symbols that have been recognised or partially recognised.
2643
2644 The stack usually won't grow very large - maybe a few tens of entries.  So
2645 we dynamically resize and array as required but never bother to shrink it
2646 down again.
2647
2648 We keep the stack as two separate allocations.  One, `asn_stack` stores the
2649 "abstract syntax nodes" which are created by each reduction.  When we call
2650 `do_reduce` we need to pass an array of the `asn`s of the body of the
2651 production, and by keeping a separate `asn` stack, we can just pass a
2652 pointer into this stack.
2653
2654 The other allocation stores all other stack fields of which there are six.
2655 The `state` is the most important one and guides the parsing process.  The
2656 `sym` is nearly unnecessary.  However when we want to free entries from the
2657 `asn_stack`, it helps to know what type they are so we can call the right
2658 freeing function.  The symbol leads us to the right free function through
2659 `do_free`.
2660
2661 The `indents` count tracks the line indents with-in the symbol or
2662 immediately follow it.  These are used to allow indent information to
2663 guide parsing and error recovery.
2664
2665 `since_newline` tracks how many stack frames since the last
2666 start-of-line (whether indented or not).  So if `since_newline` is
2667 zero, then this symbol is at the start of a line.  Similarly
2668 `since_indent` counts the number of states since an indent, it is zero
2669 precisely when `indents` is not zero.
2670
2671 `newline_permitted` keeps track of whether newlines should be ignored
2672 or not.
2673
2674 The stack is most properly seen as alternating states and symbols -
2675 states, like the 'DOT' in items, are between symbols.  Each frame in
2676 our stack holds a state and the symbol that was before it.  The
2677 bottom of stack holds the start state but no symbol, as nothing came
2678 before the beginning.
2679
2680 ###### parser functions
2681
2682         struct parser {
2683                 struct frame {
2684                         short state;
2685                         short newline_permitted;
2686
2687                         short sym;
2688                         short indents;
2689                         short since_newline;
2690                         short since_indent;
2691                 } *stack;
2692                 void **asn_stack;
2693                 int stack_size;
2694                 int tos;
2695         };
2696
2697 #### Shift and pop
2698
2699 Two operations are needed on the stack - shift (which is like push) and pop.
2700
2701 Shift applies not only to terminals but also to non-terminals.  When
2702 we reduce a production we will pop off entries corresponding to the
2703 body symbols, then push on an item for the head of the production.
2704 This last is exactly the same process as shifting in a terminal so we
2705 use the same function for both.  In both cases we provide the symbol,
2706 the number of indents the symbol contains (which will be zero for a
2707 terminal symbol) and a flag indicating the the symbol was at (or was
2708 reduced from a symbol which was at) the start of a line.  The state is
2709 deduced from the current top-of-stack state and the new symbol.
2710
2711 To simplify other code we arrange for `shift` to fail if there is no `goto`
2712 state for the symbol.  This is useful in basic parsing due to our design
2713 that we shift when we can, and reduce when we cannot.  So the `shift`
2714 function reports if it could.
2715
2716 `shift` is also used to push state zero onto the stack, so if the
2717 stack is empty, it always chooses zero as the next state.
2718
2719 So `shift` finds the next state.  If that succeeds it extends the
2720 allocations if needed and pushes all the information onto the stacks.
2721
2722 Newlines are permitted after a `starts_line` state until an internal
2723 indent.  If the new frame has neither a `starts_line` state nor an
2724 indent, newlines are permitted if the previous stack frame permitted
2725 them.
2726
2727 ###### parser functions
2728
2729         static int shift(struct parser *p,
2730                          short sym, short indents, short start_of_line,
2731                          void *asn,
2732                          const struct state states[])
2733         {
2734                 // Push an entry onto the stack
2735                 struct frame next = {0};
2736                 int newstate = p->tos
2737                         ? search(&states[p->stack[p->tos-1].state],
2738                                  sym)
2739                         : 0;
2740                 if (newstate < 0)
2741                         return 0;
2742                 if (p->tos >= p->stack_size) {
2743                         p->stack_size += 10;
2744                         p->stack = realloc(p->stack, p->stack_size
2745                                            * sizeof(p->stack[0]));
2746                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2747                                            * sizeof(p->asn_stack[0]));
2748                 }
2749                 next.sym = sym;
2750                 next.indents = indents;
2751                 next.state = newstate;
2752                 if (states[newstate].starts_line)
2753                         next.newline_permitted = 1;
2754                 else if (indents)
2755                         next.newline_permitted = 0;
2756                 else if (p->tos)
2757                         next.newline_permitted =
2758                                 p->stack[p->tos-1].newline_permitted;
2759                 else
2760                         next.newline_permitted = 0;
2761
2762                 if (!start_of_line) {
2763                         if (p->tos)
2764                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2765                         else
2766                                 next.since_newline = 1;
2767                 }
2768                 if (indents)
2769                         next.since_indent = 0;
2770                 else if (p->tos)
2771                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2772                 else
2773                         next.since_indent = 1;
2774
2775                 p->stack[p->tos] = next;
2776                 p->asn_stack[p->tos] = asn;
2777                 p->tos++;
2778                 return 1;
2779         }
2780
2781 `pop` primarily moves the top of stack (`tos`) back down the required
2782 amount and frees any `asn` entries that need to be freed.  It also
2783 collects a summary of the indents and line starts in the symbols that
2784 are being removed. It is called _after_ we reduce a production, just
2785 before we `shift` the nonterminal in.
2786
2787 ###### parser functions
2788
2789         static int pop(struct parser *p, int num,
2790                        short *start_of_line,
2791                        void(*do_free)(short sym, void *asn))
2792         {
2793                 int i;
2794                 short indents = 0;
2795                 int sol = 0;
2796
2797                 p->tos -= num;
2798                 for (i = 0; i < num; i++) {
2799                         sol |= !p->stack[p->tos+i].since_newline;
2800                         indents += p->stack[p->tos+i].indents;
2801                         do_free(p->stack[p->tos+i].sym,
2802                                 p->asn_stack[p->tos+i]);
2803                 }
2804                 if (start_of_line)
2805                         *start_of_line = sol;
2806                 return indents;
2807         }
2808
2809 ### Memory allocation
2810
2811 The `scanner` returns tokens in a local variable - we want them in allocated
2812 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2813 a reduce is in a large buffer.  Both of these require some allocation and
2814 copying, hence `memdup` and `tokcopy`.
2815
2816 ###### parser includes
2817         #include <memory.h>
2818
2819 ###### parser functions
2820
2821         void *memdup(void *m, int len)
2822         {
2823                 void *ret;
2824                 if (len == 0)
2825                         return NULL;
2826                 ret = malloc(len);
2827                 memcpy(ret, m, len);
2828                 return ret;
2829         }
2830
2831         static struct token *tok_copy(struct token tk)
2832         {
2833                 struct token *new = malloc(sizeof(*new));
2834                 *new = tk;
2835                 return new;
2836         }
2837
2838 ### The heart of the parser.
2839
2840 Now we have the parser.  If we can shift we do, though newlines and
2841 reducing indenting may block that.  If not and we can reduce we do
2842 that.  If the production we reduced was production zero, then we have
2843 accepted the input and can finish.
2844
2845 We return whatever `asn` was returned by reducing production zero.
2846
2847 If we can neither shift nor reduce we have an error to handle.  We pop
2848 single entries off the stack until we can shift the `TK_error` symbol,
2849 then drop input tokens until we find one we can shift into the new error
2850 state.  We need to ensure that something is dropped or shifted after an
2851 error, or we could get into an infinite loop, only shifting in
2852 `TK_error`, then reducing.  So we track if there has been a shift since
2853 the last error, and if not the next error always discards one token.
2854
2855 When we find `TK_in` and `TK_out` tokens which report indents we need
2856 to handle them directly as the grammar cannot express what we want to
2857 do with them.
2858
2859 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2860 record how many indents there are following the previous token.
2861
2862 `TK_out` tokens must be canceled against an indent count
2863 within the stack.  If we can reduce some symbols that are all since
2864 the most recent indent, then we do that first.  If the minimum prefix
2865 of the current state then extends back before the most recent indent,
2866 that indent can be cancelled.  If the minimum prefix is shorter then
2867 the indent had ended prematurely and we must start error handling, which
2868 is still a work-in-progress.
2869
2870 `TK_newline` tokens are ignored unless the top stack frame records
2871 that they are permitted.  In that case they will not be considered for
2872 shifting if it is possible to reduce some symbols that are all since
2873 the most recent start of line.  This is how a newline forcibly
2874 terminates any line-like structure - we try to reduce down to at most
2875 one symbol for each line where newlines are allowed.
2876 A consequence of this is that a rule like
2877
2878 ###### Example: newlines - broken
2879
2880         Newlines ->
2881                 | NEWLINE Newlines
2882         IfStatement -> Newlines if ....
2883
2884 cannot work, as the NEWLINE will never be shifted as the empty string
2885 will be reduced first.  Optional sets of newlines need to be include
2886 in the thing that preceed:
2887
2888 ###### Example: newlines - works
2889
2890         If -> if
2891                 | NEWLINE If
2892         IfStatement -> If ....
2893
2894 Here the NEWLINE will be shifted because nothing can be reduced until
2895 the `if` is seen.
2896
2897 When during error handling we discard tokens read in, we want to keep
2898 discarding until we see one that is recognised.  If we had a full set
2899 of LR(1) grammar states, this would mean looking in the look-ahead set,
2900 but we don't keep a full look-ahead set.  We only record the subset
2901 that leads to SHIFT.  We can, however, deduce the look-ahead set by
2902 looking at the SHIFT subsets for all states that we can get to by
2903 reducing zero or more times.  So we need a little function which
2904 checks if a given token is in any of these look-ahead sets.
2905
2906 ###### parser includes
2907         #include "parser.h"
2908
2909 ###### parser_run
2910
2911         static int in_lookahead(struct token *tk, const struct state *states, int state)
2912         {
2913                 while (state >= 0) {
2914                         if (search(&states[state], tk->num) >= 0)
2915                                 return 1;
2916                         if (states[state].reduce_prod < 0)
2917                                 return 0;
2918                         state = search(&states[state], states[state].reduce_sym);
2919                 }
2920                 return 0;
2921         }
2922
2923         void *parser_run(struct token_state *tokens,
2924                          const struct state states[],
2925                          int (*do_reduce)(int, void**, struct token_config*, void*),
2926                          void (*do_free)(short, void*),
2927                          FILE *trace, const char *non_term[],
2928                          struct token_config *config)
2929         {
2930                 struct parser p = { 0 };
2931                 struct token *tk = NULL;
2932                 int accepted = 0;
2933                 int shift_since_err = 1;
2934                 void *ret = NULL;
2935
2936                 shift(&p, TK_eof, 0, 1, NULL, states);
2937                 while (!accepted && p.tos > 0) {
2938                         struct token *err_tk;
2939                         struct frame *tos = &p.stack[p.tos-1];
2940                         if (!tk)
2941                                 tk = tok_copy(token_next(tokens));
2942                         parser_trace(trace, &p,
2943                                      tk, states, non_term, config->known_count);
2944
2945                         if (tk->num == TK_in) {
2946                                 tos->indents += 1;
2947                                 tos->since_newline = 0;
2948                                 tos->since_indent = 0;
2949                                 if (!states[tos->state].starts_line)
2950                                         tos->newline_permitted = 0;
2951                                 free(tk);
2952                                 tk = NULL;
2953                                 parser_trace_action(trace, "Record");
2954                                 continue;
2955                         }
2956                         if (tk->num == TK_out) {
2957                                 if (states[tos->state].reduce_size >= 0 &&
2958                                     states[tos->state].reduce_size <= tos->since_indent)
2959                                         goto force_reduce;
2960                                 if (states[tos->state].min_prefix >= tos->since_indent) {
2961                                         // OK to cancel
2962                                         struct frame *in = tos - tos->since_indent;
2963                                         in->indents -= 1;
2964                                         if (in->indents == 0) {
2965                                                 /* Reassess since_indent and newline_permitted */
2966                                                 if (in > p.stack) {
2967                                                         in->since_indent = in[-1].since_indent + 1;
2968                                                         in->newline_permitted = in[-1].newline_permitted;
2969                                                 } else {
2970                                                         in->since_indent = 0;
2971                                                         in->newline_permitted = 0;
2972                                                 }
2973                                                 if (states[in->state].starts_line)
2974                                                         in->newline_permitted = 1;
2975                                                 while (in < tos) {
2976                                                         in += 1;
2977                                                         in->since_indent = in[-1].since_indent + 1;
2978                                                         if (states[in->state].starts_line)
2979                                                                 in->newline_permitted = 1;
2980                                                         else
2981                                                                 in->newline_permitted = in[-1].newline_permitted;
2982                                                 }
2983                                         }
2984                                         free(tk);
2985                                         tk = NULL;
2986                                         parser_trace_action(trace, "Cancel");
2987                                         continue;
2988                                 }
2989                                 // fall through to error handling as both SHIFT and REDUCE
2990                                 // will fail.
2991                         }
2992                         if (tk->num == TK_newline) {
2993                                 if (!tos->newline_permitted) {
2994                                         free(tk);
2995                                         tk = NULL;
2996                                         parser_trace_action(trace, "Discard");
2997                                         continue;
2998                                 }
2999                                 if (tos->since_newline > 1 &&
3000                                     states[tos->state].reduce_size >= 0 &&
3001                                     states[tos->state].reduce_size <= tos->since_newline)
3002                                         goto force_reduce;
3003                         }
3004                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
3005                                 shift_since_err = 1;
3006                                 tk = NULL;
3007                                 parser_trace_action(trace, "Shift");
3008                                 continue;
3009                         }
3010                 force_reduce:
3011                         if (states[tos->state].reduce_prod >= 0 &&
3012                             states[tos->state].newline_only &&
3013                             !(tk->num == TK_newline ||
3014                               tk->num == TK_eof ||
3015                               tk->num == TK_out ||
3016                               (tos->indents == 0 && tos->since_newline == 0))) {
3017                                 /* Anything other than newline or out or eof
3018                                  * in an error unless we are already at start
3019                                  * of line, as this production must end at EOL.
3020                                  */
3021                         } else if (states[tos->state].reduce_prod >= 0) {
3022                                 void **body;
3023                                 void *res;
3024                                 const struct state *nextstate = &states[tos->state];
3025                                 int prod = nextstate->reduce_prod;
3026                                 int size = nextstate->reduce_size;
3027                                 int bufsize;
3028                                 static char buf[16*1024];
3029                                 short indents, start_of_line;
3030
3031                                 body = p.asn_stack + (p.tos - size);
3032
3033                                 bufsize = do_reduce(prod, body, config, buf);
3034
3035                                 indents = pop(&p, size, &start_of_line,
3036                                               do_free);
3037                                 res = memdup(buf, bufsize);
3038                                 memset(buf, 0, bufsize);
3039                                 if (!shift(&p, nextstate->reduce_sym,
3040                                            indents, start_of_line,
3041                                            res, states)) {
3042                                         if (prod != 0) abort();
3043                                         accepted = 1;
3044                                         ret = res;
3045                                 }
3046                                 parser_trace_action(trace, "Reduce");
3047                                 continue;
3048                         }
3049                         /* Error. We walk up the stack until we
3050                          * find a state which will accept TK_error.
3051                          * We then shift in TK_error and see what state
3052                          * that takes us too.
3053                          * Then we discard input tokens until
3054                          * we find one that is acceptable.
3055                          */
3056                         parser_trace_action(trace, "ERROR");
3057                         short indents = 0, start_of_line;
3058
3059                         err_tk = tok_copy(*tk);
3060                         while (p.tos > 0 &&
3061                                shift(&p, TK_error, 0, 0,
3062                                      err_tk, states) == 0)
3063                                 // discard this state
3064                                 indents += pop(&p, 1, &start_of_line, do_free);
3065                         if (p.tos == 0) {
3066                                 free(err_tk);
3067                                 // no state accepted TK_error
3068                                 break;
3069                         }
3070                         if (!shift_since_err) {
3071                                 /* must discard at least one token to avoid
3072                                  * infinite loop.
3073                                  */
3074                                 if (tk->num == TK_eof)
3075                                         break;
3076                                 free(tk);
3077                                 tk = tok_copy(token_next(tokens));
3078                         }
3079                         shift_since_err = 0;
3080                         tos = &p.stack[p.tos-1];
3081                         while (!in_lookahead(tk, states, tos->state) &&
3082                                tk->num != TK_eof) {
3083                                 free(tk);
3084                                 tk = tok_copy(token_next(tokens));
3085                                 shift_since_err = 1;
3086                                 if (tk->num == TK_in)
3087                                         indents += 1;
3088                                 if (tk->num == TK_out) {
3089                                         if (indents == 0)
3090                                                 break;
3091                                         indents -= 1;
3092                                         // FIXME update since_indent here
3093                                 }
3094                         }
3095                         tos->indents += indents;
3096                 }
3097                 free(tk);
3098                 pop(&p, p.tos, NULL, do_free);
3099                 free(p.asn_stack);
3100                 free(p.stack);
3101                 return ret;
3102         }
3103
3104 ###### exported functions
3105         void *parser_run(struct token_state *tokens,
3106                          const struct state states[],
3107                          int (*do_reduce)(int, void**, struct token_config*, void*),
3108                          void (*do_free)(short, void*),
3109                          FILE *trace, const char *non_term[],
3110                          struct token_config *config);
3111
3112 ### Tracing
3113
3114 Being able to visualize the parser in action can be invaluable when
3115 debugging the parser code, or trying to understand how the parse of a
3116 particular grammar progresses.  The stack contains all the important
3117 state, so just printing out the stack every time around the parse loop
3118 can make it possible to see exactly what is happening.
3119
3120 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3121 are easily deduced from the change between consecutive lines, and the
3122 details of each state can be found by cross referencing the states list
3123 in the stack with the "report" that parsergen can generate.
3124
3125 For terminal symbols, we just dump the token.  For non-terminals we
3126 print the name of the symbol.  The look ahead token is reported at the
3127 end inside square brackets.
3128
3129 ###### parser functions
3130
3131         static char *reserved_words[] = {
3132                 [TK_error]        = "ERROR",
3133                 [TK_in]           = "IN",
3134                 [TK_out]          = "OUT",
3135                 [TK_newline]      = "NEWLINE",
3136                 [TK_eof]          = "$eof",
3137         };
3138         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
3139         {
3140                 fprintf(trace, "(%d", f->state);
3141                 if (states[f->state].starts_line)
3142                         fprintf(trace, "s");
3143                 if (f->newline_permitted)
3144                         fprintf(trace, "n%d", f->since_newline);
3145                 fprintf(trace, ") ");
3146         }
3147
3148         void parser_trace(FILE *trace, struct parser *p,
3149                           struct token *tk, const struct state states[],
3150                           const char *non_term[], int knowns)
3151         {
3152                 int i;
3153                 if (!trace)
3154                         return;
3155                 for (i = 0; i < p->tos; i++) {
3156                         struct frame *f = &p->stack[i];
3157                         if (i) {
3158                                 int sym = f->sym;
3159                                 if (sym < TK_reserved &&
3160                                     reserved_words[sym] != NULL)
3161                                         fputs(reserved_words[sym], trace);
3162                                 else if (sym < TK_reserved + knowns) {
3163                                         struct token *t = p->asn_stack[i];
3164                                         text_dump(trace, t->txt, 20);
3165                                 } else
3166                                         fputs(non_term[sym - TK_reserved - knowns],
3167                                               trace);
3168                                 if (f->indents)
3169                                         fprintf(trace, ".%d", f->indents);
3170                                 if (f->since_newline == 0)
3171                                         fputs("/", trace);
3172                                 fputs(" ", trace);
3173                         }
3174                         parser_trace_state(trace, f, states);
3175                 }
3176                 fprintf(trace, "[");
3177                 if (tk->num < TK_reserved &&
3178                     reserved_words[tk->num] != NULL)
3179                         fputs(reserved_words[tk->num], trace);
3180                 else
3181                         text_dump(trace, tk->txt, 20);
3182                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3183         }
3184
3185         void parser_trace_action(FILE *trace, char *action)
3186         {
3187                 if (trace)
3188                         fprintf(trace, " - %s\n", action);
3189         }
3190
3191 # A Worked Example
3192
3193 The obvious example for a parser is a calculator.
3194
3195 As `scanner` provides number parsing function using `libgmp` is it not much
3196 work to perform arbitrary rational number calculations.
3197
3198 This calculator takes one expression, or an equality test, per line.  The
3199 results are printed and if any equality test fails, the program exits with
3200 an error.
3201
3202 ###### File: parsergen.mk
3203         calc.c calc.h : parsergen parsergen.mdc
3204                 ./parsergen --tag calc -o calc parsergen.mdc
3205         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3206                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3207         calctest : calc
3208                 ./calc parsergen.mdc
3209         demos :: calctest
3210
3211 # calc: header
3212
3213         #include "parse_number.h"
3214         // what do we use for a demo-grammar?  A calculator of course.
3215         struct number {
3216                 mpq_t val;
3217                 char tail[2];
3218                 int err;
3219         };
3220
3221 # calc: code
3222
3223         #include <stdlib.h>
3224         #include <unistd.h>
3225         #include <fcntl.h>
3226         #include <sys/mman.h>
3227         #include <stdio.h>
3228         #include <malloc.h>
3229         #include <gmp.h>
3230         #include <string.h>
3231         #include "mdcode.h"
3232         #include "scanner.h"
3233         #include "parser.h"
3234
3235         #include "calc.h"
3236
3237         static void free_number(struct number *n)
3238         {
3239                 mpq_clear(n->val);
3240                 free(n);
3241         }
3242
3243         static int text_is(struct text t, char *s)
3244         {
3245                 return (strlen(s) == t.len &&
3246                         strncmp(s, t.txt, t.len) == 0);
3247         }
3248
3249         int main(int argc, char *argv[])
3250         {
3251                 int fd = open(argv[1], O_RDONLY);
3252                 int len = lseek(fd, 0, 2);
3253                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3254                 struct section *table = code_extract(file, file+len, NULL);
3255                 struct section *s;
3256                 struct token_config config = {
3257                         .ignored = (1 << TK_line_comment)
3258                                  | (1 << TK_in)
3259                                  | (1 << TK_out),
3260                         .number_chars = ".,_+-",
3261                         .word_start = "",
3262                         .word_cont = "",
3263                 };
3264                 for (s = table; s; s = s->next)
3265                         if (text_is(s->section, "example: input"))
3266                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3267                 while (table) {
3268                         struct section *t = table->next;
3269                         code_free(table->code);
3270                         free(table);
3271                         table = t;
3272                 }
3273                 exit(0);
3274         }
3275
3276 # calc: grammar
3277
3278         $LEFT + -
3279         $LEFT * / //
3280
3281         Session -> Session Line
3282                 | Line
3283
3284         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3285                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3286                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3287                                         mpf_clear(fl);
3288                                         }
3289                                      }$
3290                 | Expression = Expression NEWLINE ${
3291                         if (mpq_equal($1.val, $3.val))
3292                                 gmp_printf("Both equal %Qd\n", $1.val);
3293                         else {
3294                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3295                                         $1.val, $3.val);
3296                                 exit(1);
3297                         }
3298                 }$
3299                 | NEWLINE ${ printf("Blank line\n"); }$
3300                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3301
3302         $number
3303         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3304                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3305                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3306                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3307                 | Expression // Expression ${ {
3308                         mpz_t z0, z1, z2;
3309                         mpq_init($0.val);
3310                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3311                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3312                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3313                         mpz_tdiv_q(z0, z1, z2);
3314                         mpq_set_z($0.val, z0);
3315                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3316                 } }$
3317                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3318                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3319
3320 # example: input
3321
3322         355/113
3323         3.1415926535 - 355/113
3324         2 + 4 * 5
3325         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3326         10 * 9 / 2
3327         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3328
3329         355//113
3330
3331         error