ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 There are several distinct sections.
   8
   9 1. `grammar_read` will read a grammar from a literate-code file and
  10    store the productions, symbols, and code fragments.
  11 2. `grammar_analyse` will take that grammar and build LR parsing
  12    states and look-ahead tables.
  13 3. `grammar_report` will present the details of the analysis
  14    in a readable format and will report any conflicts.
  15 4. `parser_generate` will write out C code files with various
  16    tables and with the code fragments arranged in useful places.
  17 5. `parser_run` is a library function intended to be linked together
  18    with the generated parser tables to complete the implementation of
  19    a parser.
  20 6. Finally `calc` is a test grammar for a simple calculator.  The
  21    `parsergen` program built from the C code in this file can extract
  22    that grammar directly from this file and process it.
  23
  24 ###### File: parsergen.c
  25         #include <unistd.h>
  26         #include <stdlib.h>
  27         #include <stdio.h>
  28         ## includes
  29         ## forward declarations
  30         ## declarations
  31         ## functions
  32         ## grammar_read
  33         ## grammar_analyse
  34         ## grammar_report
  35         ## parser_generate
  36         ## main
  37 ###### File: parser.h
  38         ## exported types
  39         ## exported functions
  40 ###### File: libparser.c
  41         #include <unistd.h>
  42         #include <stdlib.h>
  43         #include <stdio.h>
  44         ## parser includes
  45         ## parser functions
  46         ## parser_run
  47 ###### File: parsergen.mk
  48         CFLAGS += -Wall -g
  49         all :: parsergen calc
  50         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  51                 ./md2c parsergen.mdc
  52
  53 ## Reading the grammar
  54
  55 The grammar must be stored in a markdown literate code file as parsed
  56 by "[mdcode][]".  It must have three top level (i.e. unreferenced)
  57 sections: `header`, `code`, and `grammar`.  The first two will be
  58 literally copied into the generated `.c` and `.h`. files.  The last
  59 contains the grammar.  This is tokenised with "[scanner][]".
  60
  61 If the `--tag` option is given, then any top level header that doesn't
  62 start with the tag is ignored, and the tag is striped from the rest.  So
  63 `--tag Foo`
  64 means that the three needed sections must be `Foo: header`, `Foo: code`,
  65 and `Foo: grammar`.  The tag `calc` is used to extract the same calculator
  66 from this file.
  67
  68 [mdcode]: mdcode.html
  69 [scanner]: scanner.html
  70
  71 ###### includes
  72         #include "mdcode.h"
  73         #include "scanner.h"
  74
  75 ###### parser includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 The grammar contains production sets, precedence/associativity
  80 declarations, and data type declarations.  These are all parsed with
  81 _ad hoc_ parsing as we don't have a parser generator yet.
  82
  83 The precedence and associativity can be set for each production, but
  84 can be inherited from symbols.  The data type (either structure or a
  85 reference to a structure) is potentially defined for each non-terminal
  86 and describes what C structure is used to store information about each
  87 symbol.
  88
  89 ###### declarations
  90         enum assoc {Left, Right, Non};
  91         char *assoc_names[] = {"Left","Right","Non"};
  92
  93         struct symbol {
  94                 struct text struct_name;
  95                 int isref;
  96                 enum assoc assoc;
  97                 unsigned short precedence;
  98                 ## symbol fields
  99         };
 100         struct production {
 101                 unsigned short precedence;
 102                 enum assoc assoc;
 103                 char line_like;
 104                 ## production fields
 105         };
 106         struct grammar {
 107                 ## grammar fields
 108         };
 109
 110 The strings reported by `mdcode` and `scanner` are `struct text` which have
 111 length rather than being null terminated.  To help with printing and
 112 comparing we define `text_is` and `prtxt`, which should possibly go in
 113 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 114 which might contain control characters.
 115
 116 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 117 because that is what we need to detect tags.
 118
 119 ###### functions
 120         static int text_is(struct text t, char *s)
 121         {
 122                 return (strlen(s) == t.len &&
 123                         strncmp(s, t.txt, t.len) == 0);
 124         }
 125         static void prtxt(struct text t)
 126         {
 127                 printf("%.*s", t.len, t.txt);
 128         }
 129
 130         static int strip_tag(struct text *t, char *tag)
 131         {
 132                 int skip = strlen(tag) + 1;
 133                 if (skip >= t->len ||
 134                     strncmp(t->txt, tag, skip-1) != 0 ||
 135                     t->txt[skip-1] != ':')
 136                         return 0;
 137                 while (skip < t->len && t->txt[skip] == ' ')
 138                         skip++;
 139                 t->len -= skip;
 140                 t->txt += skip;
 141                 return 1;
 142         }
 143
 144 ### Symbols
 145
 146 Productions are comprised primarily of symbols - terminal and
 147 non-terminal.  We do not make a syntactic distinction between the two,
 148 though non-terminals must be identifiers.  Non-terminal symbols are
 149 those which appear in the head of a production, terminal symbols are
 150 those which don't.  There are also "virtual" symbols used for precedence
 151 marking discussed later, and sometimes we won't know what type a symbol
 152 is yet.
 153
 154 ###### forward declarations
 155         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 156         char *symtypes = "UVTN";
 157 ###### symbol fields
 158         enum symtype type;
 159
 160 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 161 table of known symbols and the resulting parser will report them as
 162 `TK_reserved + N`.  A small set of identifiers are reserved for the
 163 different token types that `scanner` can report.
 164
 165 ###### declarations
 166         static char *reserved_words[] = {
 167                 [TK_error]        = "ERROR",
 168                 [TK_number]       = "NUMBER",
 169                 [TK_ident]        = "IDENTIFIER",
 170                 [TK_mark]         = "MARK",
 171                 [TK_string]       = "STRING",
 172                 [TK_multi_string] = "MULTI_STRING",
 173                 [TK_in]           = "IN",
 174                 [TK_out]          = "OUT",
 175                 [TK_newline]      = "NEWLINE",
 176                 [TK_eof]          = "$eof",
 177         };
 178 ###### symbol fields
 179         short num;
 180
 181 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 182 recognised.  The former is automatically expected at the end of the text
 183 being parsed. The latter are always ignored by the parser.
 184
 185 All of these symbols are stored in a simple symbol table.  We use the
 186 `struct text` from `mdcode` to store the name and link them together
 187 into a sorted list using an insertion sort.
 188
 189 We don't have separate `find` and `insert` functions as any symbol we
 190 find needs to be remembered.  We simply expect `find` to always return a
 191 symbol, but its type might be `Unknown`.
 192
 193 ###### includes
 194         #include <string.h>
 195
 196 ###### symbol fields
 197         struct text name;
 198         struct symbol *next;
 199
 200 ###### grammar fields
 201         struct symbol *syms;
 202         int num_syms;
 203
 204 ###### functions
 205         static struct symbol *sym_find(struct grammar *g, struct text s)
 206         {
 207                 struct symbol **l = &g->syms;
 208                 struct symbol *n;
 209                 int cmp = 1;
 210
 211                 while (*l &&
 212                         (cmp = text_cmp((*l)->name, s)) < 0)
 213                                 l = & (*l)->next;
 214                 if (cmp == 0)
 215                         return *l;
 216                 n = calloc(1, sizeof(*n));
 217                 n->name = s;
 218                 n->type = Unknown;
 219                 n->next = *l;
 220                 n->num = -1;
 221                 *l = n;
 222                 return n;
 223         }
 224
 225         static void symbols_init(struct grammar *g)
 226         {
 227                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 228                 int i;
 229                 for (i = 0; i < entries; i++) {
 230                         struct text t;
 231                         struct symbol *s;
 232                         t.txt = reserved_words[i];
 233                         if (!t.txt)
 234                                 continue;
 235                         t.len = strlen(t.txt);
 236                         s = sym_find(g, t);
 237                         s->type = Terminal;
 238                         s->num = i;
 239                 }
 240         }
 241
 242 ### Data types and precedence.
 243
 244 Data type specification and precedence specification are both
 245 introduced by a dollar sign at the start of the line.  If the next
 246 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 247 precedence, otherwise it specifies a data type.
 248
 249 The data type name is simply stored and applied to the head of all
 250 subsequent productions.  It must be the name of a structure optionally
 251 preceded by an asterisk which means a reference or "pointer".  So
 252 `$expression` maps to `struct expression` and `$*statement` maps to
 253 `struct statement *`.
 254
 255 Any productions given before the first data type declaration will have
 256 no data type associated with them and can carry no information.  In
 257 order to allow other non-terminals to have no type, the data type
 258 `$void` can be given.  This does *not* mean that `struct void` will be
 259 used, but rather than no type will be associated with future
 260 non-terminals.
 261
 262 The precedence line must contain a list of symbols - typically
 263 terminal symbols, but not necessarily.  It can only contain symbols
 264 that have not been seen yet, so precedence declaration must precede
 265 the productions that they affect.
 266
 267 A precedence line may also contain a "virtual" symbol which is an
 268 identifier preceded by `$$`.  Virtual terminals carry precedence
 269 information but are not included in the grammar.  A production can
 270 declare that it inherits the precedence of a given virtual symbol.
 271
 272 This use for `$$` precludes it from being used as a symbol in the
 273 described language.  Two other symbols: `${` and `}$` are also
 274 unavailable.
 275
 276 Each new precedence line introduces a new precedence level and
 277 declares how it associates.  This level is stored in each symbol
 278 listed and may be inherited by any production which uses the symbol.  A
 279 production inherits from the last symbol which has a precedence.
 280
 281 The symbols on the first precedence line have the lowest precedence.
 282 Subsequent lines introduce symbols with higher precedence.
 283
 284 ###### grammar fields
 285         struct text current_type;
 286         int type_isref;
 287         int prec_levels;
 288
 289 ###### declarations
 290         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 291         static const char *known[] = { "$$", "${", "}$" };
 292
 293 ###### functions
 294         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 295         {
 296                 struct token t = token_next(ts);
 297                 char *err;
 298                 enum assoc assoc;
 299                 int found;
 300
 301                 if (t.num != TK_ident) {
 302                         err = "type or assoc expected after '$'";
 303                         goto abort;
 304                 }
 305                 if (text_is(t.txt, "LEFT"))
 306                         assoc = Left;
 307                 else if (text_is(t.txt, "RIGHT"))
 308                         assoc = Right;
 309                 else if (text_is(t.txt, "NON"))
 310                         assoc = Non;
 311                 else {
 312                         g->current_type = t.txt;
 313                         g->type_isref = isref;
 314                         if (text_is(t.txt, "void"))
 315                                 g->current_type.txt = NULL;
 316                         t = token_next(ts);
 317                         if (t.num != TK_newline) {
 318                                 err = "Extra tokens after type name";
 319                                 goto abort;
 320                         }
 321                         return NULL;
 322                 }
 323
 324                 if (isref) {
 325                         err = "$* cannot be followed by a precedence";
 326                         goto abort;
 327                 }
 328
 329                 // This is a precedence line, need some symbols.
 330                 found = 0;
 331                 g->prec_levels += 1;
 332                 t = token_next(ts);
 333                 while (t.num != TK_newline) {
 334                         enum symtype type = Terminal;
 335                         struct symbol *s;
 336                         if (t.num == TK_virtual) {
 337                                 type = Virtual;
 338                                 t = token_next(ts);
 339                                 if (t.num != TK_ident) {
 340                                         err = "$$ must be followed by a word";
 341                                         goto abort;
 342                                 }
 343                         } else if (t.num != TK_ident &&
 344                                    t.num != TK_mark) {
 345                                 err = "Illegal token in precedence line";
 346                                 goto abort;
 347                         }
 348                         s = sym_find(g, t.txt);
 349                         if (s->type != Unknown) {
 350                                 err = "Symbols in precedence line must not already be known.";
 351                                 goto abort;
 352                         }
 353                         s->type = type;
 354                         s->precedence = g->prec_levels;
 355                         s->assoc = assoc;
 356                         found += 1;
 357                         t = token_next(ts);
 358                 }
 359                 if (found == 0)
 360                         err = "No symbols given on precedence line";
 361                         goto abort;
 362                 return NULL;
 363         abort:
 364                 while (t.num != TK_newline && t.num != TK_eof)
 365                         t = token_next(ts);
 366                 return err;
 367         }
 368
 369 ### Productions
 370
 371 A production either starts with an identifier which is the head
 372 non-terminal, or a vertical bar (`|`) in which case this production
 373 uses the same head as the previous one.  The identifier must be
 374 followed by a `->` mark.  All productions for a given non-terminal must
 375 be grouped together with the `nonterminal ->` given only once.
 376
 377 After this a (possibly empty) sequence of identifiers and marks form
 378 the body of the production.  A virtual symbol may be given after the
 379 body by preceding it with `$$`.  If a virtual symbol is given, the
 380 precedence of the production is that for the virtual symbol.  If none
 381 is given, the precedence is inherited from the last symbol in the
 382 production which has a precedence specified.
 383
 384 After the optional precedence may come the `${` mark.  This indicates
 385 the start of a code fragment.  If present, this must be on the same
 386 line as the start of the production.
 387
 388 All of the text from the `${` through to the matching `}$` is
 389 collected and forms the code-fragment for the production.  It must all
 390 be in one `code_node` of the literate code.  The `}$` must be
 391 at the end of a line.
 392
 393 Text in the code fragment will undergo substitutions where `$N` or
 394 `$<N`,for some numeric `N`, will be replaced with a variable holding
 395 the parse information for the particular symbol in the production.
 396 `$0` is the head of the production, `$1` is the first symbol of the
 397 body, etc.  The type of `$N` for a terminal symbol is `struct token`.
 398 For a non-terminal, it is whatever has been declared for that symbol.
 399 The `<` may be included for symbols declared as storing a reference
 400 (not a structure) and means that the reference is being moved out, so
 401 it will not automatically be freed.
 402
 403 While building productions we will need to add to an array which needs to
 404 grow dynamically.
 405
 406 ###### functions
 407         static void array_add(void *varray, int *cnt, void *new)
 408         {
 409                 void ***array = varray;
 410                 int current = 0;
 411                 const int step = 8;
 412                 current = ((*cnt-1) | (step-1))+1;
 413                 if (*cnt == current) {
 414                         /* must grow */
 415                         current += step;
 416                         *array = realloc(*array, current * sizeof(void*));
 417                 }
 418                 (*array)[*cnt] = new;
 419                 (*cnt) += 1;
 420         }
 421
 422 Collecting the code fragment simply involves reading tokens until we
 423 hit the end token `}$`, and noting the character position of the start and
 424 the end.
 425
 426 ###### functions
 427         static struct text collect_code(struct token_state *state,
 428                                         struct token start)
 429         {
 430                 struct text code;
 431                 struct token t;
 432                 code.txt = start.txt.txt + start.txt.len;
 433                 do
 434                         t = token_next(state);
 435                 while (t.node == start.node &&
 436                        t.num != TK_close && t.num != TK_error &&
 437                        t.num != TK_eof);
 438                 if (t.num == TK_close && t.node == start.node)
 439                         code.len = t.txt.txt - code.txt;
 440                 else
 441                         code.txt = NULL;
 442                 return code;
 443         }
 444
 445 Now we have all the bits we need to parse a full production.
 446
 447 ###### includes
 448         #include <memory.h>
 449
 450 ###### grammar fields
 451         struct production **productions;
 452         int production_count;
 453
 454 ###### production fields
 455         struct symbol  *head;
 456         struct symbol **body;
 457         int             body_size;
 458         struct text     code;
 459         int             code_line;
 460
 461 ###### symbol fields
 462         int first_production;
 463
 464 ###### functions
 465         static char *parse_production(struct grammar *g,
 466                                       struct symbol *head,
 467                                       struct token_state *state)
 468         {
 469                 /* Head has already been parsed. */
 470                 struct token tk;
 471                 char *err;
 472                 struct production p, *pp;
 473
 474                 memset(&p, 0, sizeof(p));
 475                 p.head = head;
 476                 tk = token_next(state);
 477                 while (tk.num == TK_ident || tk.num == TK_mark) {
 478                         struct symbol *bs = sym_find(g, tk.txt);
 479                         if (bs->type == Unknown)
 480                                 bs->type = Terminal;
 481                         if (bs->type == Virtual) {
 482                                 err = "Virtual symbol not permitted in production";
 483                                 goto abort;
 484                         }
 485                         if (bs->precedence) {
 486                                 p.precedence = bs->precedence;
 487                                 p.assoc = bs->assoc;
 488                         }
 489                         array_add(&p.body, &p.body_size, bs);
 490                         tk = token_next(state);
 491                 }
 492                 if (tk.num == TK_virtual) {
 493                         struct symbol *vs;
 494                         tk = token_next(state);
 495                         if (tk.num != TK_ident) {
 496                                 err = "word required after $$";
 497                                 goto abort;
 498                         }
 499                         vs = sym_find(g, tk.txt);
 500                         if (vs->num == TK_newline)
 501                                 p.line_like = 1;
 502                         else if (vs->num == TK_out)
 503                                 p.line_like = 2;
 504                         else if (vs->precedence == 0) {
 505                                 err = "symbol after $$ must have precedence";
 506                                 goto abort;
 507                         } else {
 508                                 p.precedence = vs->precedence;
 509                                 p.assoc = vs->assoc;
 510                         }
 511                         tk = token_next(state);
 512                 }
 513                 if (tk.num == TK_open) {
 514                         p.code_line = tk.line;
 515                         p.code = collect_code(state, tk);
 516                         if (p.code.txt == NULL) {
 517                                 err = "code fragment not closed properly";
 518                                 goto abort;
 519                         }
 520                         tk = token_next(state);
 521                 }
 522                 if (tk.num != TK_newline && tk.num != TK_eof) {
 523                         err = "stray tokens at end of line";
 524                         goto abort;
 525                 }
 526                 pp = malloc(sizeof(*pp));
 527                 *pp = p;
 528                 array_add(&g->productions, &g->production_count, pp);
 529                 return NULL;
 530         abort:
 531                 while (tk.num != TK_newline && tk.num != TK_eof)
 532                         tk = token_next(state);
 533                 return err;
 534         }
 535
 536 With the ability to parse production and dollar-lines, we have nearly all
 537 that we need to parse a grammar from a `code_node`.
 538
 539 The head of the first production will effectively be the `start` symbol of
 540 the grammar.  However it won't _actually_ be so.  Processing the grammar is
 541 greatly simplified if the real start symbol only has a single production,
 542 and expects `$eof` as the final terminal.  So when we find the first
 543 explicit production we insert an extra production as production zero which
 544 looks like
 545
 546 ###### Example: production 0
 547         $start -> START $eof
 548
 549 where `START` is the first non-terminal given.
 550
 551 ###### create production zero
 552         struct production *p = calloc(1,sizeof(*p));
 553         struct text start = {"$start",6};
 554         struct text eof = {"$eof",4};
 555         struct text code = {"$0 = $<1;", 9};
 556         p->head = sym_find(g, start);
 557         p->head->type = Nonterminal;
 558         p->head->struct_name = g->current_type;
 559         p->head->isref = g->type_isref;
 560         if (g->current_type.txt)
 561                 p->code = code;
 562         array_add(&p->body, &p->body_size, head);
 563         array_add(&p->body, &p->body_size, sym_find(g, eof));
 564         p->head->first_production = g->production_count;
 565         array_add(&g->productions, &g->production_count, p);
 566
 567 Now we are ready to read in the grammar.  We ignore comments
 568 and strings so that the marks which introduce them can be
 569 used as terminals in the grammar.  We don't ignore numbers
 570 even though we don't allow them as that causes the scanner
 571 to produce errors that the parser is better positioned to handle.
 572
 573 ###### grammar_read
 574         static struct grammar *grammar_read(struct code_node *code)
 575         {
 576                 struct token_config conf = {
 577                         .word_start = "",
 578                         .word_cont = "",
 579                         .words_marks = known,
 580                         .known_count = sizeof(known)/sizeof(known[0]),
 581                         .number_chars = "",
 582                         .ignored = (1 << TK_line_comment)
 583                                  | (1 << TK_block_comment)
 584                                  | (0 << TK_number)
 585                                  | (1 << TK_string)
 586                                  | (1 << TK_multi_string)
 587                                  | (1 << TK_in)
 588                                  | (1 << TK_out),
 589                 };
 590
 591                 struct token_state *state = token_open(code, &conf);
 592                 struct token tk;
 593                 struct symbol *head = NULL;
 594                 struct grammar *g;
 595                 char *err = NULL;
 596
 597                 g = calloc(1, sizeof(*g));
 598                 symbols_init(g);
 599
 600                 for (tk = token_next(state); tk.num != TK_eof;
 601                      tk = token_next(state)) {
 602                         if (tk.num == TK_newline)
 603                                 continue;
 604                         if (tk.num == TK_ident) {
 605                                 // new non-terminal
 606                                 head = sym_find(g, tk.txt);
 607                                 if (head->type == Nonterminal)
 608                                         err = "This non-terminal has already be used.";
 609                                 else if (head->type == Virtual)
 610                                         err = "Virtual symbol not permitted in head of production";
 611                                 else {
 612                                         head->type = Nonterminal;
 613                                         head->struct_name = g->current_type;
 614                                         head->isref = g->type_isref;
 615                                         if (g->production_count == 0) {
 616                                                 ## create production zero
 617                                         }
 618                                         head->first_production = g->production_count;
 619                                         tk = token_next(state);
 620                                         if (tk.num == TK_mark &&
 621                                             text_is(tk.txt, "->"))
 622                                                 err = parse_production(g, head, state);
 623                                         else
 624                                                 err = "'->' missing in production";
 625                                 }
 626                         } else if (tk.num == TK_mark
 627                                    && text_is(tk.txt, "|")) {
 628                                 // another production for same non-term
 629                                 if (head)
 630                                         err = parse_production(g, head, state);
 631                                 else
 632                                         err = "First production must have a head";
 633                         } else if (tk.num == TK_mark
 634                                    && text_is(tk.txt, "$")) {
 635                                 err = dollar_line(state, g, 0);
 636                         } else if (tk.num == TK_mark
 637                                    && text_is(tk.txt, "$*")) {
 638                                 err = dollar_line(state, g, 1);
 639                         } else {
 640                                 err = "Unrecognised token at start of line.";
 641                         }
 642                         if (err)
 643                                 goto abort;
 644                 }
 645                 token_close(state);
 646                 return g;
 647         abort:
 648                 fprintf(stderr, "Error at line %d: %s\n",
 649                         tk.line, err);
 650                 token_close(state);
 651                 free(g);
 652                 return NULL;
 653         }
 654
 655 ## Analysing the grammar
 656
 657 The central task in analysing the grammar is to determine a set of
 658 states to drive the parsing process.  This will require finding
 659 various sets of symbols and of "items".  Some of these sets will need
 660 to attach information to each element in the set, so they are more
 661 like maps.
 662
 663 Each "item" may have a 'look-ahead' or `LA` set associated with
 664 it.  Many of these will be the same as each other.  To avoid memory
 665 wastage, and to simplify some comparisons of sets, these sets will be
 666 stored in a list of unique sets, each assigned a number.
 667
 668 Once we have the data structures in hand to manage these sets and
 669 lists, we can start setting the 'nullable' flag, build the 'FIRST'
 670 sets, and then create the item sets which define the various states.
 671
 672 ### Symbol sets.
 673
 674 Though we don't only store symbols in these sets, they are the main
 675 things we store, so they are called symbol sets or "symsets".
 676
 677 A symset has a size, an array of shorts, and an optional array of data
 678 values, which are also shorts.  If the array of data is not present,
 679 we store an impossible pointer, as `NULL` is used to indicate that no
 680 memory has been allocated yet;
 681
 682 ###### declarations
 683         struct symset {
 684                 short cnt;
 685                 unsigned short *syms, *data;
 686         };
 687         #define NO_DATA ((unsigned short *)1)
 688         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 689         const struct symset INIT_DATASET = { 0, NULL, NULL };
 690
 691 The arrays of shorts are allocated in blocks of 8 and are kept sorted
 692 by using an insertion sort.  We don't explicitly record the amount of
 693 allocated space as it can be derived directly from the current `cnt` using
 694 `((cnt - 1) | 7) + 1`.
 695
 696 ###### functions
 697         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 698         {
 699                 int i;
 700                 int current = ((s->cnt-1) | 7) + 1;
 701                 if (current == s->cnt) {
 702                         current += 8;
 703                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 704                         if (s->data != NO_DATA)
 705                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 706                 }
 707                 i = s->cnt;
 708                 while (i > 0 && s->syms[i-1] > key) {
 709                         s->syms[i] = s->syms[i-1];
 710                         if (s->data != NO_DATA)
 711                                 s->data[i] = s->data[i-1];
 712                         i--;
 713                 }
 714                 s->syms[i] = key;
 715                 if (s->data != NO_DATA)
 716                         s->data[i] = val;
 717                 s->cnt += 1;
 718         }
 719
 720 Finding a symbol (or item) in a `symset` uses a simple binary search.
 721 We return the index where the value was found (so data can be accessed),
 722 or `-1` to indicate failure.
 723
 724         static int symset_find(struct symset *ss, unsigned short key)
 725         {
 726                 int lo = 0;
 727                 int hi = ss->cnt;
 728
 729                 if (hi == 0)
 730                         return -1;
 731                 while (lo + 1 < hi) {
 732                         int mid = (lo + hi) / 2;
 733                         if (ss->syms[mid] <= key)
 734                                 lo = mid;
 735                         else
 736                                 hi = mid;
 737                 }
 738                 if (ss->syms[lo] == key)
 739                         return lo;
 740                 return -1;
 741         }
 742
 743 We will often want to form the union of two symsets.  When we do, we
 744 will often want to know if anything changed (as that might mean there
 745 is more work to do).  So `symset_union` reports whether anything was
 746 added to the first set.  We use a slow quadratic approach as these
 747 sets don't really get very big.  If profiles shows this to be a problem it
 748 can be optimised later.
 749
 750         static int symset_union(struct symset *a, struct symset *b)
 751         {
 752                 int i;
 753                 int added = 0;
 754                 for (i = 0; i < b->cnt; i++)
 755                         if (symset_find(a, b->syms[i]) < 0) {
 756                                 unsigned short data = 0;
 757                                 if (b->data != NO_DATA)
 758                                         data = b->data[i];
 759                                 symset_add(a, b->syms[i], data);
 760                                 added++;
 761                         }
 762                 return added;
 763         }
 764
 765 And of course we must be able to free a symset.
 766
 767         static void symset_free(struct symset ss)
 768         {
 769                 free(ss.syms);
 770                 if (ss.data != NO_DATA)
 771                         free(ss.data);
 772         }
 773
 774 ### Symset Storage
 775
 776 Some symsets are simply stored somewhere appropriate in the `grammar`
 777 data structure, others needs a bit of indirection.  This applies
 778 particularly to "LA" sets.  These will be paired with "items" in an
 779 "itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
 780 stored in the `data` field, so we need a mapping from a small (short)
 781 number to an LA `symset`.
 782
 783 As mentioned earlier this involves creating a list of unique symsets.
 784
 785 For now, we just use a linear list sorted by insertion.  A skiplist
 786 would be more efficient and may be added later.
 787
 788 ###### declarations
 789
 790         struct setlist {
 791                 struct symset ss;
 792                 int num;
 793                 struct setlist *next;
 794         };
 795
 796 ###### grammar fields
 797         struct setlist *sets;
 798         int nextset;
 799
 800 ###### functions
 801
 802         static int ss_cmp(struct symset a, struct symset b)
 803         {
 804                 int i;
 805                 int diff = a.cnt - b.cnt;
 806                 if (diff)
 807                         return diff;
 808                 for (i = 0; i < a.cnt; i++) {
 809                         diff = (int)a.syms[i] - (int)b.syms[i];
 810                         if (diff)
 811                                 return diff;
 812                 }
 813                 return 0;
 814         }
 815
 816         static int save_set(struct grammar *g, struct symset ss)
 817         {
 818                 struct setlist **sl = &g->sets;
 819                 int cmp = 1;
 820                 struct setlist *s;
 821
 822                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 823                         sl = & (*sl)->next;
 824                 if (cmp == 0) {
 825                         symset_free(ss);
 826                         return (*sl)->num;
 827                 }
 828
 829                 s = malloc(sizeof(*s));
 830                 s->ss = ss;
 831                 s->num = g->nextset;
 832                 g->nextset += 1;
 833                 s->next = *sl;
 834                 *sl = s;
 835                 return s->num;
 836         }
 837
 838 Finding a set by number is currently performed by a simple linear search.
 839 If this turns out to hurt performance, we can store the sets in a dynamic
 840 array like the productions.
 841
 842         static struct symset set_find(struct grammar *g, int num)
 843         {
 844                 struct setlist *sl = g->sets;
 845                 while (sl && sl->num != num)
 846                         sl = sl->next;
 847                 return sl->ss;
 848         }
 849
 850 ### Setting `nullable`
 851
 852 We set `nullable` on the head symbol for any production for which all
 853 the body symbols (if any) are nullable.  As this is a recursive
 854 definition, any change in the `nullable` setting means that we need to
 855 re-evaluate where it needs to be set.
 856
 857 We simply loop around performing the same calculations until no more
 858 changes happen.
 859
 860 ###### symbol fields
 861         int nullable;
 862
 863 ###### functions
 864         static void set_nullable(struct grammar *g)
 865         {
 866                 int check_again = 1;
 867                 while (check_again) {
 868                         int p;
 869                         check_again = 0;
 870                         for (p = 0; p < g->production_count; p++) {
 871                                 struct production *pr = g->productions[p];
 872                                 int s;
 873
 874                                 if (pr->head->nullable)
 875                                         continue;
 876                                 for (s = 0; s < pr->body_size; s++)
 877                                         if (! pr->body[s]->nullable)
 878                                                 break;
 879                                 if (s == pr->body_size) {
 880                                         pr->head->nullable = 1;
 881                                         check_again = 1;
 882                                 }
 883                         }
 884                 }
 885         }
 886
 887 ### Setting `line_like`
 888
 889 In order to be able to ignore newline tokens when not relevant, but
 890 still include them in the parse when needed, we will need to know
 891 which states can start a "line-like" section of code.  We ignore
 892 newlines when there is an indent since the most recent start of a
 893 line-like symbol.
 894
 895 A "line_like" symbol is simply any symbol that can derive a NEWLINE.
 896 If a symbol cannot derive a NEWLINE, then it is only part of a line -
 897 so is word-like.  If it can derive a NEWLINE, then we consider it to
 898 be like a line.
 899
 900 Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
 901 is the head of a production that contains a line_like symbol is also a
 902 line-like symbol.  We use a new field `line_like` to record this
 903 attribute of symbols, and compute it in a repetitive manner similar to
 904 `set_nullable`.
 905
 906 ###### symbol fields
 907         int line_like;
 908
 909 ###### functions
 910         static void set_line_like(struct grammar *g)
 911         {
 912                 int check_again = 1;
 913                 g->symtab[TK_newline]->line_like = 1;
 914                 while (check_again) {
 915                         int p;
 916                         check_again = 0;
 917                         for (p = 0; p < g->production_count; p++) {
 918                                 struct production *pr = g->productions[p];
 919                                 int s;
 920
 921                                 if (pr->head->line_like)
 922                                         continue;
 923
 924                                 for (s = 0 ; s < pr->body_size; s++) {
 925                                         if (pr->body[s]->line_like) {
 926                                                 pr->head->line_like = 1;
 927                                                 check_again = 1;
 928                                                 break;
 929                                         }
 930                                 }
 931                         }
 932                 }
 933         }
 934
 935 ### Building the `first` sets
 936
 937 When calculating what can follow a particular non-terminal, we will need to
 938 know what the "first" terminal in any subsequent non-terminal might be.  So
 939 we calculate the `first` set for every non-terminal and store them in an
 940 array.  We don't bother recording the "first" set for terminals as they are
 941 trivial.
 942
 943 As the "first" for one symbol might depend on the "first" of another,
 944 we repeat the calculations until no changes happen, just like with
 945 `set_nullable`.  This makes use of the fact that `symset_union`
 946 reports if any change happens.
 947
 948 The core of this, which finds the "first" of part of a production body,
 949 will be reused for computing the "follow" sets, so we split it out
 950 into a separate function.
 951
 952 ###### grammar fields
 953         struct symset *first;
 954
 955 ###### functions
 956
 957         static int add_first(struct production *pr, int start,
 958                              struct symset *target, struct grammar *g,
 959                              int *to_end)
 960         {
 961                 int s;
 962                 int changed = 0;
 963                 for (s = start; s < pr->body_size; s++) {
 964                         struct symbol *bs = pr->body[s];
 965                         if (bs->type == Terminal) {
 966                                 if (symset_find(target, bs->num) < 0) {
 967                                         symset_add(target, bs->num, 0);
 968                                         changed = 1;
 969                                 }
 970                                 break;
 971                         } else if (symset_union(target, &g->first[bs->num]))
 972                                 changed = 1;
 973                         if (!bs->nullable)
 974                                 break;
 975                 }
 976                 if (to_end)
 977                         *to_end = (s == pr->body_size);
 978                 return changed;
 979         }
 980
 981         static void build_first(struct grammar *g)
 982         {
 983                 int check_again = 1;
 984                 int s;
 985                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 986                 for (s = 0; s < g->num_syms; s++)
 987                         g->first[s] = INIT_SYMSET;
 988
 989                 while (check_again) {
 990                         int p;
 991                         check_again = 0;
 992                         for (p = 0; p < g->production_count; p++) {
 993                                 struct production *pr = g->productions[p];
 994                                 struct symset *head = &g->first[pr->head->num];
 995
 996                                 if (add_first(pr, 0, head, g, NULL))
 997                                         check_again = 1;
 998                         }
 999                 }
1000         }
1001
1002 ### Building the `follow` sets.
1003
1004 There are two different situations when we will want to generate "follow"
1005 sets.  If we are doing an SLR analysis, we want to generate a single
1006 "follow" set for each non-terminal in the grammar.  That is what is
1007 happening here.  If we are doing an LALR or LR analysis we will want
1008 to generate a separate "LA" set for each item.  We do that later
1009 in state generation.
1010
1011 There are two parts to generating a "follow" set.  Firstly we look at
1012 every place that any non-terminal appears in the body of any
1013 production, and we find the set of possible "first" symbols after
1014 there.  This is done using `add_first` above and only needs to be done
1015 once as the "first" sets are now stable and will not change.
1016
1017 ###### follow code
1018
1019         for (p = 0; p < g->production_count; p++) {
1020                 struct production *pr = g->productions[p];
1021                 int b;
1022
1023                 for (b = 0; b < pr->body_size - 1; b++) {
1024                         struct symbol *bs = pr->body[b];
1025                         if (bs->type == Terminal)
1026                                 continue;
1027                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1028                 }
1029         }
1030
1031 The second part is to add the "follow" set of the head of a production
1032 to the "follow" sets of the final symbol in the production, and any
1033 other symbol which is followed only by `nullable` symbols.  As this
1034 depends on "follow" itself we need to repeatedly perform the process
1035 until no further changes happen.
1036
1037 ###### follow code
1038
1039         for (again = 0, p = 0;
1040              p < g->production_count;
1041              p < g->production_count-1
1042                 ? p++ : again ? (again = 0, p = 0)
1043                               : p++) {
1044                 struct production *pr = g->productions[p];
1045                 int b;
1046
1047                 for (b = pr->body_size - 1; b >= 0; b--) {
1048                         struct symbol *bs = pr->body[b];
1049                         if (bs->type == Terminal)
1050                                 break;
1051                         if (symset_union(&g->follow[bs->num],
1052                                          &g->follow[pr->head->num]))
1053                                 again = 1;
1054                         if (!bs->nullable)
1055                                 break;
1056                 }
1057         }
1058
1059 We now just need to create and initialise the `follow` list to get a
1060 complete function.
1061
1062 ###### grammar fields
1063         struct symset *follow;
1064
1065 ###### functions
1066         static void build_follow(struct grammar *g)
1067         {
1068                 int s, again, p;
1069                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1070                 for (s = 0; s < g->num_syms; s++)
1071                         g->follow[s] = INIT_SYMSET;
1072                 ## follow code
1073         }
1074
1075 ### Building itemsets and states
1076
1077 There are three different levels of detail that can be chosen for
1078 building the itemsets and states for the LR grammar.  They are:
1079
1080 1. LR(0) or SLR(1), where no look-ahead is considered.
1081 2. LALR(1) where we build look-ahead sets with each item and merge
1082    the LA sets when we find two paths to the same "kernel" set of items.
1083 3. LR(1) where different look-ahead for any item in the set means
1084    a different state must be created.
1085
1086 ###### forward declarations
1087         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1088
1089 We need to be able to look through existing states to see if a newly
1090 generated state already exists.  For now we use a simple sorted linked
1091 list.
1092
1093 An item is a pair of numbers: the production number and the position of
1094 "DOT", which is an index into the body of the production.
1095 As the numbers are not enormous we can combine them into a single "short"
1096 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1097 production number (so 4000 productions with maximum of 15 symbols in the
1098 body).
1099
1100 Comparing the itemsets will be a little different to comparing symsets
1101 as we want to do the lookup after generating the "kernel" of an
1102 itemset, so we need to ignore the offset=zero items which are added during
1103 completion.
1104
1105 To facilitate this, we modify the "DOT" number so that "0" sorts to
1106 the end of the list in the symset, and then only compare items before
1107 the first "0".
1108
1109 ###### declarations
1110         static inline unsigned short item_num(int production, int index)
1111         {
1112                 return production | ((31-index) << 11);
1113         }
1114         static inline int item_prod(unsigned short item)
1115         {
1116                 return item & 0x7ff;
1117         }
1118         static inline int item_index(unsigned short item)
1119         {
1120                 return (31-(item >> 11)) & 0x1f;
1121         }
1122
1123 For LR(1) analysis we need to compare not just the itemset in a state
1124 but also the LA sets.  As we assign each unique LA set a number, we
1125 can just compare the symset and the data values together.
1126
1127 ###### functions
1128         static int itemset_cmp(struct symset a, struct symset b,
1129                                enum grammar_type type)
1130         {
1131                 int i;
1132                 int av, bv;
1133
1134                 for (i = 0;
1135                      i < a.cnt && i < b.cnt &&
1136                      item_index(a.syms[i]) > 0 &&
1137                      item_index(b.syms[i]) > 0;
1138                      i++) {
1139                         int diff = a.syms[i] - b.syms[i];
1140                         if (diff)
1141                                 return diff;
1142                         if (type == LR1) {
1143                                 diff = a.data[i] - b.data[i];
1144                                 if (diff)
1145                                         return diff;
1146                         }
1147                 }
1148                 if (i == a.cnt || item_index(a.syms[i]) == 0)
1149                         av = -1;
1150                 else
1151                         av = a.syms[i];
1152                 if (i == b.cnt || item_index(b.syms[i]) == 0)
1153                         bv = -1;
1154                 else
1155                         bv = b.syms[i];
1156                 if (av - bv)
1157                         return av - bv;
1158                 if (type < LR1 || av == -1)
1159                         return 0;
1160                 return
1161                         a.data[i] - b.data[i];
1162         }
1163
1164 It will be helpful to know if an itemset has been "completed" or not,
1165 particularly for LALR where itemsets get merged, at which point they
1166 need to be consider for completion again.  So  a `completed` flag is needed.
1167
1168 For correct handling of `TK_newline` when parsing, we will need to
1169 know which states (itemsets) can occur at the start of a line, so we
1170 will record a `starts_line` flag too whenever DOT is at the start of a
1171 `line_like` symbol.
1172
1173 Finally, for handling `TK_out` we need to know whether productions in the
1174 current state started *before* the most recent indent.  A state
1175 doesn't usually keep details of individual productions, so we need to
1176 add one extra detail. `min_prefix` is the smallest non-zero number of
1177 symbols *before* DOT in any production in an itemset.  This will allow
1178 us to determine if the the most recent indent is sufficiently recent
1179 to cancel it against a `TK_out`.  If it was seen longer ago than the
1180 `min_prefix`, and if the current state cannot be reduced, then the
1181 indented section must have ended in the middle of a syntactic unit, so
1182 an error must be signaled.
1183
1184 And now we can build the list of itemsets.  The lookup routine returns
1185 both a success flag and a pointer to where in the list an insert
1186 should happen, so we don't need to search a second time.
1187
1188 ###### declarations
1189         struct itemset {
1190                 struct itemset *next;
1191                 short state;
1192                 struct symset items;
1193                 struct symset go_to;
1194                 enum assoc assoc;
1195                 unsigned short precedence;
1196                 char completed;
1197                 char starts_line;
1198                 int min_prefix;
1199         };
1200
1201 ###### grammar fields
1202         struct itemset *items;
1203         int states;
1204
1205 ###### functions
1206         static int itemset_find(struct grammar *g, struct itemset ***where,
1207                                 struct symset kernel, enum grammar_type type)
1208         {
1209                 struct itemset **ip;
1210
1211                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1212                         struct itemset *i = *ip;
1213                         int diff;
1214                         diff = itemset_cmp(i->items, kernel, type);
1215                         if (diff < 0)
1216                                 continue;
1217                         if (diff > 0)
1218                                 break;
1219                         /* found */
1220                         *where = ip;
1221                         return 1;
1222                 }
1223                 *where = ip;
1224                 return 0;
1225         }
1226
1227 Adding an itemset may require merging the LA sets if LALR analysis is
1228 happening. If any new LA set adds any symbols that weren't in the old LA set, we
1229 clear the `completed` flag so that the dependants of this itemset will be
1230 recalculated and their LA sets updated.
1231
1232 `add_itemset` must consume the symsets it is passed, either by adding
1233 them to a data structure, of freeing them.
1234
1235         static int add_itemset(struct grammar *g, struct symset ss,
1236                                enum assoc assoc, unsigned short precedence,
1237                                enum grammar_type type)
1238         {
1239                 struct itemset **where, *is;
1240                 int i;
1241                 int found = itemset_find(g, &where, ss, type);
1242                 if (!found) {
1243                         is = calloc(1, sizeof(*is));
1244                         is->state = g->states;
1245                         g->states += 1;
1246                         is->items = ss;
1247                         is->assoc = assoc;
1248                         is->precedence = precedence;
1249                         is->next = *where;
1250                         is->go_to = INIT_DATASET;
1251                         *where = is;
1252                         return is->state;
1253                 }
1254                 is = *where;
1255                 if (type != LALR) {
1256                         symset_free(ss);
1257                         return is->state;
1258                 }
1259                 for (i = 0; i < ss.cnt; i++) {
1260                         struct symset temp = INIT_SYMSET, s;
1261                         if (ss.data[i] == is->items.data[i])
1262                                 continue;
1263                         s = set_find(g, is->items.data[i]);
1264                         symset_union(&temp, &s);
1265                         s = set_find(g, ss.data[i]);
1266                         if (symset_union(&temp, &s)) {
1267                                 is->items.data[i] = save_set(g, temp);
1268                                 is->completed = 0;
1269                         } else
1270                                 symset_free(temp);
1271                 }
1272                 symset_free(ss);
1273                 return is->state;
1274         }
1275
1276 #### The build
1277
1278 To build all the itemsets, we first insert the initial itemset made
1279 from production zero, complete each itemset, and then generate new
1280 itemsets from old until no new ones can be made.
1281
1282 Completing an itemset means finding all the items where "DOT" is followed by
1283 a nonterminal and adding "DOT=0" items for every production from that
1284 non-terminal - providing each item hasn't already been added.
1285
1286 If LA sets are needed, the LA set for each new item is found using
1287 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
1288 be supplemented by the LA set for the item which produce the new item.
1289
1290 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1291 is used in the next stage.
1292 If any of these symbols are flagged as `line_like`, then this
1293 state must be a `starts_line` state so now is a good time to record that.
1294
1295 When itemsets are created we assign a precedence to the itemset from
1296 the complete item, if there is one.  We ignore the possibility of
1297 there being two and don't (currently) handle precedence in such
1298 grammars.  When completing a grammar we ignore any item where DOT is
1299 followed by a terminal with a precedence lower than that for the
1300 itemset.  Unless the terminal has right associativity, we also ignore
1301 items where the terminal has the same precedence.  The result is that
1302 unwanted items are still in the itemset, but the terminal doesn't get
1303 into the go to set, so the item is ineffective.
1304
1305 ###### complete itemset
1306         for (i = 0; i < is->items.cnt; i++) {
1307                 int p = item_prod(is->items.syms[i]);
1308                 int bs = item_index(is->items.syms[i]);
1309                 struct production *pr = g->productions[p];
1310                 int p2;
1311                 struct symbol *s;
1312                 struct symset LA = INIT_SYMSET;
1313                 unsigned short sn = 0;
1314                 struct symset LAnl = INIT_SYMSET;
1315                 unsigned short snnl = 0;
1316
1317                 if (is->min_prefix == 0 ||
1318                     (bs > 0 && bs < is->min_prefix))
1319                         is->min_prefix = bs;
1320                 if (bs == pr->body_size)
1321                         continue;
1322                 s = pr->body[bs];
1323                 if (s->precedence && is->precedence &&
1324                     is->precedence > s->precedence)
1325                         /* This terminal has a low precedence and
1326                          * shouldn't be shifted
1327                          */
1328                         continue;
1329                 if (s->precedence && is->precedence &&
1330                     is->precedence == s->precedence && s->assoc != Right)
1331                         /* This terminal has a matching precedence and is
1332                          * not Right-associative, so we mustn't shift it.
1333                          */
1334                         continue;
1335                 if (symset_find(&done, s->num) < 0) {
1336                         symset_add(&done, s->num, 0);
1337                 }
1338                 if (s->type != Nonterminal)
1339                         continue;
1340                 if (s->line_like)
1341                         is->starts_line = 1;
1342                 again = 1;
1343                 if (type >= LALR) {
1344                         // Need the LA set.
1345                         int to_end;
1346                         add_first(pr, bs+1, &LA, g, &to_end);
1347                         if (to_end) {
1348                                 struct symset ss = set_find(g, is->items.data[i]);
1349                                 symset_union(&LA, &ss);
1350                         }
1351                         sn = save_set(g, LA);
1352                         LA = set_find(g, sn);
1353                         if (symset_find(&LA, TK_newline))
1354                                 symset_add(&LAnl, TK_newline, 0);
1355                         snnl = save_set(g, LAnl);
1356                         LAnl = set_find(g, snnl);
1357                 }
1358
1359                 /* Add productions for this symbol */
1360                 for (p2 = s->first_production;
1361                      p2 < g->production_count &&
1362                       g->productions[p2]->head == s;
1363                      p2++) {
1364                         int itm = item_num(p2, 0);
1365                         int pos = symset_find(&is->items, itm);
1366                         if (pos < 0) {
1367                                 if (g->productions[p2]->line_like)
1368                                         symset_add(&is->items, itm, snnl);
1369                                 else
1370                                         symset_add(&is->items, itm, sn);
1371                                 /* Will have re-ordered, so start
1372                                  * from beginning again */
1373                                 i = -1;
1374                         } else if (type >= LALR) {
1375                                 struct symset ss = set_find(g, is->items.data[pos]);
1376                                 struct symset tmp = INIT_SYMSET;
1377                                 struct symset *la = &LA;
1378
1379                                 if (g->productions[p2]->line_like)
1380                                         la = &LAnl;
1381                                 symset_union(&tmp, &ss);
1382                                 if (symset_union(&tmp, la)) {
1383                                         is->items.data[pos] = save_set(g, tmp);
1384                                         i = -1;
1385                                 } else
1386                                         symset_free(tmp);
1387                         }
1388                 }
1389         }
1390
1391 For each symbol we found (and placed in `done`) we collect all the items for
1392 which this symbol is next, and create a new itemset with "DOT" advanced over
1393 the symbol.  This is then added to the collection of itemsets (or merged
1394 with a pre-existing itemset).
1395
1396 ###### derive itemsets
1397         // Now we have a completed itemset, so we need to
1398         // compute all the 'next' itemsets and create them
1399         // if they don't exist.
1400         for (i = 0; i < done.cnt; i++) {
1401                 int j;
1402                 unsigned short state;
1403                 struct symbol *sym = g->symtab[done.syms[i]];
1404                 enum assoc assoc = Non;
1405                 unsigned short precedence = 0;
1406                 struct symset newitemset = INIT_SYMSET;
1407                 if (type >= LALR)
1408                         newitemset = INIT_DATASET;
1409
1410                 for (j = 0; j < is->items.cnt; j++) {
1411                         int itm = is->items.syms[j];
1412                         int p = item_prod(itm);
1413                         int bp = item_index(itm);
1414                         struct production *pr = g->productions[p];
1415                         unsigned short la = 0;
1416                         int pos;
1417
1418                         if (bp == pr->body_size)
1419                                 continue;
1420                         if (pr->body[bp] != sym)
1421                                 continue;
1422                         if (type >= LALR)
1423                                 la = is->items.data[j];
1424                         pos = symset_find(&newitemset, pr->head->num);
1425                         if (bp + 1 == pr->body_size &&
1426                             pr->precedence > 0 &&
1427                             pr->precedence > precedence) {
1428                                 // new itemset is reducible and has a precedence.
1429                                 precedence = pr->precedence;
1430                                 assoc = pr->assoc;
1431                         }
1432                         if (pos < 0)
1433                                 symset_add(&newitemset, item_num(p, bp+1), la);
1434                         else if (type >= LALR) {
1435                                 // Need to merge la set.
1436                                 int la2 = newitemset.data[pos];
1437                                 if (la != la2) {
1438                                         struct symset ss = set_find(g, la2);
1439                                         struct symset LA = INIT_SYMSET;
1440                                         symset_union(&LA, &ss);
1441                                         ss = set_find(g, la);
1442                                         if (symset_union(&LA, &ss))
1443                                                 newitemset.data[pos] = save_set(g, LA);
1444                                         else
1445                                                 symset_free(LA);
1446                                 }
1447                         }
1448                 }
1449                 state = add_itemset(g, newitemset, assoc, precedence, type);
1450                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1451                         symset_add(&is->go_to, done.syms[i], state);
1452         }
1453
1454 All that is left is to create the initial itemset from production zero, and
1455 with `TK_eof` as the LA set.
1456
1457 ###### functions
1458         static void build_itemsets(struct grammar *g, enum grammar_type type)
1459         {
1460                 struct symset first = INIT_SYMSET;
1461                 struct itemset *is;
1462                 int again;
1463                 unsigned short la = 0;
1464                 if (type >= LALR) {
1465                         // LA set just has eof
1466                         struct symset eof = INIT_SYMSET;
1467                         symset_add(&eof, TK_eof, 0);
1468                         la = save_set(g, eof);
1469                         first = INIT_DATASET;
1470                 }
1471                 // production 0, offset 0 (with no data)
1472                 symset_add(&first, item_num(0, 0), la);
1473                 add_itemset(g, first, Non, 0, type);
1474                 for (again = 0, is = g->items;
1475                      is;
1476                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
1477                         int i;
1478                         struct symset done = INIT_SYMSET;
1479                         if (is->completed)
1480                                 continue;
1481                         is->completed = 1;
1482                         again = 1;
1483                         ## complete itemset
1484                         ## derive itemsets
1485                         symset_free(done);
1486                 }
1487         }
1488
1489 ### Completing the analysis.
1490
1491 The exact process of analysis depends on which level was requested.  For
1492 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1493 `LR1` we need first, but not follow.  For `SLR` we need both.
1494
1495 We don't build the "action" tables that you might expect as the parser
1496 doesn't use them.  They will be calculated to some extent if needed for
1497 a report.
1498
1499 Once we have built everything we allocate arrays for the two lists:
1500 symbols and itemsets.  This allows more efficient access during reporting.
1501 The symbols are grouped as terminals and non-terminals and we record the
1502 changeover point in `first_nonterm`.
1503
1504 ###### grammar fields
1505         struct symbol **symtab;
1506         struct itemset **statetab;
1507         int first_nonterm;
1508
1509 ###### grammar_analyse
1510
1511         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1512         {
1513                 struct symbol *s;
1514                 struct itemset *is;
1515                 int snum = TK_reserved;
1516                 for (s = g->syms; s; s = s->next)
1517                         if (s->num < 0 && s->type == Terminal) {
1518                                 s->num = snum;
1519                                 snum++;
1520                         }
1521                 g->first_nonterm = snum;
1522                 for (s = g->syms; s; s = s->next)
1523                         if (s->num < 0) {
1524                                 s->num = snum;
1525                                 snum++;
1526                         }
1527                 g->num_syms = snum;
1528                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1529                 for (s = g->syms; s; s = s->next)
1530                         g->symtab[s->num] = s;
1531
1532                 set_nullable(g);
1533                 set_line_like(g);
1534                 if (type >= SLR)
1535                         build_first(g);
1536
1537                 if (type == SLR)
1538                         build_follow(g);
1539
1540                 build_itemsets(g, type);
1541
1542                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1543                 for (is = g->items; is ; is = is->next)
1544                         g->statetab[is->state] = is;
1545         }
1546
1547 ## Reporting on the Grammar
1548
1549 The purpose of the report is to give the grammar developer insight into
1550 how the grammar parser will work.  It is basically a structured dump of
1551 all the tables that have been generated, plus a description of any conflicts.
1552
1553 ###### grammar_report
1554         static int grammar_report(struct grammar *g, enum grammar_type type)
1555         {
1556                 report_symbols(g);
1557                 if (g->follow)
1558                         report_follow(g);
1559                 report_itemsets(g);
1560                 return report_conflicts(g, type);
1561         }
1562
1563 Firstly we have the complete list of symbols, together with the
1564 "FIRST" set if that was generated.  We add a mark to each symbol to
1565 show if it can end in a newline (`>`), if it is considered to be
1566 "line-like" (`<`), or if it is nullable (`.`).
1567
1568 ###### functions
1569
1570         static void report_symbols(struct grammar *g)
1571         {
1572                 int n;
1573                 if (g->first)
1574                         printf("SYMBOLS + FIRST:\n");
1575                 else
1576                         printf("SYMBOLS:\n");
1577
1578                 for (n = 0; n < g->num_syms; n++) {
1579                         struct symbol *s = g->symtab[n];
1580                         if (!s)
1581                                 continue;
1582
1583                         printf(" %c%c%3d%c: ",
1584                                s->nullable ? '.':' ',
1585                                s->line_like ? '<':' ',
1586                                s->num, symtypes[s->type]);
1587                         prtxt(s->name);
1588                         if (s->precedence)
1589                                 printf(" (%d%s)", s->precedence,
1590                                        assoc_names[s->assoc]);
1591
1592                         if (g->first && s->type == Nonterminal) {
1593                                 int j;
1594                                 char c = ':';
1595                                 for (j = 0; j < g->first[n].cnt; j++) {
1596                                         printf("%c ", c);
1597                                         c = ',';
1598                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1599                                 }
1600                         }
1601                         printf("\n");
1602                 }
1603         }
1604
1605 Then we have the follow sets if they were computed.
1606
1607         static void report_follow(struct grammar *g)
1608         {
1609                 int n;
1610                 printf("FOLLOW:\n");
1611                 for (n = 0; n < g->num_syms; n++)
1612                         if (g->follow[n].cnt) {
1613                                 int j;
1614                                 char c = ':';
1615                                 printf("  ");
1616                                 prtxt(g->symtab[n]->name);
1617                                 for (j = 0; j < g->follow[n].cnt; j++) {
1618                                         printf("%c ", c);
1619                                         c = ',';
1620                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1621                                 }
1622                                 printf("\n");
1623                         }
1624         }
1625
1626 And finally the item sets.  These include the GO TO tables and, for
1627 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1628 it up a bit.  First the items, with production number and associativity.
1629
1630         static void report_item(struct grammar *g, int itm)
1631         {
1632                 int p = item_prod(itm);
1633                 int dot = item_index(itm);
1634                 struct production *pr = g->productions[p];
1635                 int i;
1636
1637                 printf("    ");
1638                 prtxt(pr->head->name);
1639                 printf(" ->");
1640                 for (i = 0; i < pr->body_size; i++) {
1641                         printf(" %s", (dot == i ? ". ": ""));
1642                         prtxt(pr->body[i]->name);
1643                 }
1644                 if (dot == pr->body_size)
1645                         printf(" .");
1646                 printf(" [%d]", p);
1647                 if (pr->precedence && dot == pr->body_size)
1648                         printf(" (%d%s)", pr->precedence,
1649                                assoc_names[pr->assoc]);
1650                 if (dot < pr->body_size &&
1651                     pr->body[dot]->precedence) {
1652                         struct symbol *s = pr->body[dot];
1653                         printf(" [%d%s]", s->precedence,
1654                                assoc_names[s->assoc]);
1655                 }
1656                 if (pr->line_like == 1)
1657                         printf(" $$NEWLINE");
1658                 else if (pr->line_like)
1659                         printf(" $$OUT");
1660                 printf("\n");
1661         }
1662
1663 The LA sets which are (possibly) reported with each item:
1664
1665         static void report_la(struct grammar *g, int lanum)
1666         {
1667                 struct symset la = set_find(g, lanum);
1668                 int i;
1669                 char c = ':';
1670
1671                 printf("        LOOK AHEAD(%d)", lanum);
1672                 for (i = 0; i < la.cnt; i++) {
1673                         printf("%c ", c);
1674                         c = ',';
1675                         prtxt(g->symtab[la.syms[i]]->name);
1676                 }
1677                 printf("\n");
1678         }
1679
1680 Then the go to sets:
1681
1682         static void report_goto(struct grammar *g, struct symset gt)
1683         {
1684                 int i;
1685                 printf("    GOTO:\n");
1686
1687                 for (i = 0; i < gt.cnt; i++) {
1688                         printf("      ");
1689                         prtxt(g->symtab[gt.syms[i]]->name);
1690                         printf(" -> %d\n", gt.data[i]);
1691                 }
1692         }
1693
1694 Now we can report all the item sets complete with items, LA sets, and GO TO.
1695
1696         static void report_itemsets(struct grammar *g)
1697         {
1698                 int s;
1699                 printf("ITEM SETS(%d)\n", g->states);
1700                 for (s = 0; s < g->states; s++) {
1701                         int j;
1702                         struct itemset *is = g->statetab[s];
1703                         printf("  Itemset %d:%s min prefix=%d",
1704                                s, is->starts_line?" (startsline)":"", is->min_prefix);
1705                         if (is->precedence)
1706                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1707                         printf("\n");
1708                         for (j = 0; j < is->items.cnt; j++) {
1709                                 report_item(g, is->items.syms[j]);
1710                                 if (is->items.data != NO_DATA)
1711                                         report_la(g, is->items.data[j]);
1712                         }
1713                         report_goto(g, is->go_to);
1714                 }
1715         }
1716
1717 ### Reporting conflicts
1718
1719 Conflict detection varies a lot among different analysis levels.  However
1720 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1721 LR1 are also similar as they have FOLLOW or LA sets.
1722
1723 ###### functions
1724
1725         ## conflict functions
1726
1727         static int report_conflicts(struct grammar *g, enum grammar_type type)
1728         {
1729                 int cnt = 0;
1730                 printf("Conflicts:\n");
1731                 if (type < SLR)
1732                         cnt = conflicts_lr0(g, type);
1733                 else
1734                         cnt = conflicts_slr(g, type);
1735                 if (cnt == 0)
1736                         printf(" - no conflicts\n");
1737                 return cnt;
1738         }
1739
1740 LR0 conflicts are any state which have both a reducible item and
1741 a shiftable item, or two reducible items.
1742
1743 LR05 conflicts only occur if two possible reductions exist,
1744 as shifts always over-ride reductions.
1745
1746 ###### conflict functions
1747         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1748         {
1749                 int i;
1750                 int cnt = 0;
1751                 for (i = 0; i < g->states; i++) {
1752                         struct itemset *is = g->statetab[i];
1753                         int last_reduce = -1;
1754                         int prev_reduce = -1;
1755                         int last_shift = -1;
1756                         int j;
1757                         if (!is)
1758                                 continue;
1759                         for (j = 0; j < is->items.cnt; j++) {
1760                                 int itm = is->items.syms[j];
1761                                 int p = item_prod(itm);
1762                                 int bp = item_index(itm);
1763                                 struct production *pr = g->productions[p];
1764
1765                                 if (bp == pr->body_size) {
1766                                         prev_reduce = last_reduce;
1767                                         last_reduce = j;
1768                                         continue;
1769                                 }
1770                                 if (pr->body[bp]->type == Terminal)
1771                                         last_shift = j;
1772                         }
1773                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1774                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1775                                 report_item(g, is->items.syms[last_shift]);
1776                                 report_item(g, is->items.syms[last_reduce]);
1777                                 cnt++;
1778                         }
1779                         if (prev_reduce >= 0) {
1780                                 printf("  State %d has 2 (or more) reducible items\n", i);
1781                                 report_item(g, is->items.syms[prev_reduce]);
1782                                 report_item(g, is->items.syms[last_reduce]);
1783                                 cnt++;
1784                         }
1785                 }
1786                 return cnt;
1787         }
1788
1789 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1790 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1791 in the source of the look ahead set.
1792
1793 We build two datasets to reflect the "action" table: one which maps
1794 terminals to items where that terminal could be shifted and another
1795 which maps terminals to items that could be reduced when the terminal
1796 is in look-ahead.  We report when we get conflicts between the two.
1797
1798 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1799 terminal, we ignore it.  NEWLINES are handled specially with its own
1800 rules for when to shift and when to reduce.  Conflicts are expected,
1801 but handled internally.
1802
1803         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1804         {
1805                 int i;
1806                 int cnt = 0;
1807
1808                 for (i = 0; i < g->states; i++) {
1809                         struct itemset *is = g->statetab[i];
1810                         struct symset shifts = INIT_DATASET;
1811                         struct symset reduce = INIT_DATASET;
1812                         int j;
1813                         if (!is)
1814                                 continue;
1815                         /* First collect the shifts */
1816                         for (j = 0; j < is->items.cnt; j++) {
1817                                 unsigned short itm = is->items.syms[j];
1818                                 int p = item_prod(itm);
1819                                 int bp = item_index(itm);
1820                                 struct production *pr = g->productions[p];
1821                                 struct symbol *s;
1822
1823                                 if (bp >= pr->body_size ||
1824                                     pr->body[bp]->type != Terminal)
1825                                         /* not shiftable */
1826                                         continue;
1827
1828                                 s = pr->body[bp];
1829                                 if (s->precedence && is->precedence)
1830                                         /* Precedence resolves this, so no conflict */
1831                                         continue;
1832
1833                                 if (symset_find(&shifts, s->num) < 0)
1834                                         symset_add(&shifts, s->num, itm);
1835                         }
1836                         /* Now look for reductions and conflicts */
1837                         for (j = 0; j < is->items.cnt; j++) {
1838                                 unsigned short itm = is->items.syms[j];
1839                                 int p = item_prod(itm);
1840                                 int bp = item_index(itm);
1841                                 struct production *pr = g->productions[p];
1842
1843                                 if (bp < pr->body_size)
1844                                         continue;
1845                                 /* reducible */
1846                                 struct symset la;
1847                                 if (type == SLR)
1848                                         la = g->follow[pr->head->num];
1849                                 else
1850                                         la = set_find(g, is->items.data[j]);
1851                                 int k;
1852                                 for (k = 0; k < la.cnt; k++) {
1853                                         int pos = symset_find(&shifts, la.syms[k]);
1854                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1855                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1856                                                 cnt++;
1857                                                         prtxt(g->symtab[la.syms[k]]->name);
1858                                                 printf(":\n");
1859                                                 report_item(g, shifts.data[pos]);
1860                                                 report_item(g, itm);
1861                                         }
1862                                         pos = symset_find(&reduce, la.syms[k]);
1863                                         if (pos < 0) {
1864                                                 symset_add(&reduce, la.syms[k], itm);
1865                                                 continue;
1866                                         }
1867                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1868                                         prtxt(g->symtab[la.syms[k]]->name);
1869                                         printf(":\n");
1870                                         report_item(g, itm);
1871                                         report_item(g, reduce.data[pos]);
1872                                         cnt++;
1873                                 }
1874                         }
1875                         symset_free(shifts);
1876                         symset_free(reduce);
1877                 }
1878                 return cnt;
1879         }
1880
1881 ## Generating the parser
1882
1883 The exported part of the parser is the `parse_XX` function, where the name
1884 `XX` is based on the name of the parser files.
1885
1886 This takes a `code_node`, a partially initialized `token_config`, and an
1887 optional `FILE` to send tracing to.  The `token_config` gets the list of
1888 known words added and then is used with the `code_node` to initialize the
1889 scanner.
1890
1891 `parse_XX` then calls the library function `parser_run` to actually complete
1892 the parse.  This needs the `states` table and function to call the various
1893 pieces of code provided in the grammar file, so they are generated first.
1894
1895 ###### parser_generate
1896
1897         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1898                                struct code_node *pre_reduce)
1899         {
1900                 gen_known(f, g);
1901                 gen_non_term(f, g);
1902                 gen_goto(f, g);
1903                 gen_states(f, g);
1904                 gen_reduce(f, g, file, pre_reduce);
1905                 gen_free(f, g);
1906
1907                 fprintf(f, "#line 0 \"gen_parser\"\n");
1908                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1909                         name);
1910                 fprintf(f, "{\n");
1911                 fprintf(f, "\tstruct token_state *tokens;\n");
1912                 fprintf(f, "\tconfig->words_marks = known;\n");
1913                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1914                 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1915                 fprintf(f, "\ttokens = token_open(code, config);\n");
1916                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1917                 fprintf(f, "\ttoken_close(tokens);\n");
1918                 fprintf(f, "\treturn rv;\n");
1919                 fprintf(f, "}\n\n");
1920         }
1921
1922 ### Known words table
1923
1924 The known words table is simply an array of terminal symbols.
1925 The table of nonterminals used for tracing is a similar array.  We
1926 include virtual symbols in the table of non_terminals to keep the
1927 numbers right.
1928
1929 ###### functions
1930
1931         static void gen_known(FILE *f, struct grammar *g)
1932         {
1933                 int i;
1934                 fprintf(f, "#line 0 \"gen_known\"\n");
1935                 fprintf(f, "static const char *known[] = {\n");
1936                 for (i = TK_reserved;
1937                      i < g->num_syms && g->symtab[i]->type == Terminal;
1938                      i++)
1939                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1940                                 g->symtab[i]->name.txt);
1941                 fprintf(f, "};\n\n");
1942         }
1943
1944         static void gen_non_term(FILE *f, struct grammar *g)
1945         {
1946                 int i;
1947                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1948                 fprintf(f, "static const char *non_term[] = {\n");
1949                 for (i = TK_reserved;
1950                      i < g->num_syms;
1951                      i++)
1952                         if (g->symtab[i]->type != Terminal)
1953                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1954                                         g->symtab[i]->name.txt);
1955                 fprintf(f, "};\n\n");
1956         }
1957
1958 ### States and the goto tables.
1959
1960 For each state we record the goto table, the reducible production if
1961 there is one, or a symbol to shift for error recovery.
1962 Some of the details of the reducible production are stored in the
1963 `do_reduce` function to come later.  Here we store the production number,
1964 the body size (useful for stack management) and the resulting symbol (useful
1965 for knowing how to free data later).
1966
1967 The go to table is stored in a simple array of `sym` and corresponding
1968 `state`.
1969
1970 ###### exported types
1971
1972         struct lookup {
1973                 short sym;
1974                 short state;
1975         };
1976         struct state {
1977                 short go_to_cnt;
1978                 const struct lookup * go_to;
1979                 short reduce_prod;
1980                 short reduce_size;
1981                 short reduce_sym;
1982                 char starts_line;
1983                 char newline_only;
1984                 short min_prefix;
1985         };
1986
1987 ###### functions
1988
1989         static void gen_goto(FILE *f, struct grammar *g)
1990         {
1991                 int i;
1992                 fprintf(f, "#line 0 \"gen_goto\"\n");
1993                 for (i = 0; i < g->states; i++) {
1994                         int j;
1995                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1996                                 i);
1997                         struct symset gt = g->statetab[i]->go_to;
1998                         for (j = 0; j < gt.cnt; j++)
1999                                 fprintf(f, "\t{ %d, %d },\n",
2000                                         gt.syms[j], gt.data[j]);
2001                         fprintf(f, "};\n");
2002                 }
2003         }
2004
2005 ###### functions
2006
2007         static void gen_states(FILE *f, struct grammar *g)
2008         {
2009                 int i;
2010                 fprintf(f, "#line 0 \"gen_states\"\n");
2011                 fprintf(f, "static const struct state states[] = {\n");
2012                 for (i = 0; i < g->states; i++) {
2013                         struct itemset *is = g->statetab[i];
2014                         int j, prod = -1, prod_len;
2015
2016                         for (j = 0; j < is->items.cnt; j++) {
2017                                 int itm = is->items.syms[j];
2018                                 int p = item_prod(itm);
2019                                 int bp = item_index(itm);
2020                                 struct production *pr = g->productions[p];
2021
2022                                 if (bp < pr->body_size)
2023                                         continue;
2024                                 /* This is what we reduce */
2025                                 if (prod < 0 || prod_len < pr->body_size) {
2026                                         prod = p;
2027                                         prod_len = pr->body_size;
2028                                 }
2029                         }
2030
2031                         if (prod >= 0)
2032                                 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, %d, %d, %d },\n",
2033                                         i, is->go_to.cnt, i, prod,
2034                                         g->productions[prod]->body_size,
2035                                         g->productions[prod]->head->num,
2036                                         is->starts_line,
2037                                         g->productions[prod]->line_like,
2038                                         is->min_prefix);
2039                         else
2040                                 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, 0, %d },\n",
2041                                         i, is->go_to.cnt, i,
2042                                         is->starts_line, is->min_prefix);
2043                 }
2044                 fprintf(f, "};\n\n");
2045         }
2046
2047 ### The `do_reduce` function and the code
2048
2049 When the parser engine decides to reduce a production, it calls `do_reduce`.
2050 This has two functions.
2051
2052 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
2053 production being reduced.  Secondly it runs the code that was included with
2054 the production if any.
2055
2056 This code needs to be able to store data somewhere.  Rather than requiring
2057 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
2058 `do_reduce` return the size to be saved.
2059
2060 In order for the code to access "global" context, we pass in the
2061 "config" pointer that was passed to parser function.  If the `struct
2062 token_config` is embedded in some larger structure, the reducing code
2063 can access the larger structure using pointer manipulation.
2064
2065 The code fragment requires translation when written out.  Any `$N` needs to
2066 be converted to a reference either to that buffer (if `$0`) or to the
2067 structure returned by a previous reduction.  These pointers need to be cast
2068 to the appropriate type for each access.  All this is handled in
2069 `gen_code`.
2070
2071 `gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
2072 This applied only to symbols with references (or pointers), not those with structures.
2073 The `<` implies that the reference it being moved out, so the object will not be
2074 automatically freed.  This is equivalent to assigning `NULL` to the pointer.
2075
2076 ###### functions
2077
2078         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2079         {
2080                 char *c;
2081                 char *used = calloc(1, p->body_size);
2082                 int i;
2083
2084                 fprintf(f, "\t\t\t");
2085                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2086                         int n;
2087                         int use = 0;
2088                         if (*c != '$') {
2089                                 fputc(*c, f);
2090                                 if (*c == '\n')
2091                                         fputs("\t\t\t", f);
2092                                 continue;
2093                         }
2094                         c++;
2095                         if (*c == '<') {
2096                                 use = 1;
2097                                 c++;
2098                         }
2099                         if (*c < '0' || *c > '9') {
2100                                 if (use)
2101                                         fputc('<', f);
2102                                 fputc(*c, f);
2103                                 continue;
2104                         }
2105                         n = *c - '0';
2106                         while (c[1] >= '0' && c[1] <= '9') {
2107                                 c += 1;
2108                                 n = n * 10 + *c - '0';
2109                         }
2110                         if (n == 0)
2111                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2112                                         p->head->struct_name.len,
2113                                         p->head->struct_name.txt,
2114                                         p->head->isref ? "*":"");
2115                         else if (n > p->body_size)
2116                                 fprintf(f, "$%d", n);
2117                         else if (p->body[n-1]->type == Terminal)
2118                                 fprintf(f, "(*(struct token *)body[%d])",
2119                                         n-1);
2120                         else if (p->body[n-1]->struct_name.txt == NULL)
2121                                 fprintf(f, "$%d", n);
2122                         else {
2123                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2124                                         p->body[n-1]->struct_name.len,
2125                                         p->body[n-1]->struct_name.txt,
2126                                         p->body[n-1]->isref ? "*":"", n-1);
2127                                 used[n-1] = use;
2128                         }
2129                 }
2130                 fputs("\n", f);
2131                 for (i = 0; i < p->body_size; i++) {
2132                         if (p->body[i]->struct_name.txt &&
2133                             used[i]) {
2134                                 // assume this has been copied out
2135                                 if (p->body[i]->isref)
2136                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2137                                 else
2138                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2139                         }
2140                 }
2141                 free(used);
2142         }
2143
2144 ###### functions
2145
2146         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2147                                struct code_node *code)
2148         {
2149                 int i;
2150                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2151                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2152                 fprintf(f, "{\n");
2153                 fprintf(f, "\tint ret_size = 0;\n");
2154                 if (code)
2155                         code_node_print(f, code, file);
2156
2157                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2158                 fprintf(f, "\tswitch(prod) {\n");
2159                 for (i = 0; i < g->production_count; i++) {
2160                         struct production *p = g->productions[i];
2161                         fprintf(f, "\tcase %d:\n", i);
2162
2163                         if (p->code.txt) {
2164                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2165                                 gen_code(p, f, g);
2166                         }
2167
2168                         if (p->head->struct_name.txt)
2169                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2170                                         p->head->struct_name.len,
2171                                         p->head->struct_name.txt,
2172                                         p->head->isref ? "*":"");
2173
2174                         fprintf(f, "\t\tbreak;\n");
2175                 }
2176                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2177         }
2178
2179 ### `do_free`
2180
2181 As each non-terminal can potentially cause a different type of data
2182 structure to be allocated and filled in, we need to be able to free it when
2183 done.
2184
2185 It is particularly important to have fine control over freeing during error
2186 recovery where individual stack frames might need to be freed.
2187
2188 For this, the grammar author is required to defined a `free_XX` function for
2189 each structure that is used by a non-terminal.  `do_free` will call whichever
2190 is appropriate given a symbol number, and will call `free` (as is
2191 appropriate for tokens) on any terminal symbol.
2192
2193 ###### functions
2194
2195         static void gen_free(FILE *f, struct grammar *g)
2196         {
2197                 int i;
2198
2199                 fprintf(f, "#line 0 \"gen_free\"\n");
2200                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2201                 fprintf(f, "{\n");
2202                 fprintf(f, "\tif (!asn) return;\n");
2203                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2204                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2205                 fprintf(f, "\tswitch(sym) {\n");
2206
2207                 for (i = 0; i < g->num_syms; i++) {
2208                         struct symbol *s = g->symtab[i];
2209                         if (!s ||
2210                             s->type != Nonterminal ||
2211                             s->struct_name.txt == NULL)
2212                                 continue;
2213
2214                         fprintf(f, "\tcase %d:\n", s->num);
2215                         if (s->isref) {
2216                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2217                                         s->struct_name.len,
2218                                         s->struct_name.txt);
2219                                 fprintf(f, "\t\tfree(asn);\n");
2220                         } else
2221                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2222                                         s->struct_name.len,
2223                                         s->struct_name.txt);
2224                         fprintf(f, "\t\tbreak;\n");
2225                 }
2226                 fprintf(f, "\t}\n}\n\n");
2227         }
2228
2229 ## The main routine.
2230
2231 There are three key parts to the "main" function of parsergen: processing
2232 the arguments, loading the grammar file, and dealing with the grammar.
2233
2234 ### Argument processing.
2235
2236 Command line options allow the selection of analysis mode, name of output
2237 file, and whether or not a report should be generated.  By default we create
2238 a report only if no code output was requested.
2239
2240 The `parse_XX` function name uses the basename of the output file which
2241 should not have a suffix (`.c`).  `.c` is added to the given name for the
2242 code, and `.h` is added for the header (if header text is specifed in the
2243 grammar file).
2244
2245 ###### includes
2246         #include <getopt.h>
2247
2248 ###### declarations
2249         static const struct option long_options[] = {
2250                 { "LR0",        0, NULL, '0' },
2251                 { "LR05",       0, NULL, '5' },
2252                 { "SLR",        0, NULL, 'S' },
2253                 { "LALR",       0, NULL, 'L' },
2254                 { "LR1",        0, NULL, '1' },
2255                 { "tag",        1, NULL, 't' },
2256                 { "report",     0, NULL, 'R' },
2257                 { "output",     1, NULL, 'o' },
2258                 { NULL,         0, NULL, 0   }
2259         };
2260         const char *options = "05SL1t:Ro:";
2261
2262 ###### process arguments
2263         int opt;
2264         char *outfile = NULL;
2265         char *infile;
2266         char *name;
2267         char *tag = NULL;
2268         int report = 1;
2269         enum grammar_type type = LR05;
2270         while ((opt = getopt_long(argc, argv, options,
2271                                   long_options, NULL)) != -1) {
2272                 switch(opt) {
2273                 case '0':
2274                         type = LR0; break;
2275                 case '5':
2276                         type = LR05; break;
2277                 case 'S':
2278                         type = SLR; break;
2279                 case 'L':
2280                         type = LALR; break;
2281                 case '1':
2282                         type = LR1; break;
2283                 case 'R':
2284                         report = 2; break;
2285                 case 'o':
2286                         outfile = optarg; break;
2287                 case 't':
2288                         tag = optarg; break;
2289                 default:
2290                         fprintf(stderr, "Usage: parsergen ...\n");
2291                         exit(1);
2292                 }
2293         }
2294         if (optind < argc)
2295                 infile = argv[optind++];
2296         else {
2297                 fprintf(stderr, "No input file given\n");
2298                 exit(1);
2299         }
2300         if (outfile && report == 1)
2301                 report = 0;
2302         name = outfile;
2303         if (name && strchr(name, '/'))
2304                 name = strrchr(name, '/')+1;
2305
2306         if (optind < argc) {
2307                 fprintf(stderr, "Excess command line arguments\n");
2308                 exit(1);
2309         }
2310
2311 ### Loading the grammar file
2312
2313 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2314 map it.
2315
2316 Once we have extracted the code (with `mdcode`) we expect to find three
2317 sections: header, code, and grammar.  Anything else that is not
2318 excluded by the `--tag` option is an error.
2319
2320 "header" and "code" are optional, though it is hard to build a working
2321 parser with neither. "grammar" must be provided.
2322
2323 ###### includes
2324         #include <fcntl.h>
2325         #include <sys/mman.h>
2326         #include <errno.h>
2327
2328 ###### functions
2329         static int errs;
2330         static void pr_err(char *msg)
2331         {
2332                 errs++;
2333                 fprintf(stderr, "%s\n", msg);
2334         }
2335
2336 ###### load file
2337         struct section *table;
2338         int fd;
2339         int len;
2340         char *file;
2341         fd = open(infile, O_RDONLY);
2342         if (fd < 0) {
2343                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2344                         infile, strerror(errno));
2345                 exit(1);
2346         }
2347         len = lseek(fd, 0, 2);
2348         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2349         table = code_extract(file, file+len, pr_err);
2350
2351         struct code_node *hdr = NULL;
2352         struct code_node *code = NULL;
2353         struct code_node *gram = NULL;
2354         struct code_node *pre_reduce = NULL;
2355         for (s = table; s; s = s->next) {
2356                 struct text sec = s->section;
2357                 if (tag && !strip_tag(&sec, tag))
2358                         continue;
2359                 if (text_is(sec, "header"))
2360                         hdr = s->code;
2361                 else if (text_is(sec, "code"))
2362                         code = s->code;
2363                 else if (text_is(sec, "grammar"))
2364                         gram = s->code;
2365                 else if (text_is(sec, "reduce"))
2366                         pre_reduce = s->code;
2367                 else {
2368                         fprintf(stderr, "Unknown content section: %.*s\n",
2369                                 s->section.len, s->section.txt);
2370                         rv |= 2;
2371                 }
2372         }
2373
2374 ### Processing the input
2375
2376 As we need to append an extention to a filename and then open it for
2377 writing, and we need to do this twice, it helps to have a separate function.
2378
2379 ###### functions
2380
2381         static FILE *open_ext(char *base, char *ext)
2382         {
2383                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2384                 FILE *f;
2385                 strcat(strcpy(fn, base), ext);
2386                 f = fopen(fn, "w");
2387                 free(fn);
2388                 return f;
2389         }
2390
2391 If we can read the grammar, then we analyse and optionally report on it.  If
2392 the report finds conflicts we will exit with an error status.
2393
2394 ###### process input
2395         struct grammar *g = NULL;
2396         if (gram == NULL) {
2397                 fprintf(stderr, "No grammar section provided\n");
2398                 rv |= 2;
2399         } else {
2400                 g = grammar_read(gram);
2401                 if (!g) {
2402                         fprintf(stderr, "Failure to parse grammar\n");
2403                         rv |= 2;
2404                 }
2405         }
2406         if (g) {
2407                 grammar_analyse(g, type);
2408                 if (report)
2409                         if (grammar_report(g, type))
2410                                 rv |= 1;
2411         }
2412
2413 If a "headers" section is defined, we write it out and include a declaration
2414 for the `parse_XX` function so it can be used from separate code.
2415
2416         if (rv == 0 && hdr && outfile) {
2417                 FILE *f = open_ext(outfile, ".h");
2418                 if (f) {
2419                         code_node_print(f, hdr, infile);
2420                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2421                                 name);
2422                         fclose(f);
2423                 } else {
2424                         fprintf(stderr, "Cannot create %s.h\n",
2425                                 outfile);
2426                         rv |= 4;
2427                 }
2428         }
2429
2430 And if all goes well and an output file was provided, we create the `.c`
2431 file with the code section (if any) and the parser tables and function.
2432
2433         if (rv == 0 && outfile) {
2434                 FILE *f = open_ext(outfile, ".c");
2435                 if (f) {
2436                         if (code)
2437                                 code_node_print(f, code, infile);
2438                         gen_parser(f, g, infile, name, pre_reduce);
2439                         fclose(f);
2440                 } else {
2441                         fprintf(stderr, "Cannot create %s.c\n",
2442                                 outfile);
2443                         rv |= 4;
2444                 }
2445         }
2446
2447 And that about wraps it up.  We need to set the locale so that UTF-8 is
2448 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2449
2450 ###### File: parsergen.mk
2451         parsergen : parsergen.o libscanner.o libmdcode.o
2452                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2453
2454 ###### includes
2455         #include <locale.h>
2456
2457 ###### main
2458
2459         int main(int argc, char *argv[])
2460         {
2461                 struct section *s;
2462                 int rv = 0;
2463
2464                 setlocale(LC_ALL,"");
2465
2466                 ## process arguments
2467                 ## load file
2468                 ## process input
2469
2470                 return rv;
2471         }
2472
2473 ## The SHIFT/REDUCE parser
2474
2475 Having analysed the grammar and generated all the tables, we only need the
2476 shift/reduce engine to bring it all together.
2477
2478 ### Goto table lookup
2479
2480 The parser generator has nicely provided us with goto tables sorted by
2481 symbol number.  We need a binary search function to find a symbol in the
2482 table.
2483
2484 ###### parser functions
2485
2486         static int search(const struct state *l, int sym)
2487         {
2488                 int lo = 0;
2489                 int hi = l->go_to_cnt;
2490
2491                 if (hi == 0)
2492                         return -1;
2493                 while (lo + 1 < hi) {
2494                         int mid = (lo + hi) / 2;
2495                         if (l->go_to[mid].sym <= sym)
2496                                 lo = mid;
2497                         else
2498                                 hi = mid;
2499                 }
2500                 if (l->go_to[lo].sym == sym)
2501                         return l->go_to[lo].state;
2502                 else
2503                         return -1;
2504         }
2505
2506 ### The state stack.
2507
2508 The core data structure for the parser is the stack.  This tracks all the
2509 symbols that have been recognised or partially recognised.
2510
2511 The stack usually won't grow very large - maybe a few tens of entries.  So
2512 we dynamically resize and array as required but never bother to shrink it
2513 down again.
2514
2515 We keep the stack as two separate allocations.  One, `asn_stack` stores the
2516 "abstract syntax nodes" which are created by each reduction.  When we call
2517 `do_reduce` we need to pass an array of the `asn`s of the body of the
2518 production, and by keeping a separate `asn` stack, we can just pass a
2519 pointer into this stack.
2520
2521 The other allocation stores all other stack fields of which there are six.
2522 The `state` is the most important one and guides the parsing process.  The
2523 `sym` is nearly unnecessary.  However when we want to free entries from the
2524 `asn_stack`, it helps to know what type they are so we can call the right
2525 freeing function.  The symbol leads us to the right free function through
2526 `do_free`.
2527
2528 The `indents` count tracks the line indents with-in the symbol or
2529 immediately follow it.  These are used to allow indent information to
2530 guide parsing and error recovery.
2531
2532 `since_newline` tracks how many stack frames since the last
2533 start-of-line (whether indented or not).  So if `since_newline` is
2534 zero, then this symbol is at the start of a line.  Similarly
2535 `since_indent` counts the number of states since an indent, it is zero
2536 precisely when `indents` is not zero.
2537
2538 `newline_permitted` keeps track of whether newlines should be ignored
2539 or not.
2540
2541 The stack is most properly seen as alternating states and symbols -
2542 states, like the 'DOT' in items, are between symbols.  Each frame in
2543 our stack holds a state and the symbol that was before it.  The
2544 bottom of stack holds the start state but no symbol, as nothing came
2545 before the beginning.
2546
2547 ###### parser functions
2548
2549         struct parser {
2550                 struct frame {
2551                         short state;
2552                         short newline_permitted;
2553
2554                         short sym;
2555                         short indents;
2556                         short since_newline;
2557                         short since_indent;
2558                 } *stack;
2559                 void **asn_stack;
2560                 int stack_size;
2561                 int tos;
2562         };
2563
2564 #### Shift and pop
2565
2566 Two operations are needed on the stack - shift (which is like push) and pop.
2567
2568 Shift applies not only to terminals but also to non-terminals.  When
2569 we reduce a production we will pop off entries corresponding to the
2570 body symbols, then push on an item for the head of the production.
2571 This last is exactly the same process as shifting in a terminal so we
2572 use the same function for both.  In both cases we provide the symbol,
2573 the number of indents the symbol contains (which will be zero for a
2574 terminal symbol) and a flag indicating the the symbol was at (or was
2575 reduced from a symbol which was at) the start of a line.  The state is
2576 deduced from the current top-of-stack state and the new symbol.
2577
2578 To simplify other code we arrange for `shift` to fail if there is no `goto`
2579 state for the symbol.  This is useful in basic parsing due to our design
2580 that we shift when we can, and reduce when we cannot.  So the `shift`
2581 function reports if it could.
2582
2583 `shift` is also used to push state zero onto the stack, so if the
2584 stack is empty, it always chooses zero as the next state.
2585
2586 So `shift` finds the next state.  If that succeeds it extends the
2587 allocations if needed and pushes all the information onto the stacks.
2588
2589 Newlines are permitted after a `starts_line` state until an internal
2590 indent.  If the new frame has neither a `starts_line` state nor an
2591 indent, newlines are permitted if the previous stack frame permitted
2592 them.
2593
2594 ###### parser functions
2595
2596         static int shift(struct parser *p,
2597                          short sym, short indents, short start_of_line,
2598                          void *asn,
2599                          const struct state states[])
2600         {
2601                 // Push an entry onto the stack
2602                 struct frame next = {0};
2603                 int newstate = p->tos
2604                         ? search(&states[p->stack[p->tos-1].state],
2605                                  sym)
2606                         : 0;
2607                 if (newstate < 0)
2608                         return 0;
2609                 if (p->tos >= p->stack_size) {
2610                         p->stack_size += 10;
2611                         p->stack = realloc(p->stack, p->stack_size
2612                                            * sizeof(p->stack[0]));
2613                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2614                                            * sizeof(p->asn_stack[0]));
2615                 }
2616                 next.sym = sym;
2617                 next.indents = indents;
2618                 next.state = newstate;
2619                 if (states[newstate].starts_line)
2620                         next.newline_permitted = 1;
2621                 else if (indents)
2622                         next.newline_permitted = 0;
2623                 else if (p->tos)
2624                         next.newline_permitted =
2625                                 p->stack[p->tos-1].newline_permitted;
2626                 else
2627                         next.newline_permitted = 0;
2628
2629                 if (!start_of_line) {
2630                         if (p->tos)
2631                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2632                         else
2633                                 next.since_newline = 1;
2634                 }
2635                 if (indents)
2636                         next.since_indent = 0;
2637                 else if (p->tos)
2638                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2639                 else
2640                         next.since_indent = 1;
2641
2642                 p->stack[p->tos] = next;
2643                 p->asn_stack[p->tos] = asn;
2644                 p->tos++;
2645                 return 1;
2646         }
2647
2648 `pop` primarily moves the top of stack (`tos`) back down the required
2649 amount and frees any `asn` entries that need to be freed.  It also
2650 collects a summary of the indents and line starts in the symbols that
2651 are being removed. It is called _after_ we reduce a production, just
2652 before we `shift` the nonterminal in.
2653
2654 ###### parser functions
2655
2656         static int pop(struct parser *p, int num,
2657                        short *start_of_line,
2658                        void(*do_free)(short sym, void *asn))
2659         {
2660                 int i;
2661                 short indents = 0;
2662                 int sol = 0;
2663
2664                 p->tos -= num;
2665                 for (i = 0; i < num; i++) {
2666                         sol |= !p->stack[p->tos+i].since_newline;
2667                         indents += p->stack[p->tos+i].indents;
2668                         do_free(p->stack[p->tos+i].sym,
2669                                 p->asn_stack[p->tos+i]);
2670                 }
2671                 if (start_of_line)
2672                         *start_of_line = sol;
2673                 return indents;
2674         }
2675
2676 ### Memory allocation
2677
2678 The `scanner` returns tokens in a local variable - we want them in allocated
2679 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2680 a reduce is in a large buffer.  Both of these require some allocation and
2681 copying, hence `memdup` and `tokcopy`.
2682
2683 ###### parser includes
2684         #include <memory.h>
2685
2686 ###### parser functions
2687
2688         void *memdup(void *m, int len)
2689         {
2690                 void *ret;
2691                 if (len == 0)
2692                         return NULL;
2693                 ret = malloc(len);
2694                 memcpy(ret, m, len);
2695                 return ret;
2696         }
2697
2698         static struct token *tok_copy(struct token tk)
2699         {
2700                 struct token *new = malloc(sizeof(*new));
2701                 *new = tk;
2702                 return new;
2703         }
2704
2705 ### The heart of the parser.
2706
2707 Now we have the parser.  If we can shift we do, though newlines and
2708 reducing indenting may block that.  If not and we can reduce we do
2709 that.  If the production we reduced was production zero, then we have
2710 accepted the input and can finish.
2711
2712 We return whatever `asn` was returned by reducing production zero.
2713
2714 If we can neither shift nor reduce we have an error to handle.  We pop
2715 single entries off the stack until we can shift the `TK_error` symbol, then
2716 drop input tokens until we find one we can shift into the new error state.
2717
2718 When we find `TK_in` and `TK_out` tokens which report indents we need
2719 to handle them directly as the grammar cannot express what we want to
2720 do with them.
2721
2722 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2723 record how many indents there are following the previous token.
2724
2725 `TK_out` tokens must be canceled against an indent count
2726 within the stack.  If we can reduce some symbols that are all since
2727 the most recent indent, then we do that first.  If the minimum prefix
2728 of the current state then extends back before the most recent indent,
2729 that indent can be cancelled.  If the minimum prefix is shorter then
2730 the indent had ended prematurely and we must start error handling, which
2731 is still a work-in-progress.
2732
2733 `TK_newline` tokens are ignored unless the top stack frame records
2734 that they are permitted.  In that case they will not be considered for
2735 shifting if it is possible to reduce some symbols that are all since
2736 the most recent start of line.  This is how a newline forcibly
2737 terminates any line-like structure - we try to reduce down to at most
2738 one symbol for each line where newlines are allowed.
2739 A consequence of this is that a rule like
2740
2741 ###### Example: newlines - broken
2742
2743         Newlines ->
2744                 | NEWLINE Newlines
2745         IfStatement -> Newlines if ....
2746
2747 cannot work, as the NEWLINE will never be shifted as the empty string
2748 will be reduced first.  Optional sets of newlines need to be include
2749 in the thing that preceed:
2750
2751 ###### Example: newlines - works
2752
2753         If -> if
2754                 | NEWLINE If
2755         IfStatement -> If ....
2756
2757 Here the NEWLINE will be shifted because nothing can be reduced until
2758 the `if` is seen.
2759
2760 When, during error handling, we discard token read in, we want to keep
2761 discarding until we see one that is recognised.  If we had a full set
2762 of LR(1) grammar states, this will mean looking in the look-ahead set,
2763 but we don't keep a full look-ahead set.  We only record the subset
2764 that leads to SHIFT.  We can, however, deduce the look-ahead set but
2765 looking at the SHIFT subsets for all states that we can get to by
2766 reducing zero or more times.  So we need a little function which
2767 checks if a given token is in any of these look-ahead sets.
2768
2769 ###### parser includes
2770         #include "parser.h"
2771
2772 ###### parser_run
2773
2774         static int in_lookahead(struct token *tk, const struct state *states, int state)
2775         {
2776                 while (state >= 0) {
2777                         if (search(&states[state], tk->num) >= 0)
2778                                 return 1;
2779                         if (states[state].reduce_prod < 0)
2780                                 return 0;
2781                         state = search(&states[state], states[state].reduce_sym);
2782                 }
2783                 return 0;
2784         }
2785
2786         void *parser_run(struct token_state *tokens,
2787                          const struct state states[],
2788                          int (*do_reduce)(int, void**, struct token_config*, void*),
2789                          void (*do_free)(short, void*),
2790                          FILE *trace, const char *non_term[],
2791                          struct token_config *config)
2792         {
2793                 struct parser p = { 0 };
2794                 struct token *tk = NULL;
2795                 int accepted = 0;
2796                 void *ret = NULL;
2797
2798                 shift(&p, TK_eof, 0, 1, NULL, states);
2799                 while (!accepted) {
2800                         struct token *err_tk;
2801                         struct frame *tos = &p.stack[p.tos-1];
2802                         if (!tk)
2803                                 tk = tok_copy(token_next(tokens));
2804                         parser_trace(trace, &p,
2805                                      tk, states, non_term, config->known_count);
2806
2807                         if (tk->num == TK_in) {
2808                                 tos->indents += 1;
2809                                 tos->since_newline = 0;
2810                                 tos->since_indent = 0;
2811                                 if (!states[tos->state].starts_line)
2812                                         tos->newline_permitted = 0;
2813                                 free(tk);
2814                                 tk = NULL;
2815                                 parser_trace_action(trace, "Record");
2816                                 continue;
2817                         }
2818                         if (tk->num == TK_out) {
2819                                 if (states[tos->state].reduce_size >= 0 &&
2820                                     states[tos->state].reduce_size <= tos->since_indent)
2821                                         goto force_reduce;
2822                                 if (states[tos->state].min_prefix >= tos->since_indent) {
2823                                         // OK to cancel
2824                                         struct frame *in = tos - tos->since_indent;
2825                                         in->indents -= 1;
2826                                         if (in->indents == 0) {
2827                                                 /* Reassess since_indent and newline_permitted */
2828                                                 if (in > p.stack) {
2829                                                         in->since_indent = in[-1].since_indent + 1;
2830                                                         in->newline_permitted = in[-1].newline_permitted;
2831                                                 } else {
2832                                                         in->since_indent = 0;
2833                                                         in->newline_permitted = 0;
2834                                                 }
2835                                                 if (states[in->state].starts_line)
2836                                                         in->newline_permitted = 1;
2837                                                 while (in < tos) {
2838                                                         in += 1;
2839                                                         in->since_indent = in[-1].since_indent + 1;
2840                                                         if (states[in->state].starts_line)
2841                                                                 in->newline_permitted = 1;
2842                                                         else
2843                                                                 in->newline_permitted = in[-1].newline_permitted;
2844                                                 }
2845                                         }
2846                                         free(tk);
2847                                         tk = NULL;
2848                                         parser_trace_action(trace, "Cancel");
2849                                         continue;
2850                                 }
2851                                 // fall through to error handling as both SHIFT and REDUCE
2852                                 // will fail.
2853                         }
2854                         if (tk->num == TK_newline) {
2855                                 if (!tos->newline_permitted) {
2856                                         free(tk);
2857                                         tk = NULL;
2858                                         parser_trace_action(trace, "Discard");
2859                                         continue;
2860                                 }
2861                                 if (tos->since_newline > 1 &&
2862                                     states[tos->state].reduce_size >= 0 &&
2863                                     states[tos->state].reduce_size <= tos->since_newline)
2864                                         goto force_reduce;
2865                         }
2866                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2867                                 tk = NULL;
2868                                 parser_trace_action(trace, "Shift");
2869                                 continue;
2870                         }
2871                 force_reduce:
2872                         if (states[tos->state].reduce_prod >= 0 &&
2873                             states[tos->state].newline_only &&
2874                             !(tk->num == TK_newline ||
2875                               tk->num == TK_eof ||
2876                               tk->num == TK_out ||
2877                               (tos->indents == 0 && tos->since_newline == 0))) {
2878                                 /* Anything other than newline or out or eof
2879                                  * in an error unless we are already at start
2880                                  * of line, as this production must end at EOL.
2881                                  */
2882                         } else if (states[tos->state].reduce_prod >= 0) {
2883                                 void **body;
2884                                 void *res;
2885                                 const struct state *nextstate = &states[tos->state];
2886                                 int prod = nextstate->reduce_prod;
2887                                 int size = nextstate->reduce_size;
2888                                 int bufsize;
2889                                 static char buf[16*1024];
2890                                 short indents, start_of_line;
2891
2892                                 body = p.asn_stack + (p.tos - size);
2893
2894                                 bufsize = do_reduce(prod, body, config, buf);
2895
2896                                 indents = pop(&p, size, &start_of_line,
2897                                               do_free);
2898                                 res = memdup(buf, bufsize);
2899                                 memset(buf, 0, bufsize);
2900                                 if (!shift(&p, nextstate->reduce_sym,
2901                                            indents, start_of_line,
2902                                            res, states)) {
2903                                         if (prod != 0) abort();
2904                                         accepted = 1;
2905                                         ret = res;
2906                                 }
2907                                 parser_trace_action(trace, "Reduce");
2908                                 continue;
2909                         }
2910                         /* Error. We walk up the stack until we
2911                          * find a state which will accept TK_error.
2912                          * We then shift in TK_error and see what state
2913                          * that takes us too.
2914                          * Then we discard input tokens until
2915                          * we find one that is acceptable.
2916                          */
2917                         parser_trace_action(trace, "ERROR");
2918                         short indents = 0, start_of_line;
2919
2920                         err_tk = tok_copy(*tk);
2921                         while (p.tos > 0 &&
2922                                shift(&p, TK_error, 0, 0,
2923                                      err_tk, states) == 0)
2924                                 // discard this state
2925                                 indents += pop(&p, 1, &start_of_line, do_free);
2926                         if (p.tos == 0) {
2927                                 free(err_tk);
2928                                 // no state accepted TK_error
2929                                 break;
2930                         }
2931                         tos = &p.stack[p.tos-1];
2932                         while (!in_lookahead(tk, states, tos->state) &&
2933                                tk->num != TK_eof) {
2934                                 free(tk);
2935                                 tk = tok_copy(token_next(tokens));
2936                                 if (tk->num == TK_in)
2937                                         indents += 1;
2938                                 if (tk->num == TK_out) {
2939                                         if (indents == 0)
2940                                                 break;
2941                                         indents -= 1;
2942                                         // FIXME update since_indent here
2943                                 }
2944                         }
2945                         tos->indents += indents;
2946                 }
2947                 free(tk);
2948                 pop(&p, p.tos, NULL, do_free);
2949                 free(p.asn_stack);
2950                 free(p.stack);
2951                 return ret;
2952         }
2953
2954 ###### exported functions
2955         void *parser_run(struct token_state *tokens,
2956                          const struct state states[],
2957                          int (*do_reduce)(int, void**, struct token_config*, void*),
2958                          void (*do_free)(short, void*),
2959                          FILE *trace, const char *non_term[],
2960                          struct token_config *config);
2961
2962 ### Tracing
2963
2964 Being able to visualize the parser in action can be invaluable when
2965 debugging the parser code, or trying to understand how the parse of a
2966 particular grammar progresses.  The stack contains all the important
2967 state, so just printing out the stack every time around the parse loop
2968 can make it possible to see exactly what is happening.
2969
2970 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2971 are easily deduced from the change between consecutive lines, and the
2972 details of each state can be found by cross referencing the states list
2973 in the stack with the "report" that parsergen can generate.
2974
2975 For terminal symbols, we just dump the token.  For non-terminals we
2976 print the name of the symbol.  The look ahead token is reported at the
2977 end inside square brackets.
2978
2979 ###### parser functions
2980
2981         static char *reserved_words[] = {
2982                 [TK_error]        = "ERROR",
2983                 [TK_in]           = "IN",
2984                 [TK_out]          = "OUT",
2985                 [TK_newline]      = "NEWLINE",
2986                 [TK_eof]          = "$eof",
2987         };
2988         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2989         {
2990                 fprintf(trace, "(%d", f->state);
2991                 if (states[f->state].starts_line)
2992                         fprintf(trace, "s");
2993                 if (f->newline_permitted)
2994                         fprintf(trace, "n%d", f->since_newline);
2995                 fprintf(trace, ") ");
2996         }
2997
2998         void parser_trace(FILE *trace, struct parser *p,
2999                           struct token *tk, const struct state states[],
3000                           const char *non_term[], int knowns)
3001         {
3002                 int i;
3003                 if (!trace)
3004                         return;
3005                 for (i = 0; i < p->tos; i++) {
3006                         struct frame *f = &p->stack[i];
3007                         if (i) {
3008                                 int sym = f->sym;
3009                                 if (sym < TK_reserved &&
3010                                     reserved_words[sym] != NULL)
3011                                         fputs(reserved_words[sym], trace);
3012                                 else if (sym < TK_reserved + knowns) {
3013                                         struct token *t = p->asn_stack[i];
3014                                         text_dump(trace, t->txt, 20);
3015                                 } else
3016                                         fputs(non_term[sym - TK_reserved - knowns],
3017                                               trace);
3018                                 if (f->indents)
3019                                         fprintf(trace, ".%d", f->indents);
3020                                 if (f->since_newline == 0)
3021                                         fputs("/", trace);
3022                                 fputs(" ", trace);
3023                         }
3024                         parser_trace_state(trace, f, states);
3025                 }
3026                 fprintf(trace, "[");
3027                 if (tk->num < TK_reserved &&
3028                     reserved_words[tk->num] != NULL)
3029                         fputs(reserved_words[tk->num], trace);
3030                 else
3031                         text_dump(trace, tk->txt, 20);
3032                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3033         }
3034
3035         void parser_trace_action(FILE *trace, char *action)
3036         {
3037                 if (trace)
3038                         fprintf(trace, " - %s\n", action);
3039         }
3040
3041 # A Worked Example
3042
3043 The obvious example for a parser is a calculator.
3044
3045 As `scanner` provides number parsing function using `libgmp` is it not much
3046 work to perform arbitrary rational number calculations.
3047
3048 This calculator takes one expression, or an equality test, per line.  The
3049 results are printed and if any equality test fails, the program exits with
3050 an error.
3051
3052 ###### File: parsergen.mk
3053         calc.c calc.h : parsergen parsergen.mdc
3054                 ./parsergen --tag calc -o calc parsergen.mdc
3055         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3056                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3057         calctest : calc
3058                 ./calc parsergen.mdc
3059         demos :: calctest
3060
3061 # calc: header
3062
3063         #include "parse_number.h"
3064         // what do we use for a demo-grammar?  A calculator of course.
3065         struct number {
3066                 mpq_t val;
3067                 char tail[2];
3068                 int err;
3069         };
3070
3071 # calc: code
3072
3073         #include <stdlib.h>
3074         #include <unistd.h>
3075         #include <fcntl.h>
3076         #include <sys/mman.h>
3077         #include <stdio.h>
3078         #include <malloc.h>
3079         #include <gmp.h>
3080         #include <string.h>
3081         #include "mdcode.h"
3082         #include "scanner.h"
3083         #include "parser.h"
3084
3085         #include "calc.h"
3086
3087         static void free_number(struct number *n)
3088         {
3089                 mpq_clear(n->val);
3090                 free(n);
3091         }
3092
3093         static int text_is(struct text t, char *s)
3094         {
3095                 return (strlen(s) == t.len &&
3096                         strncmp(s, t.txt, t.len) == 0);
3097         }
3098
3099         int main(int argc, char *argv[])
3100         {
3101                 int fd = open(argv[1], O_RDONLY);
3102                 int len = lseek(fd, 0, 2);
3103                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3104                 struct section *table = code_extract(file, file+len, NULL);
3105                 struct section *s;
3106                 struct token_config config = {
3107                         .ignored = (1 << TK_line_comment)
3108                                  | (1 << TK_block_comment)
3109                                  | (1 << TK_in)
3110                                  | (1 << TK_out),
3111                         .number_chars = ".,_+-",
3112                         .word_start = "",
3113                         .word_cont = "",
3114                 };
3115                 for (s = table; s; s = s->next)
3116                         if (text_is(s->section, "example: input"))
3117                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3118                 while (table) {
3119                         struct section *t = table->next;
3120                         code_free(table->code);
3121                         free(table);
3122                         table = t;
3123                 }
3124                 exit(0);
3125         }
3126
3127 # calc: grammar
3128
3129         $LEFT + -
3130         $LEFT * /
3131
3132         Session -> Session Line
3133                 | Line
3134
3135         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3136                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3137                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3138                                         mpf_clear(fl);
3139                                         }
3140                                      }$
3141                 | Expression = Expression NEWLINE ${
3142                         if (mpq_equal($1.val, $3.val))
3143                                 gmp_printf("Both equal %Qd\n", $1.val);
3144                         else {
3145                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3146                                         $1.val, $3.val);
3147                                 exit(1);
3148                         }
3149                 }$
3150                 | NEWLINE ${ printf("Blank line\n"); }$
3151                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3152
3153         $number
3154         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3155                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3156                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3157                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3158                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3159                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3160
3161 # example: input
3162
3163         355/113
3164         3.1415926535 - 355/113
3165         2 + 4 * 5
3166         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3167         10 * 9 / 2
3168         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3169
3170         error