ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8
   9 There are several distinct sections.
  10
  11 1. `grammar_read` will read a grammar from a literate-code file and
  12    store the productions, symbols, and code fragments.
  13 2. `grammar_analyse` will take that grammar and build LR parsing
  14    states and look-ahead tables.
  15 3. `grammar_report` will present the details of the analysis
  16    in a readable format and will report any conflicts.
  17 4. `parser_generate` will write out C code files with various
  18    tables and with the code fragments arranged in useful places.
  19 5. `parser_run` is a library function intended to be linked together
  20    with the generated parser tables to complete the implementation of
  21    a parser.
  22 6. Finally `calc` is a test grammar for a simple calculator.  The
  23    `parsergen` program built from the C code in this file can extract
  24    that grammar directly from this file and process it.
  25
  26 ###### File: parsergen.c
  27         #include <unistd.h>
  28         #include <stdlib.h>
  29         #include <stdio.h>
  30         ## includes
  31         ## forward declarations
  32         ## declarations
  33         ## functions
  34         ## grammar_read
  35         ## grammar_analyse
  36         ## grammar_report
  37         ## parser_generate
  38         ## main
  39 ###### File: parser.h
  40         ## exported types
  41         ## exported functions
  42 ###### File: libparser.c
  43         #include <unistd.h>
  44         #include <stdlib.h>
  45         #include <stdio.h>
  46         ## parser includes
  47         ## parser functions
  48         ## parser_run
  49 ###### File: parsergen.mk
  50         CFLAGS += -Wall -g
  51         all :: parsergen calc
  52         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  53                 ./md2c parsergen.mdc
  54
  55 ## Reading the grammar
  56
  57 The grammar must be stored in a markdown literate code file as parsed by
  58 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  59 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  60 present, will be literally copied into the generated `.c` and `.h`
  61 files.  The third is required and contains the grammar which is
  62 tokenised with "[scanner][]".  `reduce` can provide variable
  63 declarations and common intialization code for all the reduction code
  64 fragments in the grammar.
  65
  66 If the `--tag` option is given, then any top level header that doesn't
  67 start with the tag is ignored, and the tag is striped from the rest.  So
  68 `--tag Foo` means that the four sections expected are `Foo: header`,
  69 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  70 to extract the sample calculator from this file.
  71
  72 [mdcode]: mdcode.html
  73 [scanner]: scanner.html
  74
  75 ###### includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 ###### parser includes
  80         #include "mdcode.h"
  81         #include "scanner.h"
  82
  83 The grammar contains production sets, precedence/associativity
  84 declarations, general terminal declarations, and data type declarations.
  85 These are all parsed with _ad hoc_ parsing as we don't have a parser
  86 generator yet.
  87
  88 The precedence and associativity can be set for each production, and
  89 can be inherited from symbols.  The data type (either structure or a
  90 reference to a structure) is potentially defined for each non-terminal
  91 and describes what C structure is used to store information about each
  92 symbol.
  93
  94 ###### declarations
  95         enum assoc {Left, Right, Non};
  96         char *assoc_names[] = {"Left","Right","Non"};
  97
  98         struct symbol {
  99                 struct text struct_name;
 100                 int isref;
 101                 enum assoc assoc;
 102                 unsigned short precedence;
 103                 ## symbol fields
 104         };
 105         struct production {
 106                 unsigned short precedence;
 107                 enum assoc assoc;
 108                 ## production fields
 109         };
 110         struct grammar {
 111                 ## grammar fields
 112         };
 113
 114 The strings reported by `mdcode` and `scanner` are `struct text` which
 115 have length rather than being nul terminated.  To help with printing and
 116 comparing we define `text_is` and `prtxt`, which should possibly go in
 117 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 118 strings which might contain control characters.
 119
 120 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 121 because that is what we need to detect tags.
 122
 123 ###### functions
 124         static int text_is(struct text t, char *s)
 125         {
 126                 return (strlen(s) == t.len &&
 127                         strncmp(s, t.txt, t.len) == 0);
 128         }
 129         static void prtxt(struct text t)
 130         {
 131                 printf("%.*s", t.len, t.txt);
 132         }
 133
 134         static int strip_tag(struct text *t, char *tag)
 135         {
 136                 int skip = strlen(tag) + 1;
 137                 if (skip >= t->len ||
 138                     strncmp(t->txt, tag, skip-1) != 0 ||
 139                     t->txt[skip-1] != ':')
 140                         return 0;
 141                 while (skip < t->len && t->txt[skip] == ' ')
 142                         skip++;
 143                 t->len -= skip;
 144                 t->txt += skip;
 145                 return 1;
 146         }
 147
 148 ### Symbols
 149
 150 Productions are comprised primarily of symbols - terminal and
 151 non-terminal.  We do not make a syntactic distinction between the two,
 152 though non-terminals must be identifiers.  Non-terminal symbols are
 153 those which appear in the head of a production, terminal symbols are
 154 those which don't.  There are also "virtual" symbols used for precedence
 155 marking discussed later, and sometimes we won't know what type a symbol
 156 is yet.
 157
 158 To help with code safety it is possible to declare the terminal symbols.
 159 If this is done, then any symbol used in a production that does not
 160 appear in the head of any production and is not declared as a terminal
 161 is treated as an error.
 162
 163 ###### forward declarations
 164         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 165         char *symtypes = "UVTN";
 166 ###### symbol fields
 167         enum symtype type;
 168 ###### grammar fields
 169         int terminals_declared;
 170
 171 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 172 table of known symbols and the resulting parser will report them as
 173 `TK_reserved + N`.  A small set of identifiers are reserved for the
 174 different token types that `scanner` can report.
 175
 176 ###### declarations
 177
 178         static struct { int num; char *name; } reserved_words[] = {
 179                 { TK_error,        "ERROR" },
 180                 { TK_number,       "NUMBER" },
 181                 { TK_ident,        "IDENTIFIER" },
 182                 { TK_mark,         "MARK" },
 183                 { TK_string,       "STRING" },
 184                 { TK_multi_string, "MULTI_STRING" },
 185                 { TK_in,           "IN" },
 186                 { TK_out,          "OUT" },
 187                 { TK_newline,      "NEWLINE" },
 188                 { TK_eof,          "$eof" },
 189         };
 190 ###### symbol fields
 191         short num;
 192
 193 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 194 recognised.  The former is automatically expected at the end of the text
 195 being parsed. The latter are always ignored by the parser.
 196
 197 All of these symbols are stored in a simple symbol table.  We use the
 198 `struct text` from `mdcode` to store the name and link them together
 199 into a sorted list using an insertion sort.
 200
 201 We don't have separate `find` and `insert` functions as any symbol we
 202 find needs to be remembered.  We simply expect `find` to always return a
 203 symbol, but its type might be `Unknown`.
 204
 205 ###### includes
 206         #include <string.h>
 207
 208 ###### symbol fields
 209         struct text name;
 210         struct symbol *next;
 211
 212 ###### grammar fields
 213         struct symbol *syms;
 214         int num_syms;
 215
 216 ###### functions
 217         static struct symbol *sym_find(struct grammar *g, struct text s)
 218         {
 219                 struct symbol **l = &g->syms;
 220                 struct symbol *n;
 221                 int cmp = 1;
 222
 223                 while (*l &&
 224                         (cmp = text_cmp((*l)->name, s)) < 0)
 225                                 l = & (*l)->next;
 226                 if (cmp == 0)
 227                         return *l;
 228                 n = calloc(1, sizeof(*n));
 229                 n->name = s;
 230                 n->type = Unknown;
 231                 n->next = *l;
 232                 n->num = -1;
 233                 *l = n;
 234                 return n;
 235         }
 236
 237         static void symbols_init(struct grammar *g)
 238         {
 239                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 240                 int i;
 241                 for (i = 0; i < entries; i++) {
 242                         struct text t;
 243                         struct symbol *s;
 244                         t.txt = reserved_words[i].name;
 245                         t.len = strlen(t.txt);
 246                         s = sym_find(g, t);
 247                         s->type = Terminal;
 248                         s->num = reserved_words[i].num;
 249                 }
 250         }
 251
 252 ### Data types and precedence.
 253
 254 Data type specification, precedence specification, and declaration of
 255 terminals are all introduced by a dollar sign at the start of the line.
 256 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 257 precedence, if it is `TERM` then the line declares terminals without
 258 precedence, otherwise it specifies a data type.
 259
 260 The data type name is simply stored and applied to the head of all
 261 subsequent productions.  It must be the name of a structure optionally
 262 preceded by an asterisk which means a reference or "pointer".  So
 263 `$expression` maps to `struct expression` and `$*statement` maps to
 264 `struct statement *`.
 265
 266 Any productions given before the first data type declaration will have
 267 no data type associated with them and can carry no information.  In
 268 order to allow other non-terminals to have no type, the data type
 269 `$void` can be given.  This does *not* mean that `struct void` will be
 270 used, but rather than no type will be associated with subsequent
 271 non-terminals.
 272
 273 The precedence line must contain a list of symbols, either terminal
 274 symbols or virtual symbols.  It can only contain symbols that have not
 275 been seen yet, so precedence declaration must precede the productions
 276 that they affect.
 277
 278 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 279 carry precedence information but are not included in the grammar.  A
 280 production can declare that it inherits the precedence of a given
 281 virtual symbol.
 282
 283 This use for `$$` precludes it from being used as a symbol in the
 284 described language.  Two other symbols: `${` and `}$` are also
 285 unavailable.
 286
 287 Each new precedence line introduces a new precedence level and
 288 declares how it associates.  This level is stored in each symbol
 289 listed and may be inherited by any production which uses the symbol.  A
 290 production inherits from the last symbol which has a precedence.
 291
 292 The symbols on the first precedence line have the lowest precedence.
 293 Subsequent lines introduce symbols with higher precedence and so bind
 294 more tightly.
 295
 296 Note that a declaration line (or "dollar line") can start with either of
 297 two different marks: "$" or "$*".  The `dollar_line()` function defined
 298 here is told which was found in the `isref` argument.
 299
 300 ###### grammar fields
 301         struct text current_type;
 302         int type_isref;
 303         int prec_levels;
 304
 305 ###### declarations
 306         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 307         static const char *known[] = { "$$", "${", "}$" };
 308
 309 ###### functions
 310         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 311         {
 312                 struct token t = token_next(ts);
 313                 char *err;
 314                 enum assoc assoc;
 315                 int term = 0;
 316                 int found;
 317
 318                 if (t.num != TK_ident) {
 319                         err = "type or assoc expected after '$'";
 320                         goto abort;
 321                 }
 322                 if (text_is(t.txt, "LEFT"))
 323                         assoc = Left;
 324                 else if (text_is(t.txt, "RIGHT"))
 325                         assoc = Right;
 326                 else if (text_is(t.txt, "NON"))
 327                         assoc = Non;
 328                 else if (text_is(t.txt, "TERM")) {
 329                         term = 1;
 330                         g->terminals_declared = 1;
 331                 } else {
 332                         g->current_type = t.txt;
 333                         g->type_isref = isref;
 334                         if (text_is(t.txt, "void"))
 335                                 g->current_type.txt = NULL;
 336                         t = token_next(ts);
 337                         if (t.num != TK_newline) {
 338                                 err = "Extra tokens after type name";
 339                                 goto abort;
 340                         }
 341                         return NULL;
 342                 }
 343
 344                 if (isref) {
 345                         err = "$* cannot be followed by a precedence";
 346                         goto abort;
 347                 }
 348
 349                 // This is a precedence or TERM line, need some symbols.
 350                 found = 0;
 351                 g->prec_levels += 1;
 352                 t = token_next(ts);
 353                 while (t.num != TK_newline) {
 354                         enum symtype type = Terminal;
 355                         struct symbol *s;
 356                         if (t.num == TK_virtual) {
 357                                 type = Virtual;
 358                                 t = token_next(ts);
 359                                 if (t.num != TK_ident) {
 360                                         err = "$$ must be followed by a word";
 361                                         goto abort;
 362                                 }
 363                                 if (term) {
 364                                         err = "Virtual symbols not permitted on $TERM line";
 365                                         goto abort;
 366                                 }
 367                         } else if (t.num != TK_ident &&
 368                                    t.num != TK_mark) {
 369                                 err = "Illegal token in precedence line";
 370                                 goto abort;
 371                         }
 372                         s = sym_find(g, t.txt);
 373                         if (s->type != Unknown) {
 374                                 err = "Symbols in precedence/TERM line must not already be known.";
 375                                 goto abort;
 376                         }
 377                         s->type = type;
 378                         if (!term) {
 379                                 s->precedence = g->prec_levels;
 380                                 s->assoc = assoc;
 381                         }
 382                         found += 1;
 383                         t = token_next(ts);
 384                 }
 385                 if (found == 0)
 386                         err = "No symbols given on precedence/TERM line";
 387                         goto abort;
 388                 return NULL;
 389         abort:
 390                 while (t.num != TK_newline && t.num != TK_eof)
 391                         t = token_next(ts);
 392                 return err;
 393         }
 394
 395 ### Productions
 396
 397 A production either starts with an identifier which is the head
 398 non-terminal, or a vertical bar (`|`) in which case this production
 399 uses the same head as the previous one.  The identifier must be
 400 followed by a `->` mark.  All productions for a given non-terminal must
 401 be grouped together with the `nonterminal ->` given only once.
 402
 403 After this a (possibly empty) sequence of identifiers and marks form
 404 the body of the production.  A virtual symbol may be given after the
 405 body by preceding it with `$$`.  If a virtual symbol is given, the
 406 precedence of the production is that for the virtual symbol.  If none
 407 is given, the precedence is inherited from the last symbol in the
 408 production which has a precedence specified.
 409
 410 After the optional precedence may come the `${` mark.  This indicates
 411 the start of a code fragment.  If present, this must be on the same
 412 line as the start of the production.
 413
 414 All of the text from the `${` through to the matching `}$` is
 415 collected and forms the code-fragment for the production.  It must all
 416 be in one `code_node` of the literate code.  The `}$` must be
 417 at the end of a line.
 418
 419 Text in the code fragment will undergo substitutions where `$N` or
 420 `$<N`,for some numeric `N` (or non-numeric indicator as described
 421 later), will be replaced with a variable holding the parse information
 422 for the particular symbol in the production.  `$0` is the head of the
 423 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 424 for a terminal symbol is `struct token`.  For a non-terminal, it is
 425 whatever has been declared for that symbol.  The `<` may be included and
 426 means that the value (usually a reference) is being moved out, so it
 427 will not automatically be freed.  The effect of using '<' is that the
 428 variable is cleared to all-zeros.
 429
 430 While building productions we will need to add to an array which needs to
 431 grow dynamically.
 432
 433 ###### functions
 434         static void array_add(void *varray, int *cnt, void *new)
 435         {
 436                 void ***array = varray;
 437                 int current = 0;
 438                 const int step = 8;
 439                 current = ((*cnt-1) | (step-1))+1;
 440                 if (*cnt == current) {
 441                         /* must grow */
 442                         current += step;
 443                         *array = realloc(*array, current * sizeof(void*));
 444                 }
 445                 (*array)[*cnt] = new;
 446                 (*cnt) += 1;
 447         }
 448
 449 Collecting the code fragment simply involves reading tokens until we
 450 hit the end token `}$`, and noting the character position of the start and
 451 the end.
 452
 453 ###### functions
 454         static struct text collect_code(struct token_state *state,
 455                                         struct token start)
 456         {
 457                 struct text code;
 458                 struct token t;
 459                 code.txt = start.txt.txt + start.txt.len;
 460                 do
 461                         t = token_next(state);
 462                 while (t.node == start.node &&
 463                        t.num != TK_close && t.num != TK_error &&
 464                        t.num != TK_eof);
 465                 if (t.num == TK_close && t.node == start.node)
 466                         code.len = t.txt.txt - code.txt;
 467                 else
 468                         code.txt = NULL;
 469                 return code;
 470         }
 471
 472 Now we have all the bits we need to parse a full production.
 473
 474 ###### includes
 475         #include <memory.h>
 476
 477 ###### grammar fields
 478         struct production **productions;
 479         int production_count;
 480
 481 ###### production fields
 482         struct symbol  *head;
 483         struct symbol **body;
 484         int             body_size;
 485         struct text     code;
 486         int             code_line;
 487
 488 ###### symbol fields
 489         int first_production;
 490
 491 ###### functions
 492         static char *parse_production(struct grammar *g,
 493                                       struct symbol *head,
 494                                       struct token_state *state)
 495         {
 496                 /* Head has already been parsed. */
 497                 struct token tk;
 498                 char *err;
 499                 struct production p, *pp;
 500
 501                 memset(&p, 0, sizeof(p));
 502                 p.head = head;
 503                 tk = token_next(state);
 504                 while (tk.num == TK_ident || tk.num == TK_mark) {
 505                         struct symbol *bs = sym_find(g, tk.txt);
 506                         if (bs->type == Unknown) {
 507                                 if (!g->terminals_declared)
 508                                         bs->type = Terminal;
 509                         }
 510                         if (bs->type == Virtual) {
 511                                 err = "Virtual symbol not permitted in production";
 512                                 goto abort;
 513                         }
 514                         if (bs->precedence) {
 515                                 p.precedence = bs->precedence;
 516                                 p.assoc = bs->assoc;
 517                         }
 518                         array_add(&p.body, &p.body_size, bs);
 519                         tk = token_next(state);
 520                 }
 521                 if (tk.num == TK_virtual) {
 522                         struct symbol *vs;
 523                         tk = token_next(state);
 524                         if (tk.num != TK_ident) {
 525                                 err = "word required after $$";
 526                                 goto abort;
 527                         }
 528                         vs = sym_find(g, tk.txt);
 529                         if (vs->precedence == 0) {
 530                                 err = "symbol after $$ must have precedence";
 531                                 goto abort;
 532                         } else {
 533                                 p.precedence = vs->precedence;
 534                                 p.assoc = vs->assoc;
 535                         }
 536                         tk = token_next(state);
 537                 }
 538                 if (tk.num == TK_open) {
 539                         p.code_line = tk.line;
 540                         p.code = collect_code(state, tk);
 541                         if (p.code.txt == NULL) {
 542                                 err = "code fragment not closed properly";
 543                                 goto abort;
 544                         }
 545                         tk = token_next(state);
 546                 }
 547
 548                 if (tk.num != TK_newline && tk.num != TK_eof) {
 549                         err = "stray tokens at end of line";
 550                         goto abort;
 551                 }
 552                 pp = malloc(sizeof(*pp));
 553                 *pp = p;
 554                 array_add(&g->productions, &g->production_count, pp);
 555                 return NULL;
 556         abort:
 557                 while (tk.num != TK_newline && tk.num != TK_eof)
 558                         tk = token_next(state);
 559                 free(p.body);
 560                 return err;
 561         }
 562
 563 With the ability to parse productions and dollar-lines, we have nearly all
 564 that we need to parse a grammar from a `code_node`.
 565
 566 The head of the first production will effectively be the `start` symbol
 567 of the grammar.  However it won't _actually_ be so.  Processing the
 568 grammar is greatly simplified if the real start symbol only has a single
 569 production, and expects `$eof` as the final terminal.  So when we find
 570 the first explicit production we insert an extra implicit production as
 571 production zero which looks like
 572
 573 ###### Example: production 0
 574         $start -> START $eof
 575
 576 where `START` is the first non-terminal given.
 577
 578 ###### create production zero
 579         struct production *p = calloc(1,sizeof(*p));
 580         struct text start = {"$start",6};
 581         struct text eof = {"$eof",4};
 582         struct text code = {"$0 = $<1;", 9};
 583         p->head = sym_find(g, start);
 584         p->head->type = Nonterminal;
 585         p->head->struct_name = g->current_type;
 586         p->head->isref = g->type_isref;
 587         if (g->current_type.txt)
 588                 p->code = code;
 589         array_add(&p->body, &p->body_size, head);
 590         array_add(&p->body, &p->body_size, sym_find(g, eof));
 591         p->head->first_production = g->production_count;
 592         array_add(&g->productions, &g->production_count, p);
 593
 594 Now we are ready to read in the grammar.  We ignore comments
 595 and strings so that the marks which introduce them can be
 596 used as terminals in the grammar.  We don't ignore numbers
 597 even though we don't allow them as that causes the scanner
 598 to produce errors that the parser is better positioned to handle.
 599 We also ignore indentation, but we expect and use newlines.
 600
 601 To allow comments in the grammar, we explicitly check for "//" as the
 602 first token on the line and those lines are skipped.  "//" can still be
 603 used as a terminal anywhere that a terminal is expected.
 604
 605 ###### grammar_read
 606         static struct grammar *grammar_read(struct code_node *code)
 607         {
 608                 struct token_config conf = {
 609                         .word_start = "",
 610                         .word_cont = "",
 611                         .words_marks = known,
 612                         .known_count = sizeof(known)/sizeof(known[0]),
 613                         .number_chars = "",
 614                         .ignored = (1 << TK_line_comment)
 615                                  | (1 << TK_block_comment)
 616                                  | (0 << TK_number)
 617                                  | (1 << TK_string)
 618                                  | (1 << TK_multi_string)
 619                                  | (1 << TK_in)
 620                                  | (1 << TK_out),
 621                 };
 622
 623                 struct token_state *state = token_open(code, &conf);
 624                 struct token tk;
 625                 struct symbol *head = NULL;
 626                 struct grammar *g;
 627                 char *err = NULL;
 628
 629                 g = calloc(1, sizeof(*g));
 630                 symbols_init(g);
 631
 632                 for (tk = token_next(state); tk.num != TK_eof;
 633                      tk = token_next(state)) {
 634                         if (tk.num == TK_newline)
 635                                 continue;
 636                         if (tk.num == TK_ident) {
 637                                 // new non-terminal
 638                                 head = sym_find(g, tk.txt);
 639                                 if (head->type == Nonterminal)
 640                                         err = "This non-terminal has already be used.";
 641                                 else if (head->type == Virtual)
 642                                         err = "Virtual symbol not permitted in head of production";
 643                                 else {
 644                                         head->type = Nonterminal;
 645                                         head->struct_name = g->current_type;
 646                                         head->isref = g->type_isref;
 647                                         if (g->production_count == 0) {
 648                                                 ## create production zero
 649                                         }
 650                                         head->first_production = g->production_count;
 651                                         tk = token_next(state);
 652                                         if (tk.num == TK_mark &&
 653                                             text_is(tk.txt, "->"))
 654                                                 err = parse_production(g, head, state);
 655                                         else
 656                                                 err = "'->' missing in production";
 657                                 }
 658                         } else if (tk.num == TK_mark
 659                                    && text_is(tk.txt, "|")) {
 660                                 // another production for same non-term
 661                                 if (head)
 662                                         err = parse_production(g, head, state);
 663                                 else
 664                                         err = "First production must have a head";
 665                         } else if (tk.num == TK_mark
 666                                    && text_is(tk.txt, "$")) {
 667                                 err = dollar_line(state, g, 0);
 668                         } else if (tk.num == TK_mark
 669                                    && text_is(tk.txt, "$*")) {
 670                                 err = dollar_line(state, g, 1);
 671                         } else if (tk.num == TK_mark
 672                                    && text_is(tk.txt, "//")) {
 673                                 while (tk.num != TK_newline &&
 674                                        tk.num != TK_eof)
 675                                         tk = token_next(state);
 676                         } else {
 677                                 err = "Unrecognised token at start of line.";
 678                         }
 679                         if (err)
 680                                 goto abort;
 681                 }
 682                 token_close(state);
 683                 if (g->terminals_declared) {
 684                         struct symbol *s;
 685                         int errs = 0;
 686                         for (s = g->syms; s; s = s->next) {
 687                                 if (s->type != Unknown)
 688                                         continue;
 689                                 errs += 1;
 690                                 fprintf(stderr, "Token %.*s not declared\n",
 691                                         s->name.len, s->name.txt);
 692                         }
 693                         if (errs) {
 694                                 free(g); // FIXME free content
 695                                 g = NULL;
 696                         }
 697                 }
 698                 return g;
 699         abort:
 700                 fprintf(stderr, "Error at line %d: %s\n",
 701                         tk.line, err);
 702                 token_close(state);
 703                 free(g); // FIXME free content
 704                 return NULL;
 705         }
 706
 707 ## Analysing the grammar
 708
 709 The central task in analysing the grammar is to determine a set of
 710 states to drive the parsing process.  This will require finding various
 711 sets of symbols and of "items".  Some of these sets will need to attach
 712 information to each element in the set, so they are more like maps.
 713
 714 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 715 Many of these will be the same as each other.  To avoid memory wastage,
 716 and to simplify some comparisons of sets, these sets will be stored in a
 717 list of unique sets, each assigned a number.
 718
 719 Once we have the data structures in hand to manage these sets and lists,
 720 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
 721 then create the item sets which define the various states.
 722
 723 ### Symbol sets.
 724
 725 Though we don't only store symbols in these sets, they are the main
 726 things we store, so they are called symbol sets or "symsets".
 727
 728 A symset has a size, an array of shorts, and an optional array of data
 729 values, which are also shorts.  If the array of data is not present, we
 730 store an impossible pointer, as `NULL` is used to indicate that no
 731 memory has been allocated yet;
 732
 733 ###### declarations
 734         struct symset {
 735                 short cnt;
 736                 unsigned short *syms, *data;
 737         };
 738         #define NO_DATA ((unsigned short *)1)
 739         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 740         const struct symset INIT_DATASET = { 0, NULL, NULL };
 741
 742 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 743 using an insertion sort.  We don't explicitly record the amount of
 744 allocated space as it can be derived directly from the current `cnt`
 745 using `((cnt - 1) | 7) + 1`.
 746
 747 ###### functions
 748         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 749         {
 750                 int i;
 751                 int current = ((s->cnt-1) | 7) + 1;
 752                 if (current == s->cnt) {
 753                         current += 8;
 754                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 755                         if (s->data != NO_DATA)
 756                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 757                 }
 758                 i = s->cnt;
 759                 while (i > 0 && s->syms[i-1] > key) {
 760                         s->syms[i] = s->syms[i-1];
 761                         if (s->data != NO_DATA)
 762                                 s->data[i] = s->data[i-1];
 763                         i--;
 764                 }
 765                 s->syms[i] = key;
 766                 if (s->data != NO_DATA)
 767                         s->data[i] = val;
 768                 s->cnt += 1;
 769         }
 770
 771 Finding a symbol (or item) in a `symset` uses a simple binary search.
 772 We return the index where the value was found (so data can be accessed),
 773 or `-1` to indicate failure.
 774
 775         static int symset_find(struct symset *ss, unsigned short key)
 776         {
 777                 int lo = 0;
 778                 int hi = ss->cnt;
 779
 780                 if (hi == 0)
 781                         return -1;
 782                 while (lo + 1 < hi) {
 783                         int mid = (lo + hi) / 2;
 784                         if (ss->syms[mid] <= key)
 785                                 lo = mid;
 786                         else
 787                                 hi = mid;
 788                 }
 789                 if (ss->syms[lo] == key)
 790                         return lo;
 791                 return -1;
 792         }
 793
 794 We will often want to form the union of two symsets.  When we do, we
 795 will often want to know if anything changed (as that might mean there is
 796 more work to do).  So `symset_union` reports whether anything was added
 797 to the first set.  We use a slow quadratic approach as these sets are
 798 rarely large.  If profiling shows this to be a problem it can be
 799 optimised later.
 800
 801         static int symset_union(struct symset *a, struct symset *b)
 802         {
 803                 int i;
 804                 int added = 0;
 805                 for (i = 0; i < b->cnt; i++)
 806                         if (symset_find(a, b->syms[i]) < 0) {
 807                                 unsigned short data = 0;
 808                                 if (b->data != NO_DATA)
 809                                         data = b->data[i];
 810                                 symset_add(a, b->syms[i], data);
 811                                 added++;
 812                         }
 813                 return added;
 814         }
 815
 816 And of course we must be able to free a symset.
 817
 818         static void symset_free(struct symset ss)
 819         {
 820                 free(ss.syms);
 821                 if (ss.data != NO_DATA)
 822                         free(ss.data);
 823         }
 824
 825 ### Symset Storage
 826
 827 Some symsets are simply stored somewhere appropriate in the `grammar`
 828 data structure, others needs a bit of indirection.  This applies
 829 particularly to "LA" sets.  These will be paired with "items" in an
 830 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 831 to be stored in the `data` field, so we need a mapping from a small
 832 (short) number to an LA `symset`.
 833
 834 As mentioned earlier this involves creating a list of unique symsets.
 835
 836 For now, we just use a linear list sorted by insertion.  A skiplist
 837 would be more efficient and may be added later.
 838
 839 ###### declarations
 840
 841         struct setlist {
 842                 struct symset ss;
 843                 int num;
 844                 struct setlist *next;
 845         };
 846
 847 ###### grammar fields
 848         struct setlist *sets;
 849         int nextset;
 850
 851 ###### functions
 852
 853         static int ss_cmp(struct symset a, struct symset b)
 854         {
 855                 int i;
 856                 int diff = a.cnt - b.cnt;
 857                 if (diff)
 858                         return diff;
 859                 for (i = 0; i < a.cnt; i++) {
 860                         diff = (int)a.syms[i] - (int)b.syms[i];
 861                         if (diff)
 862                                 return diff;
 863                 }
 864                 return 0;
 865         }
 866
 867         static int save_set(struct grammar *g, struct symset ss)
 868         {
 869                 struct setlist **sl = &g->sets;
 870                 int cmp = 1;
 871                 struct setlist *s;
 872
 873                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 874                         sl = & (*sl)->next;
 875                 if (cmp == 0) {
 876                         symset_free(ss);
 877                         return (*sl)->num;
 878                 }
 879
 880                 s = malloc(sizeof(*s));
 881                 s->ss = ss;
 882                 s->num = g->nextset;
 883                 g->nextset += 1;
 884                 s->next = *sl;
 885                 *sl = s;
 886                 return s->num;
 887         }
 888
 889 Finding a set by number is currently performed by a simple linear
 890 search.  If this turns out to hurt performance, we can store the sets in
 891 a dynamic array like the productions.
 892
 893         static struct symset set_find(struct grammar *g, int num)
 894         {
 895                 struct setlist *sl = g->sets;
 896                 while (sl && sl->num != num)
 897                         sl = sl->next;
 898                 return sl->ss;
 899         }
 900
 901 ### Setting `nullable`
 902
 903 We set `nullable` on the head symbol for any production for which all
 904 the body symbols (if any) are nullable.  As this is a recursive
 905 definition, any change in the `nullable` setting means that we need to
 906 re-evaluate where it needs to be set.
 907
 908 We simply loop around performing the same calculations until no more
 909 changes happen.
 910
 911 ###### symbol fields
 912         int nullable;
 913
 914 ###### functions
 915         static void set_nullable(struct grammar *g)
 916         {
 917                 int check_again = 1;
 918                 while (check_again) {
 919                         int p;
 920                         check_again = 0;
 921                         for (p = 0; p < g->production_count; p++) {
 922                                 struct production *pr = g->productions[p];
 923                                 int s;
 924
 925                                 if (pr->head->nullable)
 926                                         continue;
 927                                 for (s = 0; s < pr->body_size; s++)
 928                                         if (! pr->body[s]->nullable)
 929                                                 break;
 930                                 if (s == pr->body_size) {
 931                                         pr->head->nullable = 1;
 932                                         check_again = 1;
 933                                 }
 934                         }
 935                 }
 936         }
 937
 938 ### Building the `first` sets
 939
 940 When calculating what can follow a particular non-terminal, we will need
 941 to know what the "first" terminal in any subsequent non-terminal might
 942 be.  So we calculate the `first` set for every non-terminal and store
 943 them in an array.  We don't bother recording the "first" set for
 944 terminals as they are trivial.
 945
 946 As the "first" for one symbol might depend on the "first" of another, we
 947 repeat the calculations until no changes happen, just like with
 948 `set_nullable`.  This makes use of the fact that `symset_union` reports
 949 if any change happens.
 950
 951 The core of this, which finds the "first" of part of a production body,
 952 will be reused for computing the "follow" sets, so we split that out
 953 into a separate function, `add_first`, which also reports if it got all
 954 the way to the end of the production without finding a non-nullable
 955 symbol.
 956
 957 ###### grammar fields
 958         struct symset *first;
 959
 960 ###### functions
 961
 962         static int add_first(struct production *pr, int start,
 963                              struct symset *target, struct grammar *g,
 964                              int *to_end)
 965         {
 966                 int s;
 967                 int changed = 0;
 968                 for (s = start; s < pr->body_size; s++) {
 969                         struct symbol *bs = pr->body[s];
 970                         if (bs->type == Terminal) {
 971                                 if (symset_find(target, bs->num) < 0) {
 972                                         symset_add(target, bs->num, 0);
 973                                         changed = 1;
 974                                 }
 975                                 break;
 976                         } else if (symset_union(target, &g->first[bs->num]))
 977                                 changed = 1;
 978                         if (!bs->nullable)
 979                                 break;
 980                 }
 981                 if (to_end)
 982                         *to_end = (s == pr->body_size);
 983                 return changed;
 984         }
 985
 986         static void build_first(struct grammar *g)
 987         {
 988                 int check_again = 1;
 989                 int s;
 990                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 991                 for (s = 0; s < g->num_syms; s++)
 992                         g->first[s] = INIT_SYMSET;
 993
 994                 while (check_again) {
 995                         int p;
 996                         check_again = 0;
 997                         for (p = 0; p < g->production_count; p++) {
 998                                 struct production *pr = g->productions[p];
 999                                 struct symset *head = &g->first[pr->head->num];
1000
1001                                 if (add_first(pr, 0, head, g, NULL))
1002                                         check_again = 1;
1003                         }
1004                 }
1005         }
1006
1007 ### Building the `follow` sets.
1008
1009 There are two different situations when we will want to generate
1010 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1011 single "follow" set for each non-terminal in the grammar.  That is what
1012 is happening here.  If we are doing an LALR or LR analysis we will want
1013 to generate a separate "LA" set for each item.  We do that later in
1014 state generation.
1015
1016 There are two parts to generating a "follow" set.  Firstly we look at
1017 every place that any non-terminal appears in the body of any production,
1018 and we find the set of possible "first" symbols after there.  This is
1019 done using `add_first` above and only needs to be done once as the
1020 "first" sets are now stable and will not change.
1021
1022 ###### grammar fields
1023         struct symset *follow;
1024
1025 ###### follow code
1026
1027         for (p = 0; p < g->production_count; p++) {
1028                 struct production *pr = g->productions[p];
1029                 int b;
1030
1031                 for (b = 0; b < pr->body_size - 1; b++) {
1032                         struct symbol *bs = pr->body[b];
1033                         if (bs->type == Terminal)
1034                                 continue;
1035                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1036                 }
1037         }
1038
1039 The second part is to add the "follow" set of the head of a production
1040 to the "follow" sets of the final symbol in the production, and any
1041 other symbol which is followed only by `nullable` symbols.  As this
1042 depends on "follow" itself we need to repeatedly perform the process
1043 until no further changes happen.
1044
1045 Rather than 2 nested loops, one that repeats the whole process until
1046 there is no change, and one that iterates of the set of productions, we
1047 combine these two functions into a single loop.
1048
1049 ###### follow code
1050
1051         for (check_again = 0, p = 0;
1052              p < g->production_count;
1053              p < g->production_count-1
1054                 ? p++ : check_again ? (check_again = 0, p = 0)
1055                               : p++) {
1056                 struct production *pr = g->productions[p];
1057                 int b;
1058
1059                 for (b = pr->body_size - 1; b >= 0; b--) {
1060                         struct symbol *bs = pr->body[b];
1061                         if (bs->type == Terminal)
1062                                 break;
1063                         if (symset_union(&g->follow[bs->num],
1064                                          &g->follow[pr->head->num]))
1065                                 check_again = 1;
1066                         if (!bs->nullable)
1067                                 break;
1068                 }
1069         }
1070
1071 We now just need to create and initialise the `follow` list to get a
1072 complete function.
1073
1074 ###### functions
1075         static void build_follow(struct grammar *g)
1076         {
1077                 int s, check_again, p;
1078                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1079                 for (s = 0; s < g->num_syms; s++)
1080                         g->follow[s] = INIT_SYMSET;
1081                 ## follow code
1082         }
1083
1084 ### Building itemsets and states
1085
1086 There are three different levels of detail that can be chosen for
1087 building the itemsets and states for the LR grammar.  They are:
1088
1089 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1090    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1091    already computed,
1092 2. LALR(1) where we build look-ahead sets with each item and merge
1093    the LA sets when we find two paths to the same "kernel" set of items.
1094 3. LR(1) where different look-ahead for any item in the set means
1095    a different item set must be created.
1096
1097 ###### forward declarations
1098         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1099
1100 We need to be able to look through existing states to see if a newly
1101 generated state already exists.  For now we use a simple sorted linked
1102 list.
1103
1104 An item is a pair of numbers: the production number and the position of
1105 "DOT", which is an index into the body of the production.  As the
1106 numbers are not enormous we can combine them into a single "short" and
1107 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1108 production number (so 2000 productions with maximum of 31 symbols in the
1109 body).
1110
1111 Comparing the itemsets will be a little different to comparing symsets
1112 as we want to do the lookup after generating the "kernel" of an
1113 itemset, so we need to ignore the offset=zero items which are added during
1114 completion.
1115
1116 To facilitate this, we modify the "DOT" number so that "0" sorts to
1117 the end of the list in the symset, and then only compare items before
1118 the first "0".
1119
1120 ###### declarations
1121         static inline unsigned short item_num(int production, int dot)
1122         {
1123                 return production | ((31-dot) << 11);
1124         }
1125         static inline int item_prod(unsigned short item)
1126         {
1127                 return item & 0x7ff;
1128         }
1129         static inline int item_dot(unsigned short item)
1130         {
1131                 return (31-(item >> 11)) & 0x1f;
1132         }
1133
1134 For LR(1) analysis we need to compare not just the itemset in a state
1135 but also the LA sets.  As we assign each unique LA set a number, we
1136 can just compare the symset and the data values together.
1137
1138 ###### functions
1139         static int itemset_cmp(struct symset a, struct symset b,
1140                                enum grammar_type type)
1141         {
1142                 int i;
1143                 int av, bv;
1144
1145                 for (i = 0;
1146                      i < a.cnt && i < b.cnt &&
1147                      item_dot(a.syms[i]) > 0 &&
1148                      item_dot(b.syms[i]) > 0;
1149                      i++) {
1150                         int diff = a.syms[i] - b.syms[i];
1151                         if (diff)
1152                                 return diff;
1153                         if (type == LR1) {
1154                                 diff = a.data[i] - b.data[i];
1155                                 if (diff)
1156                                         return diff;
1157                         }
1158                 }
1159                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1160                         av = -1;
1161                 else
1162                         av = a.syms[i];
1163                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1164                         bv = -1;
1165                 else
1166                         bv = b.syms[i];
1167                 if (av - bv)
1168                         return av - bv;
1169                 if (type < LR1 || av == -1)
1170                         return 0;
1171                 return
1172                         a.data[i] - b.data[i];
1173         }
1174
1175 It will be helpful to know if an itemset has been "completed" or not,
1176 particularly for LALR where itemsets get merged, at which point they
1177 need to be consider for completion again.  So a `completed` flag is
1178 needed.
1179
1180 And now we can build the list of itemsets.  The lookup routine returns
1181 both a success flag and a pointer to where in the list an insert should
1182 happen, so we don't need to search a second time.
1183
1184 ###### declarations
1185         struct itemset {
1186                 struct itemset *next;
1187                 short state;
1188                 struct symset items;
1189                 struct symset go_to;
1190                 enum assoc assoc;
1191                 unsigned short precedence;
1192                 char completed;
1193         };
1194
1195 ###### grammar fields
1196         struct itemset *items;
1197         int states;
1198
1199 ###### functions
1200         static int itemset_find(struct grammar *g, struct itemset ***where,
1201                                 struct symset kernel, enum grammar_type type)
1202         {
1203                 struct itemset **ip;
1204
1205                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1206                         struct itemset *i = *ip;
1207                         int diff;
1208                         diff = itemset_cmp(i->items, kernel, type);
1209                         if (diff < 0)
1210                                 continue;
1211                         if (diff > 0)
1212                                 break;
1213                         /* found */
1214                         *where = ip;
1215                         return 1;
1216                 }
1217                 *where = ip;
1218                 return 0;
1219         }
1220
1221 Adding an itemset may require merging the LA sets if LALR analysis is
1222 happening.  If any new LA set adds any symbols that weren't in the old
1223 LA set, we clear the `completed` flag so that the dependants of this
1224 itemset will be recalculated and their LA sets updated.
1225
1226 `add_itemset` must consume the symsets it is passed, either by adding
1227 them to a data structure, of freeing them.
1228
1229         static int add_itemset(struct grammar *g, struct symset ss,
1230                                enum assoc assoc, unsigned short precedence,
1231                                enum grammar_type type)
1232         {
1233                 struct itemset **where, *is;
1234                 int i;
1235                 int found = itemset_find(g, &where, ss, type);
1236                 if (!found) {
1237                         is = calloc(1, sizeof(*is));
1238                         is->state = g->states;
1239                         g->states += 1;
1240                         is->items = ss;
1241                         is->assoc = assoc;
1242                         is->precedence = precedence;
1243                         is->next = *where;
1244                         is->go_to = INIT_DATASET;
1245                         *where = is;
1246                         return is->state;
1247                 }
1248                 is = *where;
1249                 if (type != LALR) {
1250                         symset_free(ss);
1251                         return is->state;
1252                 }
1253                 for (i = 0; i < ss.cnt; i++) {
1254                         struct symset temp = INIT_SYMSET, s;
1255                         if (ss.data[i] == is->items.data[i])
1256                                 continue;
1257                         s = set_find(g, is->items.data[i]);
1258                         symset_union(&temp, &s);
1259                         s = set_find(g, ss.data[i]);
1260                         if (symset_union(&temp, &s)) {
1261                                 is->items.data[i] = save_set(g, temp);
1262                                 is->completed = 0;
1263                         } else
1264                                 symset_free(temp);
1265                 }
1266                 symset_free(ss);
1267                 return is->state;
1268         }
1269
1270 #### The build
1271
1272 To build all the itemsets, we first insert the initial itemset made from
1273 production zero, complete each itemset, and then generate new itemsets
1274 from old until no new ones can be made.
1275
1276 Completing an itemset means finding all the items where "DOT" is
1277 followed by a nonterminal and adding "DOT=0" items for every production
1278 from that non-terminal - providing each item hasn't already been added.
1279 When we add such an item it might get inserted before the current
1280 one, so to make sure we process it we reset the loop counter to the
1281 start.
1282
1283 If LA sets are needed, the LA set for each new item is found using
1284 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1285 may be supplemented by the LA set for the item which produced the new
1286 item.
1287
1288 We also collect a set of all symbols which follow "DOT" (in `done`) as
1289 this is used in the next stage.
1290
1291 When itemsets are created we assign a precedence to the itemset from the
1292 complete item, if there is one.  We ignore the possibility of there
1293 being two and don't (currently) handle precedence in such grammars.
1294 When completing a grammar we ignore any item where DOT is followed by a
1295 terminal with a precedence lower than that for the itemset.  Unless the
1296 terminal has right associativity, we also ignore items where the
1297 terminal has the same precedence.  The result is that unwanted items are
1298 still in the itemset, but the terminal doesn't get into the go to set,
1299 so the item is ineffective.
1300
1301 ###### complete itemset
1302         for (i = 0; i < is->items.cnt; i++) {
1303                 int p = item_prod(is->items.syms[i]);
1304                 int bs = item_dot(is->items.syms[i]);
1305                 struct production *pr = g->productions[p];
1306                 int p2;
1307                 struct symbol *s;
1308                 struct symset LA = INIT_SYMSET;
1309                 unsigned short sn = 0;
1310
1311                 if (bs == pr->body_size)
1312                         continue;
1313                 s = pr->body[bs];
1314                 if (s->precedence && is->precedence &&
1315                     is->precedence > s->precedence)
1316                         /* This terminal has a low precedence and
1317                          * shouldn't be shifted
1318                          */
1319                         continue;
1320                 if (s->precedence && is->precedence &&
1321                     is->precedence == s->precedence && s->assoc != Right)
1322                         /* This terminal has a matching precedence and is
1323                          * not Right-associative, so we mustn't shift it.
1324                          */
1325                         continue;
1326                 if (symset_find(&done, s->num) < 0)
1327                         symset_add(&done, s->num, 0);
1328
1329                 if (s->type != Nonterminal)
1330                         continue;
1331                 check_again = 1;
1332                 if (type >= LALR) {
1333                         // Need the LA set.
1334                         int to_end;
1335                         add_first(pr, bs+1, &LA, g, &to_end);
1336                         if (to_end) {
1337                                 struct symset ss = set_find(g, is->items.data[i]);
1338                                 symset_union(&LA, &ss);
1339                         }
1340                         sn = save_set(g, LA);
1341                         LA = set_find(g, sn);
1342                 }
1343
1344                 /* Add productions for this symbol */
1345                 for (p2 = s->first_production;
1346                      p2 < g->production_count &&
1347                       g->productions[p2]->head == s;
1348                      p2++) {
1349                         int itm = item_num(p2, 0);
1350                         int pos = symset_find(&is->items, itm);
1351                         if (pos < 0) {
1352                                 symset_add(&is->items, itm, sn);
1353                                 /* Will have re-ordered, so start
1354                                  * from beginning again */
1355                                 i = -1;
1356                         } else if (type >= LALR) {
1357                                 struct symset ss = set_find(g, is->items.data[pos]);
1358                                 struct symset tmp = INIT_SYMSET;
1359                                 struct symset *la = &LA;
1360
1361                                 symset_union(&tmp, &ss);
1362                                 if (symset_union(&tmp, la)) {
1363                                         is->items.data[pos] = save_set(g, tmp);
1364                                         i = -1;
1365                                 } else
1366                                         symset_free(tmp);
1367                         }
1368                 }
1369         }
1370
1371 For each symbol we found (and placed in `done`) we collect all the items
1372 for which this symbol is next, and create a new itemset with "DOT"
1373 advanced over the symbol.  This is then added to the collection of
1374 itemsets (or merged with a pre-existing itemset).
1375
1376 ###### derive itemsets
1377         // Now we have a completed itemset, so we need to
1378         // compute all the 'next' itemsets and create them
1379         // if they don't exist.
1380         for (i = 0; i < done.cnt; i++) {
1381                 int j;
1382                 unsigned short state;
1383                 struct symbol *sym = g->symtab[done.syms[i]];
1384                 enum assoc assoc = Non;
1385                 unsigned short precedence = 0;
1386                 struct symset newitemset = INIT_SYMSET;
1387                 if (type >= LALR)
1388                         newitemset = INIT_DATASET;
1389
1390                 for (j = 0; j < is->items.cnt; j++) {
1391                         int itm = is->items.syms[j];
1392                         int p = item_prod(itm);
1393                         int bp = item_dot(itm);
1394                         struct production *pr = g->productions[p];
1395                         unsigned short la = 0;
1396                         int pos;
1397
1398                         if (bp == pr->body_size)
1399                                 continue;
1400                         if (pr->body[bp] != sym)
1401                                 continue;
1402
1403                         bp += 1;
1404                         if (type >= LALR)
1405                                 la = is->items.data[j];
1406                         if (bp == pr->body_size &&
1407                             pr->precedence > 0 &&
1408                             pr->precedence > precedence) {
1409                                 // new itemset is reducible and has a precedence.
1410                                 precedence = pr->precedence;
1411                                 assoc = pr->assoc;
1412                         }
1413                         pos = symset_find(&newitemset, item_num(p, bp));
1414
1415                         if (pos < 0)
1416                                 symset_add(&newitemset, item_num(p, bp), la);
1417                         else if (type >= LALR) {
1418                                 // Need to merge la set.
1419                                 int la2 = newitemset.data[pos];
1420                                 if (la != la2) {
1421                                         struct symset ss = set_find(g, la2);
1422                                         struct symset LA = INIT_SYMSET;
1423                                         symset_union(&LA, &ss);
1424                                         ss = set_find(g, la);
1425                                         if (symset_union(&LA, &ss))
1426                                                 newitemset.data[pos] = save_set(g, LA);
1427                                         else
1428                                                 symset_free(LA);
1429                                 }
1430                         }
1431                 }
1432                 state = add_itemset(g, newitemset, assoc, precedence, type);
1433                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1434                         symset_add(&is->go_to, done.syms[i], state);
1435         }
1436
1437 All that is left is to create the initial itemset from production zero, and
1438 with `TK_eof` as the LA set.
1439
1440 ###### functions
1441         static void build_itemsets(struct grammar *g, enum grammar_type type)
1442         {
1443                 struct symset first = INIT_SYMSET;
1444                 struct itemset *is;
1445                 int check_again;
1446                 unsigned short la = 0;
1447                 if (type >= LALR) {
1448                         // LA set just has eof
1449                         struct symset eof = INIT_SYMSET;
1450                         symset_add(&eof, TK_eof, 0);
1451                         la = save_set(g, eof);
1452                         first = INIT_DATASET;
1453                 }
1454                 // production 0, offset 0 (with no data)
1455                 symset_add(&first, item_num(0, 0), la);
1456                 add_itemset(g, first, Non, 0, type);
1457                 for (check_again = 0, is = g->items;
1458                      is;
1459                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1460                         int i;
1461                         struct symset done = INIT_SYMSET;
1462                         if (is->completed)
1463                                 continue;
1464                         is->completed = 1;
1465                         check_again = 1;
1466                         ## complete itemset
1467                         ## derive itemsets
1468                         symset_free(done);
1469                 }
1470         }
1471
1472 ### Completing the analysis.
1473
1474 The exact process of analysis depends on which level was requested.  For
1475 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1476 `LR1` we need first, but not follow.  For `SLR` we need both.
1477
1478 We don't build the "action" tables that you might expect as the parser
1479 doesn't use them.  They will be calculated to some extent if needed for
1480 a report.
1481
1482 Once we have built everything we allocate arrays for the two lists:
1483 symbols and itemsets.  This allows more efficient access during
1484 reporting.  The symbols are grouped as terminals and then non-terminals,
1485 and we record the changeover point in `first_nonterm`.
1486
1487 ###### grammar fields
1488         struct symbol **symtab;
1489         struct itemset **statetab;
1490         int first_nonterm;
1491
1492 ###### grammar_analyse
1493
1494         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1495         {
1496                 struct symbol *s;
1497                 struct itemset *is;
1498                 int snum = TK_reserved;
1499                 for (s = g->syms; s; s = s->next)
1500                         if (s->num < 0 && s->type == Terminal) {
1501                                 s->num = snum;
1502                                 snum++;
1503                         }
1504                 g->first_nonterm = snum;
1505                 for (s = g->syms; s; s = s->next)
1506                         if (s->num < 0 && s->type != Virtual) {
1507                                 s->num = snum;
1508                                 snum++;
1509                         }
1510                 for (s = g->syms; s; s = s->next)
1511                         if (s->num < 0) {
1512                                 s->num = snum;
1513                                 snum++;
1514                         }
1515                 g->num_syms = snum;
1516                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1517                 for (s = g->syms; s; s = s->next)
1518                         g->symtab[s->num] = s;
1519
1520                 set_nullable(g);
1521                 if (type >= SLR)
1522                         build_first(g);
1523
1524                 if (type == SLR)
1525                         build_follow(g);
1526
1527                 build_itemsets(g, type);
1528
1529                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1530                 for (is = g->items; is ; is = is->next)
1531                         g->statetab[is->state] = is;
1532         }
1533
1534 ## Reporting on the Grammar
1535
1536 The purpose of the report is to give the grammar developer insight into
1537 how the grammar parser will work.  It is basically a structured dump of
1538 all the tables that have been generated, plus a description of any conflicts.
1539
1540 ###### grammar_report
1541         static int grammar_report(struct grammar *g, enum grammar_type type)
1542         {
1543                 report_symbols(g);
1544                 if (g->follow)
1545                         report_follow(g);
1546                 report_itemsets(g);
1547                 return report_conflicts(g, type);
1548         }
1549
1550 Firstly we have the complete list of symbols, together with the
1551 "FIRST" set if that was generated.  We add a mark to each symbol to
1552 show if it is nullable (`.`).
1553
1554 ###### functions
1555
1556         static void report_symbols(struct grammar *g)
1557         {
1558                 int n;
1559                 if (g->first)
1560                         printf("SYMBOLS + FIRST:\n");
1561                 else
1562                         printf("SYMBOLS:\n");
1563
1564                 for (n = 0; n < g->num_syms; n++) {
1565                         struct symbol *s = g->symtab[n];
1566                         if (!s)
1567                                 continue;
1568
1569                         printf(" %c%3d%c: ",
1570                                s->nullable ? '.':' ',
1571                                s->num, symtypes[s->type]);
1572                         prtxt(s->name);
1573                         if (s->precedence)
1574                                 printf(" (%d%s)", s->precedence,
1575                                        assoc_names[s->assoc]);
1576
1577                         if (g->first && s->type == Nonterminal) {
1578                                 int j;
1579                                 char c = ':';
1580                                 for (j = 0; j < g->first[n].cnt; j++) {
1581                                         printf("%c ", c);
1582                                         c = ',';
1583                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1584                                 }
1585                         }
1586                         printf("\n");
1587                 }
1588         }
1589
1590 Then we have the follow sets if they were computed.
1591
1592         static void report_follow(struct grammar *g)
1593         {
1594                 int n;
1595                 printf("FOLLOW:\n");
1596                 for (n = 0; n < g->num_syms; n++)
1597                         if (g->follow[n].cnt) {
1598                                 int j;
1599                                 char c = ':';
1600                                 printf("  ");
1601                                 prtxt(g->symtab[n]->name);
1602                                 for (j = 0; j < g->follow[n].cnt; j++) {
1603                                         printf("%c ", c);
1604                                         c = ',';
1605                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1606                                 }
1607                                 printf("\n");
1608                         }
1609         }
1610
1611 And finally the item sets.  These include the GO TO tables and, for
1612 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1613 it up a bit.  First the items, with production number and associativity.
1614
1615         static void report_item(struct grammar *g, int itm)
1616         {
1617                 int p = item_prod(itm);
1618                 int dot = item_dot(itm);
1619                 struct production *pr = g->productions[p];
1620                 int i;
1621
1622                 printf("    ");
1623                 prtxt(pr->head->name);
1624                 printf(" ->");
1625                 for (i = 0; i < pr->body_size; i++) {
1626                         printf(" %s", (dot == i ? ". ": ""));
1627                         prtxt(pr->body[i]->name);
1628                 }
1629                 if (dot == pr->body_size)
1630                         printf(" .");
1631                 printf(" [%d]", p);
1632                 if (pr->precedence && dot == pr->body_size)
1633                         printf(" (%d%s)", pr->precedence,
1634                                assoc_names[pr->assoc]);
1635                 if (dot < pr->body_size &&
1636                     pr->body[dot]->precedence) {
1637                         struct symbol *s = pr->body[dot];
1638                         printf(" [%d%s]", s->precedence,
1639                                assoc_names[s->assoc]);
1640                 }
1641                 printf("\n");
1642         }
1643
1644 The LA sets which are (possibly) reported with each item:
1645
1646         static void report_la(struct grammar *g, int lanum)
1647         {
1648                 struct symset la = set_find(g, lanum);
1649                 int i;
1650                 char c = ':';
1651
1652                 printf("        LOOK AHEAD(%d)", lanum);
1653                 for (i = 0; i < la.cnt; i++) {
1654                         printf("%c ", c);
1655                         c = ',';
1656                         prtxt(g->symtab[la.syms[i]]->name);
1657                 }
1658                 printf("\n");
1659         }
1660
1661 Then the go to sets:
1662
1663         static void report_goto(struct grammar *g, struct symset gt)
1664         {
1665                 int i;
1666                 printf("    GOTO:\n");
1667
1668                 for (i = 0; i < gt.cnt; i++) {
1669                         printf("      ");
1670                         prtxt(g->symtab[gt.syms[i]]->name);
1671                         printf(" -> %d\n", gt.data[i]);
1672                 }
1673         }
1674
1675 Now we can report all the item sets complete with items, LA sets, and GO TO.
1676
1677         static void report_itemsets(struct grammar *g)
1678         {
1679                 int s;
1680                 printf("ITEM SETS(%d)\n", g->states);
1681                 for (s = 0; s < g->states; s++) {
1682                         int j;
1683                         struct itemset *is = g->statetab[s];
1684                         printf("  Itemset %d:", s);
1685                         if (is->precedence)
1686                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1687                         printf("\n");
1688                         for (j = 0; j < is->items.cnt; j++) {
1689                                 report_item(g, is->items.syms[j]);
1690                                 if (is->items.data != NO_DATA)
1691                                         report_la(g, is->items.data[j]);
1692                         }
1693                         report_goto(g, is->go_to);
1694                 }
1695         }
1696
1697 ### Reporting conflicts
1698
1699 Conflict detection varies a lot among different analysis levels.  However
1700 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1701 LR1 are also similar as they have FOLLOW or LA sets.
1702
1703 ###### functions
1704
1705         ## conflict functions
1706
1707         static int report_conflicts(struct grammar *g, enum grammar_type type)
1708         {
1709                 int cnt = 0;
1710                 printf("Conflicts:\n");
1711                 if (type < SLR)
1712                         cnt = conflicts_lr0(g, type);
1713                 else
1714                         cnt = conflicts_slr(g, type);
1715                 if (cnt == 0)
1716                         printf(" - no conflicts\n");
1717                 return cnt;
1718         }
1719
1720 LR0 conflicts are any state which have both a reducible item and
1721 a shiftable item, or two reducible items.
1722
1723 LR05 conflicts only occur if two possible reductions exist,
1724 as shifts always over-ride reductions.
1725
1726 ###### conflict functions
1727         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1728         {
1729                 int i;
1730                 int cnt = 0;
1731                 for (i = 0; i < g->states; i++) {
1732                         struct itemset *is = g->statetab[i];
1733                         int last_reduce = -1;
1734                         int prev_reduce = -1;
1735                         int last_shift = -1;
1736                         int j;
1737                         if (!is)
1738                                 continue;
1739                         for (j = 0; j < is->items.cnt; j++) {
1740                                 int itm = is->items.syms[j];
1741                                 int p = item_prod(itm);
1742                                 int bp = item_dot(itm);
1743                                 struct production *pr = g->productions[p];
1744
1745                                 if (bp == pr->body_size) {
1746                                         prev_reduce = last_reduce;
1747                                         last_reduce = j;
1748                                         continue;
1749                                 }
1750                                 if (pr->body[bp]->type == Terminal)
1751                                         last_shift = j;
1752                         }
1753                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1754                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1755                                 report_item(g, is->items.syms[last_shift]);
1756                                 report_item(g, is->items.syms[last_reduce]);
1757                                 cnt++;
1758                         }
1759                         if (prev_reduce >= 0) {
1760                                 printf("  State %d has 2 (or more) reducible items\n", i);
1761                                 report_item(g, is->items.syms[prev_reduce]);
1762                                 report_item(g, is->items.syms[last_reduce]);
1763                                 cnt++;
1764                         }
1765                 }
1766                 return cnt;
1767         }
1768
1769 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1770 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1771 in the source of the look ahead set.
1772
1773 We build two datasets to reflect the "action" table: one which maps
1774 terminals to items where that terminal could be shifted and another
1775 which maps terminals to items that could be reduced when the terminal
1776 is in look-ahead.  We report when we get conflicts between the two.
1777
1778 As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
1779 terminal, we ignore it.  NEWLINES are handled specially with its own
1780 rules for when to shift and when to reduce.  Conflicts are expected,
1781 but handled internally.
1782
1783         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1784         {
1785                 int i;
1786                 int cnt = 0;
1787
1788                 for (i = 0; i < g->states; i++) {
1789                         struct itemset *is = g->statetab[i];
1790                         struct symset shifts = INIT_DATASET;
1791                         struct symset reduce = INIT_DATASET;
1792                         int j;
1793                         if (!is)
1794                                 continue;
1795                         /* First collect the shifts */
1796                         for (j = 0; j < is->items.cnt; j++) {
1797                                 unsigned short itm = is->items.syms[j];
1798                                 int p = item_prod(itm);
1799                                 int bp = item_dot(itm);
1800                                 struct production *pr = g->productions[p];
1801                                 struct symbol *s;
1802
1803                                 if (bp >= pr->body_size ||
1804                                     pr->body[bp]->type != Terminal)
1805                                         /* not shiftable */
1806                                         continue;
1807
1808                                 s = pr->body[bp];
1809                                 if (s->precedence && is->precedence)
1810                                         /* Precedence resolves this, so no conflict */
1811                                         continue;
1812
1813                                 if (symset_find(&shifts, s->num) < 0)
1814                                         symset_add(&shifts, s->num, itm);
1815                         }
1816                         /* Now look for reductions and conflicts */
1817                         for (j = 0; j < is->items.cnt; j++) {
1818                                 unsigned short itm = is->items.syms[j];
1819                                 int p = item_prod(itm);
1820                                 int bp = item_dot(itm);
1821                                 struct production *pr = g->productions[p];
1822
1823                                 if (bp < pr->body_size)
1824                                         continue;
1825                                 /* reducible */
1826                                 struct symset la;
1827                                 if (type == SLR)
1828                                         la = g->follow[pr->head->num];
1829                                 else
1830                                         la = set_find(g, is->items.data[j]);
1831                                 int k;
1832                                 for (k = 0; k < la.cnt; k++) {
1833                                         int pos = symset_find(&shifts, la.syms[k]);
1834                                         if (pos >= 0 && la.syms[k] != TK_newline) {
1835                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1836                                                 cnt++;
1837                                                         prtxt(g->symtab[la.syms[k]]->name);
1838                                                 printf(":\n");
1839                                                 report_item(g, shifts.data[pos]);
1840                                                 report_item(g, itm);
1841                                         }
1842                                         pos = symset_find(&reduce, la.syms[k]);
1843                                         if (pos < 0) {
1844                                                 symset_add(&reduce, la.syms[k], itm);
1845                                                 continue;
1846                                         }
1847                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1848                                         prtxt(g->symtab[la.syms[k]]->name);
1849                                         printf(":\n");
1850                                         report_item(g, itm);
1851                                         report_item(g, reduce.data[pos]);
1852                                         cnt++;
1853                                 }
1854                         }
1855                         symset_free(shifts);
1856                         symset_free(reduce);
1857                 }
1858                 return cnt;
1859         }
1860
1861
1862 ## Generating the parser
1863
1864 The exported part of the parser is the `parse_XX` function, where the name
1865 `XX` is based on the name of the parser files.
1866
1867 This takes a `code_node`, a partially initialized `token_config`, and an
1868 optional `FILE` to send tracing to.  The `token_config` gets the list of
1869 known words added and then is used with the `code_node` to initialize the
1870 scanner.
1871
1872 `parse_XX` then calls the library function `parser_run` to actually complete
1873 the parse.  This needs the `states` table and functions to call the various
1874 pieces of code provided in the grammar file, so they are generated first.
1875
1876 ###### parser_generate
1877
1878         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1879                                struct code_node *pre_reduce)
1880         {
1881                 gen_known(f, g);
1882                 gen_non_term(f, g);
1883                 gen_goto(f, g);
1884                 gen_states(f, g);
1885                 gen_reduce(f, g, file, pre_reduce);
1886                 gen_free(f, g);
1887
1888                 fprintf(f, "#line 0 \"gen_parser\"\n");
1889                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1890                         name);
1891                 fprintf(f, "{\n");
1892                 fprintf(f, "\tstruct token_state *tokens;\n");
1893                 fprintf(f, "\tconfig->words_marks = known;\n");
1894                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1895                 fprintf(f, "\ttokens = token_open(code, config);\n");
1896                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1897                 fprintf(f, "\ttoken_close(tokens);\n");
1898                 fprintf(f, "\treturn rv;\n");
1899                 fprintf(f, "}\n\n");
1900         }
1901
1902 ### Known words table
1903
1904 The known words table is simply an array of terminal symbols.
1905 The table of nonterminals used for tracing is a similar array.
1906
1907 ###### functions
1908
1909         static void gen_known(FILE *f, struct grammar *g)
1910         {
1911                 int i;
1912                 fprintf(f, "#line 0 \"gen_known\"\n");
1913                 fprintf(f, "static const char *known[] = {\n");
1914                 for (i = TK_reserved;
1915                      i < g->num_syms && g->symtab[i]->type == Terminal;
1916                      i++)
1917                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1918                                 g->symtab[i]->name.txt);
1919                 fprintf(f, "};\n\n");
1920         }
1921
1922         static void gen_non_term(FILE *f, struct grammar *g)
1923         {
1924                 int i;
1925                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1926                 fprintf(f, "static const char *non_term[] = {\n");
1927                 for (i = TK_reserved;
1928                      i < g->num_syms;
1929                      i++)
1930                         if (g->symtab[i]->type == Nonterminal)
1931                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1932                                         g->symtab[i]->name.txt);
1933                 fprintf(f, "};\n\n");
1934         }
1935
1936 ### States and the goto tables.
1937
1938 For each state we record the goto table and details of the reducible
1939 production if there is one.
1940 Some of the details of the reducible production are stored in the
1941 `do_reduce` function to come later.  Here we store the production
1942 number, the body size (useful for stack management), and the resulting
1943 symbol (useful for knowing how to free data later).
1944
1945 The go to table is stored in a simple array of `sym` and corresponding
1946 `state`.
1947
1948 ###### exported types
1949
1950         struct lookup {
1951                 short sym;
1952                 short state;
1953         };
1954         struct state {
1955                 short go_to_cnt;
1956                 const struct lookup * go_to;
1957                 short reduce_prod;
1958                 short reduce_size;
1959                 short reduce_sym;
1960                 short result_size;
1961         };
1962
1963 ###### functions
1964
1965         static void gen_goto(FILE *f, struct grammar *g)
1966         {
1967                 int i;
1968                 fprintf(f, "#line 0 \"gen_goto\"\n");
1969                 for (i = 0; i < g->states; i++) {
1970                         struct symset gt = g->statetab[i]->go_to;
1971                         int j;
1972
1973                         if (gt.cnt == 0)
1974                                 continue;
1975                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1976                                 i);
1977                         for (j = 0; j < gt.cnt; j++)
1978                                 fprintf(f, "\t{ %d, %d },\n",
1979                                         gt.syms[j], gt.data[j]);
1980                         fprintf(f, "};\n");
1981                 }
1982         }
1983
1984         static void gen_states(FILE *f, struct grammar *g)
1985         {
1986                 int i;
1987                 fprintf(f, "#line 0 \"gen_states\"\n");
1988                 fprintf(f, "static const struct state states[] = {\n");
1989                 for (i = 0; i < g->states; i++) {
1990                         struct itemset *is = g->statetab[i];
1991                         int j, prod = -1, prod_len;
1992
1993                         for (j = 0; j < is->items.cnt; j++) {
1994                                 int itm = is->items.syms[j];
1995                                 int p = item_prod(itm);
1996                                 int bp = item_dot(itm);
1997                                 struct production *pr = g->productions[p];
1998
1999                                 if (bp < pr->body_size)
2000                                         continue;
2001                                 /* This is what we reduce - choose longest */
2002                                 if (prod < 0 || prod_len < pr->body_size) {
2003                                         prod = p;
2004                                         prod_len = pr->body_size;
2005                                 }
2006                         }
2007                         if (is->go_to.cnt)
2008                                 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2009                                         i, is->go_to.cnt, i);
2010                         else
2011                                 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2012                         if (prod >= 0) {
2013                                 struct production *pr = g->productions[prod];
2014                                 struct symbol *hd = pr->head;
2015                                 fprintf(f, "%d, %d, %d, ",
2016                                         prod, pr->body_size, pr->head->num);
2017                                 if (hd->struct_name.txt == NULL)
2018                                         fprintf(f, "0 },\n");
2019                                 else
2020                                         fprintf(f, "sizeof(struct %.*s%s) },\n",
2021                                                 hd->struct_name.len,
2022                                                 hd->struct_name.txt,
2023                                                 hd->isref ? "*" : "");
2024                         } else
2025                                 fprintf(f, "-1, -1, -1, -1 },\n");
2026                 }
2027                 fprintf(f, "};\n\n");
2028         }
2029
2030 ### The `do_reduce` function and the code
2031
2032 When the parser engine decides to reduce a production, it calls
2033 `do_reduce` which runs the code that was included with the production,
2034 if any.
2035
2036 This code needs to be able to store data somewhere.  Rather than
2037 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2038 buffer and have `do_reduce` return the size to be saved.
2039
2040 In order for the code to access "global" context, we pass in the
2041 "config" pointer that was passed to the parser function.  If the `struct
2042 token_config` is embedded in some larger structure, the reducing code
2043 can access the larger structure using pointer manipulation.  Performing
2044 that pointer manipulation, and any other common code or variables for
2045 all reduce actions, can be provided in code node called "reduce" which
2046 is passed around in `pre_reduce` which you might have already noticed.
2047
2048 The code fragments require translation when written out.  Any `$N` needs
2049 to be converted to a reference either to that buffer (if `$0`) or to the
2050 structure returned by a previous reduction.  These pointers need to be
2051 cast to the appropriate type for each access.  All this is handled in
2052 `gen_code`.
2053
2054 `gen_code` also allows symbol references to contain a '`<`' as in
2055 '`$<2`'.  This is particularly useful for references (or pointers), but
2056 can be used with structures too.  The `<` implies that the value is
2057 being moved out, so the object will not be automatically freed.  It is
2058 equivalent to assigning `NULL` to the pointer or filling a structure
2059 with zeros.
2060
2061 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2062 and possibly a number.  A number by itself (other than zero) selects a
2063 symbol from the body of the production.  A sequence of letters selects
2064 the shortest symbol in the body which contains those letters in the
2065 given order.  If a number follows the letters, then a later occurrence
2066 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2067 attached to the second occurrence of the shortest symbol which contains
2068 an `A` followed by a `B`.  If there is no unique shortest system, or if
2069 the number given is too large, then the symbol reference is not
2070 transformed, and will cause an error when the code is compiled.
2071
2072 ###### functions
2073
2074         static int textchr(struct text t, char c, int s)
2075         {
2076                 int i;
2077                 for (i = s; i < t.len; i++)
2078                         if (t.txt[i] == c)
2079                                 return i;
2080                 return -1;
2081         }
2082
2083         static int subseq_match(char *seq, int slen, struct text name)
2084         {
2085                 int st = 0;
2086                 while (slen > 0) {
2087                         st = textchr(name, *seq, st);
2088                         if (st < 0)
2089                                 return 0;
2090                         slen -= 1;
2091                         seq += 1;
2092                         st += 1;
2093                 }
2094                 return 1;
2095         }
2096
2097         static int choose_sym(char **namep, int len, struct production *p)
2098         {
2099                 char *name = *namep;
2100                 char *nam = name;
2101                 int namlen;
2102                 int n = 0;
2103                 int i, s, slen;
2104                 char c;
2105
2106                 c = *name;
2107                 while (len > 0 &&
2108                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2109                         name += 1;
2110                         len -= 1;
2111                         c = *name;
2112                 }
2113                 namlen = name-nam;
2114                 while (len > 0 && (c >= '0' && c <= '9')) {
2115                         name += 1;
2116                         len -= 1;
2117                         n = n * 10 + (c - '0');
2118                         c = *name;
2119                 }
2120                 if (namlen == 0) {
2121                         if (name == *namep || n > p->body_size)
2122                                 return -1;
2123                         *namep = name;
2124                         return n;
2125                 }
2126                 slen = 0; s = -1;
2127                 for (i = 0; i < p->body_size; i++) {
2128                         if (!subseq_match(nam, namlen, p->body[i]->name))
2129                                 continue;
2130                         if (slen == 0 || p->body[i]->name.len < slen) {
2131                                 s = i;
2132                                 slen = p->body[i]->name.len;
2133                         }
2134                         if (s >= 0 && p->body[i] != p->body[s] &&
2135                             p->body[i]->name.len == p->body[s]->name.len)
2136                                 /* not unique, so s cannot be used */
2137                                 s = -1;
2138                 }
2139                 if (s < 0)
2140                         return -1;
2141                 if (n == 0)
2142                         n = 1;
2143                 for (i = 0; i < p->body_size; i++)
2144                         if (p->body[i] == p->body[s]) {
2145                                 n -= 1;
2146                                 if (n == 0)
2147                                         break;
2148                         }
2149                 if (n > 0 || i == p->body_size)
2150                         return -1;
2151                 *namep = name;
2152                 return i + 1;
2153         }
2154
2155         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2156         {
2157                 char *c;
2158                 char *used = calloc(1, p->body_size);
2159                 int i;
2160
2161                 fprintf(f, "\t\t\t");
2162                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2163                         int n;
2164                         int use = 0;
2165                         if (*c != '$') {
2166                                 fputc(*c, f);
2167                                 if (*c == '\n')
2168                                         fputs("\t\t\t", f);
2169                                 continue;
2170                         }
2171                         c++;
2172                         if (*c == '<') {
2173                                 use = 1;
2174                                 c++;
2175                         }
2176                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2177                         if (n < 0) {
2178                                 fputc('$', f);
2179                                 if (use)
2180                                         fputc('<', f);
2181                                 fputc(*c, f);
2182                                 continue;
2183                         }
2184                         if (n == 0)
2185                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2186                                         p->head->struct_name.len,
2187                                         p->head->struct_name.txt,
2188                                         p->head->isref ? "*":"");
2189                         else if (p->body[n-1]->type == Terminal)
2190                                 fprintf(f, "(*(struct token *)body[%d])",
2191                                         n-1);
2192                         else if (p->body[n-1]->struct_name.txt == NULL)
2193                                 fprintf(f, "$%d", n);
2194                         else {
2195                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2196                                         p->body[n-1]->struct_name.len,
2197                                         p->body[n-1]->struct_name.txt,
2198                                         p->body[n-1]->isref ? "*":"", n-1);
2199                                 used[n-1] = use;
2200                         }
2201                         c -= 1;
2202                 }
2203                 fputs("\n", f);
2204                 for (i = 0; i < p->body_size; i++) {
2205                         if (p->body[i]->struct_name.txt &&
2206                             used[i]) {
2207                                 // assume this has been copied out
2208                                 if (p->body[i]->isref)
2209                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2210                                 else
2211                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2212                         }
2213                 }
2214                 free(used);
2215         }
2216
2217 ###### functions
2218
2219         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2220                                struct code_node *pre_reduce)
2221         {
2222                 int i;
2223                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2224                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2225                 fprintf(f, "{\n");
2226                 fprintf(f, "\tint ret_size = 0;\n");
2227                 if (pre_reduce)
2228                         code_node_print(f, pre_reduce, file);
2229
2230                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2231                 fprintf(f, "\tswitch(prod) {\n");
2232                 for (i = 0; i < g->production_count; i++) {
2233                         struct production *p = g->productions[i];
2234                         fprintf(f, "\tcase %d:\n", i);
2235
2236                         if (p->code.txt) {
2237                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2238                                 gen_code(p, f, g);
2239                         }
2240
2241                         if (p->head->struct_name.txt)
2242                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2243                                         p->head->struct_name.len,
2244                                         p->head->struct_name.txt,
2245                                         p->head->isref ? "*":"");
2246
2247                         fprintf(f, "\t\tbreak;\n");
2248                 }
2249                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2250         }
2251
2252 ### `do_free`
2253
2254 As each non-terminal can potentially cause a different type of data
2255 structure to be allocated and filled in, we need to be able to free it when
2256 done.
2257
2258 It is particularly important to have fine control over freeing during error
2259 recovery where individual stack frames might need to be freed.
2260
2261 For this, the grammar author is required to defined a `free_XX` function for
2262 each structure that is used by a non-terminal.  `do_free` will call whichever
2263 is appropriate given a symbol number, and will call `free` (as is
2264 appropriate for tokens) on any terminal symbol.
2265
2266 ###### functions
2267
2268         static void gen_free(FILE *f, struct grammar *g)
2269         {
2270                 int i;
2271
2272                 fprintf(f, "#line 0 \"gen_free\"\n");
2273                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2274                 fprintf(f, "{\n");
2275                 fprintf(f, "\tif (!asn) return;\n");
2276                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2277                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2278                 fprintf(f, "\tswitch(sym) {\n");
2279
2280                 for (i = 0; i < g->num_syms; i++) {
2281                         struct symbol *s = g->symtab[i];
2282                         if (!s ||
2283                             s->type != Nonterminal ||
2284                             s->struct_name.txt == NULL)
2285                                 continue;
2286
2287                         fprintf(f, "\tcase %d:\n", s->num);
2288                         if (s->isref) {
2289                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2290                                         s->struct_name.len,
2291                                         s->struct_name.txt);
2292                                 fprintf(f, "\t\tfree(asn);\n");
2293                         } else
2294                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2295                                         s->struct_name.len,
2296                                         s->struct_name.txt);
2297                         fprintf(f, "\t\tbreak;\n");
2298                 }
2299                 fprintf(f, "\t}\n}\n\n");
2300         }
2301
2302 ## The main routine.
2303
2304 There are three key parts to the "main" function of parsergen: processing
2305 the arguments, loading the grammar file, and dealing with the grammar.
2306
2307 ### Argument processing.
2308
2309 Command line options allow the selection of analysis mode, name of output
2310 file, and whether or not a report should be generated.  By default we create
2311 a report only if no code output was requested.
2312
2313 The `parse_XX` function name uses the basename of the output file which
2314 should not have a suffix (`.c`).  `.c` is added to the given name for the
2315 code, and `.h` is added for the header (if header text is specifed in the
2316 grammar file).
2317
2318 ###### includes
2319         #include <getopt.h>
2320
2321 ###### declarations
2322         static const struct option long_options[] = {
2323                 { "LR0",        0, NULL, '0' },
2324                 { "LR05",       0, NULL, '5' },
2325                 { "SLR",        0, NULL, 'S' },
2326                 { "LALR",       0, NULL, 'L' },
2327                 { "LR1",        0, NULL, '1' },
2328                 { "tag",        1, NULL, 't' },
2329                 { "report",     0, NULL, 'R' },
2330                 { "output",     1, NULL, 'o' },
2331                 { NULL,         0, NULL, 0   }
2332         };
2333         const char *options = "05SL1t:Ro:";
2334
2335 ###### process arguments
2336         int opt;
2337         char *outfile = NULL;
2338         char *infile;
2339         char *name;
2340         char *tag = NULL;
2341         int report = 1;
2342         enum grammar_type type = LR05;
2343         while ((opt = getopt_long(argc, argv, options,
2344                                   long_options, NULL)) != -1) {
2345                 switch(opt) {
2346                 case '0':
2347                         type = LR0; break;
2348                 case '5':
2349                         type = LR05; break;
2350                 case 'S':
2351                         type = SLR; break;
2352                 case 'L':
2353                         type = LALR; break;
2354                 case '1':
2355                         type = LR1; break;
2356                 case 'R':
2357                         report = 2; break;
2358                 case 'o':
2359                         outfile = optarg; break;
2360                 case 't':
2361                         tag = optarg; break;
2362                 default:
2363                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2364                         exit(1);
2365                 }
2366         }
2367         if (optind < argc)
2368                 infile = argv[optind++];
2369         else {
2370                 fprintf(stderr, "No input file given\n");
2371                 exit(1);
2372         }
2373         if (outfile && report == 1)
2374                 report = 0;
2375         name = outfile;
2376         if (name && strchr(name, '/'))
2377                 name = strrchr(name, '/')+1;
2378
2379         if (optind < argc) {
2380                 fprintf(stderr, "Excess command line arguments\n");
2381                 exit(1);
2382         }
2383
2384 ### Loading the grammar file
2385
2386 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2387 map it.
2388
2389 Once we have extracted the code (with `mdcode`) we expect to find three
2390 or four sections: header, code, grammar, reduce.
2391 Anything else that is not excluded by the `--tag` option is an error.
2392
2393 "header", "code", and "reduce" are optional, though it is hard to build
2394 a working parser without the first two.  "grammar" must be provided.
2395
2396 ###### includes
2397         #include <fcntl.h>
2398         #include <sys/mman.h>
2399         #include <errno.h>
2400
2401 ###### functions
2402         static int errs;
2403         static void pr_err(char *msg)
2404         {
2405                 errs++;
2406                 fprintf(stderr, "%s\n", msg);
2407         }
2408
2409 ###### load file
2410         struct section *table;
2411         int fd;
2412         int len;
2413         char *file;
2414         fd = open(infile, O_RDONLY);
2415         if (fd < 0) {
2416                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2417                         infile, strerror(errno));
2418                 exit(1);
2419         }
2420         len = lseek(fd, 0, 2);
2421         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2422         table = code_extract(file, file+len, pr_err);
2423
2424         struct code_node *hdr = NULL;
2425         struct code_node *code = NULL;
2426         struct code_node *gram = NULL;
2427         struct code_node *pre_reduce = NULL;
2428         for (s = table; s; s = s->next) {
2429                 struct text sec = s->section;
2430                 if (tag && !strip_tag(&sec, tag))
2431                         continue;
2432                 if (text_is(sec, "header"))
2433                         hdr = s->code;
2434                 else if (text_is(sec, "code"))
2435                         code = s->code;
2436                 else if (text_is(sec, "grammar"))
2437                         gram = s->code;
2438                 else if (text_is(sec, "reduce"))
2439                         pre_reduce = s->code;
2440                 else {
2441                         fprintf(stderr, "Unknown content section: %.*s\n",
2442                                 s->section.len, s->section.txt);
2443                         rv |= 2;
2444                 }
2445         }
2446
2447 ### Processing the input
2448
2449 As we need to append an extention to a filename and then open it for
2450 writing, and we need to do this twice, it helps to have a separate function.
2451
2452 ###### functions
2453
2454         static FILE *open_ext(char *base, char *ext)
2455         {
2456                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2457                 FILE *f;
2458                 strcat(strcpy(fn, base), ext);
2459                 f = fopen(fn, "w");
2460                 free(fn);
2461                 return f;
2462         }
2463
2464 If we can read the grammar, then we analyse and optionally report on it.  If
2465 the report finds conflicts we will exit with an error status.
2466
2467 ###### process input
2468         struct grammar *g = NULL;
2469         if (gram == NULL) {
2470                 fprintf(stderr, "No grammar section provided\n");
2471                 rv |= 2;
2472         } else {
2473                 g = grammar_read(gram);
2474                 if (!g) {
2475                         fprintf(stderr, "Failure to parse grammar\n");
2476                         rv |= 2;
2477                 }
2478         }
2479         if (g) {
2480                 grammar_analyse(g, type);
2481                 if (report)
2482                         if (grammar_report(g, type))
2483                                 rv |= 1;
2484         }
2485
2486 If a "headers" section is defined, we write it out and include a declaration
2487 for the `parse_XX` function so it can be used from separate code.
2488
2489         if (rv == 0 && hdr && outfile) {
2490                 FILE *f = open_ext(outfile, ".h");
2491                 if (f) {
2492                         code_node_print(f, hdr, infile);
2493                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2494                                 name);
2495                         fclose(f);
2496                 } else {
2497                         fprintf(stderr, "Cannot create %s.h\n",
2498                                 outfile);
2499                         rv |= 4;
2500                 }
2501         }
2502
2503 And if all goes well and an output file was provided, we create the `.c`
2504 file with the code section (if any) and the parser tables and function.
2505
2506         if (rv == 0 && outfile) {
2507                 FILE *f = open_ext(outfile, ".c");
2508                 if (f) {
2509                         if (code)
2510                                 code_node_print(f, code, infile);
2511                         gen_parser(f, g, infile, name, pre_reduce);
2512                         fclose(f);
2513                 } else {
2514                         fprintf(stderr, "Cannot create %s.c\n",
2515                                 outfile);
2516                         rv |= 4;
2517                 }
2518         }
2519
2520 And that about wraps it up.  We need to set the locale so that UTF-8 is
2521 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2522
2523 ###### File: parsergen.mk
2524         parsergen : parsergen.o libscanner.o libmdcode.o
2525                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2526
2527 ###### includes
2528         #include <locale.h>
2529
2530 ###### main
2531
2532         int main(int argc, char *argv[])
2533         {
2534                 struct section *s;
2535                 int rv = 0;
2536
2537                 setlocale(LC_ALL,"");
2538
2539                 ## process arguments
2540                 ## load file
2541                 ## process input
2542
2543                 return rv;
2544         }
2545
2546 ## The SHIFT/REDUCE parser
2547
2548 Having analysed the grammar and generated all the tables, we only need
2549 the shift/reduce engine to bring it all together.
2550
2551 ### Goto table lookup
2552
2553 The parser generator has nicely provided us with goto tables sorted by
2554 symbol number.  We need a binary search function to find a symbol in the
2555 table.
2556
2557 ###### parser functions
2558
2559         static int search(const struct state *l, int sym)
2560         {
2561                 int lo = 0;
2562                 int hi = l->go_to_cnt;
2563
2564                 if (hi == 0)
2565                         return -1;
2566                 while (lo + 1 < hi) {
2567                         int mid = (lo + hi) / 2;
2568                         if (l->go_to[mid].sym <= sym)
2569                                 lo = mid;
2570                         else
2571                                 hi = mid;
2572                 }
2573                 if (l->go_to[lo].sym == sym)
2574                         return l->go_to[lo].state;
2575                 else
2576                         return -1;
2577         }
2578
2579 ### Memory allocation
2580
2581 The `scanner` returns tokens in a local variable - we want them in allocated
2582 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2583 make an allocated copy as required.
2584
2585 ###### parser includes
2586         #include <memory.h>
2587
2588 ###### parser functions
2589
2590         static struct token *tok_copy(struct token tk)
2591         {
2592                 struct token *new = malloc(sizeof(*new));
2593                 *new = tk;
2594                 return new;
2595         }
2596
2597 ### The state stack.
2598
2599 The core data structure for the parser is the stack.  This tracks all
2600 the symbols that have been recognised or partially recognised.
2601
2602 The stack usually won't grow very large - maybe a few tens of entries.
2603 So we dynamically grow an array as required but never bother to
2604 shrink it down again.
2605
2606 We keep the stack as two separate allocations.  One, `asn_stack` stores
2607 the "abstract syntax nodes" which are created by each reduction.  When
2608 we call `do_reduce` we need to pass an array of the `asn`s of the body
2609 of the production, and by keeping a separate `asn` stack, we can just
2610 pass a pointer into this stack.
2611
2612 The other allocation stores all other stack fields of which there are
2613 several.  The `state` is the most important one and guides the parsing
2614 process.  The `sym` is nearly unnecessary as it is implicit in the
2615 state.  However when we want to free entries from the `asn_stack`, it
2616 helps to know what type they are so we can call the right freeing
2617 function.  The symbol leads us to the right free function through
2618 `do_free`.
2619
2620 The `indents` count tracks the line indents with-in the symbol or
2621 immediately follow it.  These are used to allow indent information to
2622 guide parsing and error recovery.
2623
2624 `since_newline` tracks how many stack frames since the last
2625 start-of-line (whether indented or not).  So if `since_newline` is
2626 zero, then this symbol is at the start of a line.  Similarly
2627 `since_indent` counts the number of states since an indent, it is zero
2628 precisely when `indents` is not zero.
2629
2630 `newline_permitted` keeps track of whether newlines should be ignored
2631 or not.
2632
2633 The stack is most properly seen as alternating states and symbols -
2634 states, like the 'DOT' in items, are between symbols.  Each frame in
2635 our stack holds a state and the symbol that was before it.  The
2636 bottom of stack holds the start state but no symbol, as nothing came
2637 before the beginning.  As we need to store some value, `TK_eof` is used
2638 to mark the beginning of the file as well as the end.
2639
2640 ###### parser functions
2641
2642         struct parser {
2643                 struct frame {
2644                         short state;
2645                         short newline_permitted;
2646
2647                         short sym;
2648                         short indents;
2649                         short since_newline;
2650                         short since_indent;
2651                 } *stack;
2652                 void **asn_stack;
2653                 int stack_size;
2654                 int tos;
2655         };
2656
2657 #### Shift and pop
2658
2659 Two operations are needed on the stack - shift (which is like push) and pop.
2660
2661 Shift applies not only to terminals but also to non-terminals.  When we
2662 reduce a production we will pop off frames corresponding to the body
2663 symbols, then push on a frame for the head of the production.  This last
2664 is exactly the same process as shifting in a terminal so we use the same
2665 function for both.  In both cases we provide the symbol, the number of
2666 indents the symbol contains (which will be zero for a terminal symbol)
2667 and a flag indicating the the symbol was at (or was reduced from a
2668 symbol which was at) the start of a line.  The state is deduced from the
2669 current top-of-stack state and the new symbol.
2670
2671 To simplify other code we arrange for `shift` to fail if there is no `goto`
2672 state for the symbol.  This is useful in basic parsing due to our design
2673 that we shift when we can, and reduce when we cannot.  So the `shift`
2674 function reports if it could.
2675
2676 `shift` is also used to push state zero onto the stack, so if the
2677 stack is empty, it always chooses zero as the next state.
2678
2679 So `shift` finds the next state.  If that succeeds it extends the
2680 allocations if needed and pushes all the information onto the stacks.
2681
2682 ###### parser functions
2683
2684         static int shift(struct parser *p,
2685                          short sym, short indents, short start_of_line,
2686                          void *asn,
2687                          const struct state states[])
2688         {
2689                 // Push an entry onto the stack
2690                 struct frame next = {0};
2691                 int newstate = p->tos
2692                         ? search(&states[p->stack[p->tos-1].state],
2693                                  sym)
2694                         : 0;
2695                 if (newstate < 0)
2696                         return 0;
2697                 if (p->tos >= p->stack_size) {
2698                         p->stack_size += 10;
2699                         p->stack = realloc(p->stack, p->stack_size
2700                                            * sizeof(p->stack[0]));
2701                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2702                                            * sizeof(p->asn_stack[0]));
2703                 }
2704                 next.sym = sym;
2705                 next.indents = indents;
2706                 next.state = newstate;
2707
2708                 if (!start_of_line) {
2709                         if (p->tos)
2710                                 next.since_newline = p->stack[p->tos-1].since_newline + 1;
2711                         else
2712                                 next.since_newline = 1;
2713                 }
2714                 if (indents)
2715                         next.since_indent = 0;
2716                 else if (p->tos)
2717                         next.since_indent = p->stack[p->tos-1].since_indent + 1;
2718                 else
2719                         next.since_indent = 1;
2720
2721                 p->stack[p->tos] = next;
2722                 p->asn_stack[p->tos] = asn;
2723                 p->tos++;
2724                 return 1;
2725         }
2726
2727 `pop` primarily moves the top of stack (`tos`) back down the required
2728 amount and frees any `asn` entries that need to be freed.  It also
2729 collects a summary of the indents and line starts in the symbols that
2730 are being removed. It is called _after_ we reduce a production, just
2731 before we `shift` the nonterminal in.
2732
2733 ###### parser functions
2734
2735         static int pop(struct parser *p, int num,
2736                        short *start_of_line,
2737                        void(*do_free)(short sym, void *asn))
2738         {
2739                 int i;
2740                 short indents = 0;
2741                 int sol = 0;
2742
2743                 p->tos -= num;
2744                 for (i = 0; i < num; i++) {
2745                         sol |= !p->stack[p->tos+i].since_newline;
2746                         indents += p->stack[p->tos+i].indents;
2747                         do_free(p->stack[p->tos+i].sym,
2748                                 p->asn_stack[p->tos+i]);
2749                 }
2750                 if (start_of_line)
2751                         *start_of_line = sol;
2752                 return indents;
2753         }
2754
2755 ### The heart of the parser.
2756
2757 Now we have the parser.  For each token we might shift it, trigger a
2758 reduction, or start error handling.  2D tokens (IN, OUT, EOL) also need
2759 to be handled.
2760
2761 We return whatever `asn` was returned by reducing production zero.
2762
2763 If we can neither shift nor reduce we have an error to handle.  We pop
2764 single entries off the stack until we can shift the `TK_error` symbol,
2765 then drop input tokens until we find one we can shift into the new error
2766 state.  We need to ensure that something is dropped or shifted after an
2767 error, or we could get into an infinite loop, only shifting in
2768 `TK_error`, then reducing.  So we track if there has been a shift since
2769 the last error, and if not the next error always discards one token.
2770
2771 When we find `TK_in` and `TK_out` tokens which report indents we need
2772 to handle them directly as the grammar cannot express what we want to
2773 do with them.
2774
2775 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2776 record how many indents there are following the previous token.
2777
2778 `TK_out` tokens must be canceled against an indent count
2779 within the stack.  If we can reduce some symbols that are all since
2780 the most recent indent, then we do that first.  If the minimum prefix
2781 of the current state then extends back before the most recent indent,
2782 that indent can be cancelled.  If the minimum prefix is shorter then
2783 the indent had ended prematurely and we must start error handling, which
2784 is still a work-in-progress.
2785
2786 `TK_newline` tokens are ignored unless the top stack frame records
2787 that they are permitted.  In that case they will not be considered for
2788 shifting if it is possible to reduce some symbols that are all since
2789 the most recent start of line.  This is how a newline forcibly
2790 terminates any line-like structure - we try to reduce down to at most
2791 one symbol for each line where newlines are allowed.
2792 A consequence of this is that a rule like
2793
2794 ###### Example: newlines - broken
2795
2796         Newlines ->
2797                 | NEWLINE Newlines
2798         IfStatement -> Newlines if ....
2799
2800 cannot work, as the NEWLINE will never be shifted as the empty string
2801 will be reduced first.  Optional sets of newlines need to be include
2802 in the thing that preceed:
2803
2804 ###### Example: newlines - works
2805
2806         If -> if
2807                 | NEWLINE If
2808         IfStatement -> If ....
2809
2810 Here the NEWLINE will be shifted because nothing can be reduced until
2811 the `if` is seen.
2812
2813 When during error handling we discard tokens read in, we want to keep
2814 discarding until we see one that is recognised.  If we had a full set
2815 of LR(1) grammar states, this would mean looking in the look-ahead set,
2816 but we don't keep a full look-ahead set.  We only record the subset
2817 that leads to SHIFT.  We can, however, deduce the look-ahead set by
2818 looking at the SHIFT subsets for all states that we can get to by
2819 reducing zero or more times.  So we need a little function which
2820 checks if a given token is in any of these look-ahead sets.
2821
2822 ###### parser includes
2823         #include "parser.h"
2824
2825 ###### parser_run
2826
2827         static int in_lookahead(struct token *tk, const struct state *states, int state)
2828         {
2829                 while (state >= 0) {
2830                         if (search(&states[state], tk->num) >= 0)
2831                                 return 1;
2832                         if (states[state].reduce_prod < 0)
2833                                 return 0;
2834                         state = search(&states[state], states[state].reduce_sym);
2835                 }
2836                 return 0;
2837         }
2838
2839         void *parser_run(struct token_state *tokens,
2840                          const struct state states[],
2841                          int (*do_reduce)(int, void**, struct token_config*, void*),
2842                          void (*do_free)(short, void*),
2843                          FILE *trace, const char *non_term[],
2844                          struct token_config *config)
2845         {
2846                 struct parser p = { 0 };
2847                 struct token *tk = NULL;
2848                 int accepted = 0;
2849                 int shift_since_err = 1;
2850                 void *ret = NULL;
2851
2852                 shift(&p, TK_eof, 0, 1, NULL, states);
2853                 while (!accepted && p.tos > 0) {
2854                         struct token *err_tk;
2855                         struct frame *tos = &p.stack[p.tos-1];
2856                         if (!tk)
2857                                 tk = tok_copy(token_next(tokens));
2858                         parser_trace(trace, &p,
2859                                      tk, states, non_term, config->known_count);
2860
2861                         if (tk->num == TK_in) {
2862                                 tos->indents += 1;
2863                                 tos->since_newline = 0;
2864                                 tos->since_indent = 0;
2865                                 free(tk);
2866                                 tk = NULL;
2867                                 parser_trace_action(trace, "Record");
2868                                 continue;
2869                         }
2870                         if (tk->num == TK_out) {
2871                                 if (states[tos->state].reduce_size >= 0 &&
2872                                     states[tos->state].reduce_size <= tos->since_indent)
2873                                         goto force_reduce;
2874                                 if (1) {
2875                                         // OK to cancel
2876                                         struct frame *in = tos - tos->since_indent;
2877                                         in->indents -= 1;
2878                                         if (in->indents == 0) {
2879                                                 /* Reassess since_indent and newline_permitted */
2880                                                 if (in > p.stack) {
2881                                                         in->since_indent = in[-1].since_indent + 1;
2882                                                         in->newline_permitted = in[-1].newline_permitted;
2883                                                 } else {
2884                                                         in->since_indent = 0;
2885                                                         in->newline_permitted = 0;
2886                                                 }
2887                                                 while (in < tos) {
2888                                                         in += 1;
2889                                                         in->since_indent = in[-1].since_indent + 1;
2890                                                 }
2891                                         }
2892                                         free(tk);
2893                                         tk = NULL;
2894                                         parser_trace_action(trace, "Cancel");
2895                                         continue;
2896                                 }
2897                                 // fall through to error handling as both SHIFT and REDUCE
2898                                 // will fail.
2899                         }
2900                         if (tk->num == TK_newline) {
2901                                 if (!tos->newline_permitted) {
2902                                         free(tk);
2903                                         tk = NULL;
2904                                         parser_trace_action(trace, "Discard");
2905                                         continue;
2906                                 }
2907                                 if (tos->since_newline > 1 &&
2908                                     states[tos->state].reduce_size >= 0 &&
2909                                     states[tos->state].reduce_size <= tos->since_newline)
2910                                         goto force_reduce;
2911                         }
2912                         if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
2913                                 shift_since_err = 1;
2914                                 tk = NULL;
2915                                 parser_trace_action(trace, "Shift");
2916                                 continue;
2917                         }
2918                 force_reduce:
2919                         if (states[tos->state].reduce_prod >= 0 &&
2920                             !(tk->num == TK_newline ||
2921                               tk->num == TK_eof ||
2922                               tk->num == TK_out ||
2923                               (tos->indents == 0 && tos->since_newline == 0))) {
2924                                 /* Anything other than newline or out or eof
2925                                  * in an error unless we are already at start
2926                                  * of line, as this production must end at EOL.
2927                                  */
2928                         } else if (states[tos->state].reduce_prod >= 0) {
2929                                 void **body;
2930                                 void *res;
2931                                 const struct state *nextstate = &states[tos->state];
2932                                 int prod = nextstate->reduce_prod;
2933                                 int size = nextstate->reduce_size;
2934                                 int res_size = nextstate->result_size;
2935                                 short indents, start_of_line;
2936
2937                                 body = p.asn_stack + (p.tos - size);
2938                                 res = res_size ? calloc(1, res_size) : NULL;
2939                                 res_size = do_reduce(prod, body, config, res);
2940                                 if (res_size != nextstate->result_size)
2941                                         abort();
2942
2943                                 indents = pop(&p, size, &start_of_line,
2944                                               do_free);
2945
2946                                 if (!shift(&p, nextstate->reduce_sym,
2947                                            indents, start_of_line,
2948                                            res, states)) {
2949                                         if (prod != 0) abort();
2950                                         accepted = 1;
2951                                         ret = res;
2952                                 }
2953                                 parser_trace_action(trace, "Reduce");
2954                                 continue;
2955                         }
2956                         /* Error. We walk up the stack until we
2957                          * find a state which will accept TK_error.
2958                          * We then shift in TK_error and see what state
2959                          * that takes us too.
2960                          * Then we discard input tokens until
2961                          * we find one that is acceptable.
2962                          */
2963                         parser_trace_action(trace, "ERROR");
2964                         short indents = 0, start_of_line;
2965
2966                         err_tk = tok_copy(*tk);
2967                         while (p.tos > 0 &&
2968                                shift(&p, TK_error, 0, 0,
2969                                      err_tk, states) == 0)
2970                                 // discard this state
2971                                 indents += pop(&p, 1, &start_of_line, do_free);
2972                         if (p.tos == 0) {
2973                                 free(err_tk);
2974                                 // no state accepted TK_error
2975                                 break;
2976                         }
2977                         if (!shift_since_err) {
2978                                 /* must discard at least one token to avoid
2979                                  * infinite loop.
2980                                  */
2981                                 if (tk->num == TK_eof)
2982                                         break;
2983                                 free(tk);
2984                                 tk = tok_copy(token_next(tokens));
2985                         }
2986                         shift_since_err = 0;
2987                         tos = &p.stack[p.tos-1];
2988                         while (!in_lookahead(tk, states, tos->state) &&
2989                                tk->num != TK_eof) {
2990                                 free(tk);
2991                                 tk = tok_copy(token_next(tokens));
2992                                 shift_since_err = 1;
2993                                 if (tk->num == TK_in)
2994                                         indents += 1;
2995                                 if (tk->num == TK_out) {
2996                                         if (indents == 0)
2997                                                 break;
2998                                         indents -= 1;
2999                                         // FIXME update since_indent here
3000                                 }
3001                         }
3002                         tos->indents += indents;
3003                 }
3004                 free(tk);
3005                 pop(&p, p.tos, NULL, do_free);
3006                 free(p.asn_stack);
3007                 free(p.stack);
3008                 return ret;
3009         }
3010
3011 ###### exported functions
3012         void *parser_run(struct token_state *tokens,
3013                          const struct state states[],
3014                          int (*do_reduce)(int, void**, struct token_config*, void*),
3015                          void (*do_free)(short, void*),
3016                          FILE *trace, const char *non_term[],
3017                          struct token_config *config);
3018
3019 ### Tracing
3020
3021 Being able to visualize the parser in action can be invaluable when
3022 debugging the parser code, or trying to understand how the parse of a
3023 particular grammar progresses.  The stack contains all the important
3024 state, so just printing out the stack every time around the parse loop
3025 can make it possible to see exactly what is happening.
3026
3027 This doesn't explicitly show each SHIFT and REDUCE action.  However they
3028 are easily deduced from the change between consecutive lines, and the
3029 details of each state can be found by cross referencing the states list
3030 in the stack with the "report" that parsergen can generate.
3031
3032 For terminal symbols, we just dump the token.  For non-terminals we
3033 print the name of the symbol.  The look ahead token is reported at the
3034 end inside square brackets.
3035
3036 ###### parser functions
3037
3038         static char *reserved_words[] = {
3039                 [TK_error]        = "ERROR",
3040                 [TK_in]           = "IN",
3041                 [TK_out]          = "OUT",
3042                 [TK_newline]      = "NEWLINE",
3043                 [TK_eof]          = "$eof",
3044         };
3045
3046         void parser_trace(FILE *trace, struct parser *p,
3047                           struct token *tk, const struct state states[],
3048                           const char *non_term[], int knowns)
3049         {
3050                 int i;
3051                 if (!trace)
3052                         return;
3053                 for (i = 0; i < p->tos; i++) {
3054                         struct frame *f = &p->stack[i];
3055                         if (i) {
3056                                 int sym = f->sym;
3057                                 if (sym < TK_reserved &&
3058                                     reserved_words[sym] != NULL)
3059                                         fputs(reserved_words[sym], trace);
3060                                 else if (sym < TK_reserved + knowns) {
3061                                         struct token *t = p->asn_stack[i];
3062                                         text_dump(trace, t->txt, 20);
3063                                 } else
3064                                         fputs(non_term[sym - TK_reserved - knowns],
3065                                               trace);
3066                                 if (f->indents)
3067                                         fprintf(trace, ".%d", f->indents);
3068                                 if (f->since_newline == 0)
3069                                         fputs("/", trace);
3070                                 fputs(" ", trace);
3071                         }
3072                         fprintf(trace, "(%d) ", f->state);
3073                 }
3074                 fprintf(trace, "[");
3075                 if (tk->num < TK_reserved &&
3076                     reserved_words[tk->num] != NULL)
3077                         fputs(reserved_words[tk->num], trace);
3078                 else
3079                         text_dump(trace, tk->txt, 20);
3080                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3081         }
3082
3083         void parser_trace_action(FILE *trace, char *action)
3084         {
3085                 if (trace)
3086                         fprintf(trace, " - %s\n", action);
3087         }
3088
3089 # A Worked Example
3090
3091 The obvious example for a parser is a calculator.
3092
3093 As `scanner` provides number parsing function using `libgmp` it is not much
3094 work to perform arbitrary rational number calculations.
3095
3096 This calculator takes one expression, or an equality test, per line.
3097 The results are printed and if any equality test fails, the program
3098 exits with an error.
3099
3100 ###### File: parsergen.mk
3101         calc.c calc.h : parsergen parsergen.mdc
3102                 ./parsergen --tag calc -o calc parsergen.mdc
3103         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3104                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3105         calctest : calc
3106                 ./calc parsergen.mdc
3107         demos :: calctest
3108
3109 # calc: header
3110
3111         #include "parse_number.h"
3112         // what do we use for a demo-grammar?  A calculator of course.
3113         struct number {
3114                 mpq_t val;
3115                 char tail[2];
3116                 int err;
3117         };
3118
3119 # calc: code
3120
3121         #include <stdlib.h>
3122         #include <unistd.h>
3123         #include <fcntl.h>
3124         #include <sys/mman.h>
3125         #include <stdio.h>
3126         #include <malloc.h>
3127         #include <gmp.h>
3128         #include <string.h>
3129         #include "mdcode.h"
3130         #include "scanner.h"
3131         #include "parser.h"
3132
3133         #include "calc.h"
3134
3135         static void free_number(struct number *n)
3136         {
3137                 mpq_clear(n->val);
3138                 free(n);
3139         }
3140
3141         static int text_is(struct text t, char *s)
3142         {
3143                 return (strlen(s) == t.len &&
3144                         strncmp(s, t.txt, t.len) == 0);
3145         }
3146
3147         int main(int argc, char *argv[])
3148         {
3149                 int fd = open(argv[1], O_RDONLY);
3150                 int len = lseek(fd, 0, 2);
3151                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3152                 struct section *table = code_extract(file, file+len, NULL);
3153                 struct section *s;
3154                 struct token_config config = {
3155                         .ignored = (1 << TK_line_comment)
3156                                  | (1 << TK_in)
3157                                  | (1 << TK_out),
3158                         .number_chars = ".,_+-",
3159                         .word_start = "",
3160                         .word_cont = "",
3161                 };
3162                 for (s = table; s; s = s->next)
3163                         if (text_is(s->section, "example: input"))
3164                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3165                 while (table) {
3166                         struct section *t = table->next;
3167                         code_free(table->code);
3168                         free(table);
3169                         table = t;
3170                 }
3171                 exit(0);
3172         }
3173
3174 # calc: grammar
3175
3176         $LEFT + -
3177         $LEFT * / //
3178
3179         Session -> Session Line
3180                 | Line
3181
3182         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3183                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3184                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3185                                         mpf_clear(fl);
3186                                         }
3187                                      }$
3188                 | Expression = Expression NEWLINE ${
3189                         if (mpq_equal($1.val, $3.val))
3190                                 gmp_printf("Both equal %Qd\n", $1.val);
3191                         else {
3192                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3193                                         $1.val, $3.val);
3194                                 exit(1);
3195                         }
3196                 }$
3197                 | NEWLINE ${ printf("Blank line\n"); }$
3198                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3199
3200         $number
3201         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3202                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3203                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3204                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3205                 | Expression // Expression ${ {
3206                         mpz_t z0, z1, z2;
3207                         mpq_init($0.val);
3208                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3209                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3210                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3211                         mpz_tdiv_q(z0, z1, z2);
3212                         mpq_set_z($0.val, z0);
3213                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3214                 } }$
3215                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3216                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3217
3218 # example: input
3219
3220         355/113
3221         3.1415926535 - 355/113
3222         2 + 4 * 5
3223         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3224         10 * 9 / 2
3225         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3226
3227         355//113
3228
3229         error