1 # LR(1) Parser Generator with 2D support #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 "2D support" means that indentation and line breaks can be significant.
8 Indent tokens (IN and OUT) and end-of-line (EOL) tokens can be used to
9 describe the grammar and the parser can selectively ignore these where
12 There are several distinct sections.
14 1. `grammar_read` will read a grammar from a literate-code file and
15 store the productions, symbols, and code fragments.
16 2. `grammar_analyse` will take that grammar and build LR parsing
17 states and look-ahead tables.
18 3. `grammar_report` will present the details of the analysis
19 in a readable format and will report any conflicts.
20 4. `parser_generate` will write out C code files with various
21 tables and with the code fragments arranged in useful places.
22 5. `parser_run` is a library function intended to be linked together
23 with the generated parser tables to complete the implementation of
25 6. Finally `calc` is a test grammar for a simple calculator. The
26 `parsergen` program built from the C code in this file can extract
27 that grammar directly from this file and process it.
29 ###### File: parsergen.c
34 ## forward declarations
45 ###### File: libparser.c
52 ###### File: parsergen.mk
55 parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
58 ## Reading the grammar
60 The grammar must be stored in a markdown literate code file as parsed by
61 "[mdcode][]". It may have up to four top level (i.e. unreferenced)
62 sections: `header`, `code`, `grammar` and `reduce`. The first two, if
63 present, will be literally copied into the generated `.c` and `.h`
64 files. The third is required and contains the grammar which is
65 tokenised with "[scanner][]". `reduce` can provide variable
66 declarations and common initialization code for all the reduction code
67 fragments in the grammar.
69 If the `--tag` option is given, then any top level header that doesn't
70 start with the tag is ignored, and the tag is stripped from the rest. So
71 `--tag Foo` means that the four sections expected are `Foo: header`,
72 `Foo: code`, `Foo: grammar` and `Foo: reduce`. The tag `calc` is used
73 to extract the sample calculator from this file.
76 [scanner]: scanner.html
82 ###### parser includes
86 The grammar contains production sets, precedence/associativity
87 declarations, general terminal declarations, and data type declarations.
88 These are all parsed with _ad hoc_ parsing as we don't have a parser
91 The precedence and associativity can be set for each production, and
92 can be inherited from symbols. The data type (either structure or a
93 reference to a structure) is potentially defined for each non-terminal
94 and describes what C structure is used to store information about each
98 enum assoc {Left, Right, Non};
99 char *assoc_names[] = {"Left","Right","Non"};
102 struct text struct_name;
105 unsigned short precedence;
109 unsigned short precedence;
117 The strings reported by `mdcode` and `scanner` are `struct text` which
118 have length rather than being nul terminated. To help with printing and
119 comparing we define `text_is` and `prtxt`, which should possibly go in
120 `mdcode`. `scanner` does provide `text_dump` which is useful for
121 strings which might contain control characters.
123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
124 because that is what we need to detect tags.
127 static int text_is(struct text t, char *s)
129 return (strlen(s) == t.len &&
130 strncmp(s, t.txt, t.len) == 0);
132 static void prtxt(struct text t)
134 printf("%.*s", t.len, t.txt);
137 static int strip_tag(struct text *t, char *tag)
139 int skip = strlen(tag) + 1;
140 if (skip >= t->len ||
141 strncmp(t->txt, tag, skip-1) != 0 ||
142 t->txt[skip-1] != ':')
144 while (skip < t->len && t->txt[skip] == ' ')
153 Productions are comprised primarily of symbols - terminal and
154 non-terminal. We do not make a syntactic distinction between the two,
155 though non-terminals must be identifiers. Non-terminal symbols are
156 those which appear in the head of a production, terminal symbols are
157 those which don't. There are also "virtual" symbols used for precedence
158 marking discussed later, and sometimes we won't know what type a symbol
161 To help with code safety it is possible to declare the terminal symbols.
162 If this is done, then any symbol used in a production that does not
163 appear in the head of any production and is not declared as a terminal
164 is treated as an error.
166 ###### forward declarations
167 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
168 char *symtypes = "UVTN";
171 ###### grammar fields
172 int terminals_declared;
174 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
175 table of known symbols and the resulting parser will report them as
176 `TK_reserved + N`. A small set of identifiers are reserved for the
177 different token types that `scanner` can report, and an even smaller set
178 are reserved for a special token that the parser can generate (`EOL`) as
179 will be described later. This latter set cannot use predefined numbers,
180 so they are marked as `isspecial` for now and will get assigned a number
181 with the non-terminals later.
185 static struct { int num; char *name; } reserved_words[] = {
186 { TK_error, "ERROR" },
187 { TK_number, "NUMBER" },
188 { TK_ident, "IDENTIFIER" },
190 { TK_string, "STRING" },
191 { TK_multi_string, "MULTI_STRING" },
194 { TK_newline, "NEWLINE" },
201 unsigned int isspecial:1;
203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
204 recognised. The former is automatically expected at the end of the text
205 being parsed. The latter are always ignored by the parser.
207 All of these symbols are stored in a simple symbol table. We use the
208 `struct text` from `mdcode` to store the name and link them together
209 into a sorted list using an insertion sort.
211 We don't have separate `find` and `insert` functions as any symbol we
212 find needs to be remembered. We simply expect `find` to always return a
213 symbol, but its type might be `Unknown`.
222 ###### grammar fields
227 static struct symbol *sym_find(struct grammar *g, struct text s)
229 struct symbol **l = &g->syms;
234 (cmp = text_cmp((*l)->name, s)) < 0)
238 n = calloc(1, sizeof(*n));
247 static void symbols_init(struct grammar *g)
249 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
251 for (i = 0; i < entries; i++) {
254 t.txt = reserved_words[i].name;
255 t.len = strlen(t.txt);
258 s->num = reserved_words[i].num;
263 ### Data types and precedence.
265 Data type specification, precedence specification, and declaration of
266 terminals are all introduced by a dollar sign at the start of the line.
267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
268 precedence, if it is `TERM` then the line declares terminals without
269 precedence, otherwise it specifies a data type.
271 The data type name is simply stored and applied to the head of all
272 subsequent productions. It must be the name of a structure optionally
273 preceded by an asterisk which means a reference or "pointer". So
274 `$expression` maps to `struct expression` and `$*statement` maps to
275 `struct statement *`.
277 Any productions given before the first data type declaration will have
278 no data type associated with them and can carry no information. In
279 order to allow other non-terminals to have no type, the data type
280 `$void` can be given. This does *not* mean that `struct void` will be
281 used, but rather that no type will be associated with subsequent
284 The precedence line must contain a list of symbols, either terminal
285 symbols or virtual symbols. It can only contain symbols that have not
286 been seen yet, so precedence declaration must precede the productions
289 A "virtual" symbol is an identifier preceded by `$$`. Virtual symbols
290 carry precedence information but are not included in the grammar. A
291 production can declare that it inherits the precedence of a given
294 This use for `$$` precludes it from being used as a symbol in the
295 described language. Two other symbols: `${` and `}$` are also
298 Each new precedence line introduces a new precedence level and
299 declares how it associates. This level is stored in each symbol
300 listed and may be inherited by any production which uses the symbol. A
301 production inherits from the last symbol which has a precedence.
303 The symbols on the first precedence line have the lowest precedence.
304 Subsequent lines introduce symbols with higher precedence and so bind
307 Note that a declaration line (or "dollar line") can start with either of
308 two different marks: "$" or "$*". The `dollar_line()` function defined
309 here is told which was found in the `isref` argument.
311 ###### grammar fields
312 struct text current_type;
317 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
318 static const char *known[] = { "$$", "${", "}$" };
321 static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
323 struct token t = token_next(ts);
329 if (t.num != TK_ident) {
330 err = "type or assoc expected after '$'";
333 if (text_is(t.txt, "LEFT"))
335 else if (text_is(t.txt, "RIGHT"))
337 else if (text_is(t.txt, "NON"))
339 else if (text_is(t.txt, "TERM")) {
341 g->terminals_declared = 1;
343 g->current_type = t.txt;
344 g->type_isref = isref;
345 if (text_is(t.txt, "void"))
346 g->current_type.txt = NULL;
348 if (t.num != TK_newline) {
349 err = "Extra tokens after type name";
356 err = "$* cannot be followed by a precedence";
360 // This is a precedence or TERM line, need some symbols.
364 while (t.num != TK_newline) {
365 enum symtype type = Terminal;
367 if (t.num == TK_virtual) {
370 if (t.num != TK_ident) {
371 err = "$$ must be followed by a word";
375 err = "Virtual symbols not permitted on $TERM line";
378 } else if (t.num != TK_ident &&
380 err = "Illegal token in precedence line";
383 s = sym_find(g, t.txt);
384 if (s->type != Unknown) {
385 err = "Symbols in precedence/TERM line must not already be known.";
390 s->precedence = g->prec_levels;
397 err = "No symbols given on precedence/TERM line";
401 while (t.num != TK_newline && t.num != TK_eof)
408 A production either starts with an identifier which is the head
409 non-terminal, or a vertical bar (`|`) in which case this production
410 uses the same head as the previous one. The identifier must be
411 followed by a `->` mark. All productions for a given non-terminal must
412 be grouped together with the `nonterminal ->` given only once.
414 After this a (possibly empty) sequence of identifiers and marks form
415 the body of the production. A virtual symbol may be given after the
416 body by preceding it with `$$`. If a virtual symbol is given, the
417 precedence of the production is that for the virtual symbol. If none
418 is given, the precedence is inherited from the last symbol in the
419 production which has a precedence specified.
421 After the optional precedence may come the `${` mark. This indicates
422 the start of a code fragment. If present, this must be on the same
423 line as the start of the production.
425 All of the text from the `${` through to the matching `}$` is
426 collected and forms the code-fragment for the production. It must all
427 be in one `code_node` of the literate code. The `}$` must be
428 at the end of a line.
430 Text in the code fragment will undergo substitutions where `$N` or
431 `$<N`, for some numeric `N` (or non-numeric indicator as described
432 later), will be replaced with a variable holding the parse information
433 for the particular symbol in the production. `$0` is the head of the
434 production, `$1` is the first symbol of the body, etc. The type of `$N`
435 for a terminal symbol is `struct token`. For a non-terminal, it is
436 whatever has been declared for that symbol. The `<` may be included and
437 means that the value (usually a reference) is being moved out, so it
438 will not automatically be freed. The effect of using '<' is that the
439 variable is cleared to all-zeros.
441 While building productions we will need to add to an array which needs to
445 static void array_add(void *varray, int *cnt, void *new)
447 void ***array = varray;
450 current = ((*cnt-1) | (step-1))+1;
451 if (*cnt == current) {
454 *array = realloc(*array, current * sizeof(void*));
456 (*array)[*cnt] = new;
460 Collecting the code fragment simply involves reading tokens until we
461 hit the end token `}$`, and noting the character position of the start and
465 static struct text collect_code(struct token_state *state,
470 code.txt = start.txt.txt + start.txt.len;
472 t = token_next(state);
473 while (t.node == start.node &&
474 t.num != TK_close && t.num != TK_error &&
476 if (t.num == TK_close && t.node == start.node)
477 code.len = t.txt.txt - code.txt;
483 Now we have all the bits we need to parse a full production.
488 ###### grammar fields
489 struct production **productions;
490 int production_count;
492 ###### production fields
494 struct symbol **body;
500 int first_production;
503 static char *parse_production(struct grammar *g,
505 struct token_state *state)
507 /* Head has already been parsed. */
510 struct production p, *pp;
512 memset(&p, 0, sizeof(p));
514 tk = token_next(state);
515 while (tk.num == TK_ident || tk.num == TK_mark) {
516 struct symbol *bs = sym_find(g, tk.txt);
517 if (bs->type == Unknown) {
518 if (!g->terminals_declared)
521 if (bs->type == Virtual) {
522 err = "Virtual symbol not permitted in production";
525 if (bs->precedence) {
526 p.precedence = bs->precedence;
529 array_add(&p.body, &p.body_size, bs);
530 tk = token_next(state);
532 if (tk.num == TK_virtual) {
534 tk = token_next(state);
535 if (tk.num != TK_ident) {
536 err = "word required after $$";
539 vs = sym_find(g, tk.txt);
540 if (vs->precedence == 0) {
541 err = "symbol after $$ must have precedence";
544 p.precedence = vs->precedence;
547 tk = token_next(state);
549 if (tk.num == TK_open) {
550 p.code_line = tk.line;
551 p.code = collect_code(state, tk);
552 if (p.code.txt == NULL) {
553 err = "code fragment not closed properly";
556 tk = token_next(state);
559 if (tk.num != TK_newline && tk.num != TK_eof) {
560 err = "stray tokens at end of line";
563 pp = malloc(sizeof(*pp));
565 array_add(&g->productions, &g->production_count, pp);
568 while (tk.num != TK_newline && tk.num != TK_eof)
569 tk = token_next(state);
574 With the ability to parse productions and dollar-lines, we have nearly all
575 that we need to parse a grammar from a `code_node`.
577 The head of the first production will effectively be the `start` symbol
578 of the grammar. However it won't _actually_ be so. Processing the
579 grammar is greatly simplified if the real start symbol only has a single
580 production, and expects `$eof` as the final terminal. So when we find
581 the first explicit production we insert an extra implicit production as
582 production zero which looks like
584 ###### Example: production 0
587 where `START` is the first non-terminal given.
589 ###### create production zero
590 struct production *p = calloc(1,sizeof(*p));
591 struct text start = {"$start",6};
592 struct text eof = {"$eof",4};
593 struct text code = {"$0 = $<1;", 9};
594 p->head = sym_find(g, start);
595 p->head->type = Nonterminal;
596 p->head->struct_name = g->current_type;
597 p->head->isref = g->type_isref;
598 if (g->current_type.txt)
600 array_add(&p->body, &p->body_size, head);
601 array_add(&p->body, &p->body_size, sym_find(g, eof));
602 p->head->first_production = g->production_count;
603 array_add(&g->productions, &g->production_count, p);
605 Now we are ready to read in the grammar. We ignore comments
606 and strings so that the marks which introduce them can be
607 used as terminals in the grammar. We don't ignore numbers
608 even though we don't allow them as that causes the scanner
609 to produce errors that the parser is better positioned to handle.
610 We also ignore indentation, but we expect and use newlines.
612 To allow comments in the grammar, we explicitly check for "//" as the
613 first token on the line and those lines are skipped. "//" can still be
614 used as a terminal anywhere that a terminal is expected.
617 static struct grammar *grammar_read(struct code_node *code)
619 struct token_config conf = {
622 .words_marks = known,
623 .known_count = sizeof(known)/sizeof(known[0]),
625 .ignored = (1 << TK_line_comment)
626 | (1 << TK_block_comment)
629 | (1 << TK_multi_string)
634 struct token_state *state = token_open(code, &conf);
636 struct symbol *head = NULL;
640 g = calloc(1, sizeof(*g));
643 for (tk = token_next(state); tk.num != TK_eof;
644 tk = token_next(state)) {
645 if (tk.num == TK_newline)
647 if (tk.num == TK_ident) {
649 head = sym_find(g, tk.txt);
650 if (head->type == Nonterminal)
651 err = "This non-terminal has already be used.";
652 else if (head->type == Virtual)
653 err = "Virtual symbol not permitted in head of production";
655 head->type = Nonterminal;
656 head->struct_name = g->current_type;
657 head->isref = g->type_isref;
658 if (g->production_count == 0) {
659 ## create production zero
661 head->first_production = g->production_count;
662 tk = token_next(state);
663 if (tk.num == TK_mark &&
664 text_is(tk.txt, "->"))
665 err = parse_production(g, head, state);
667 err = "'->' missing in production";
669 } else if (tk.num == TK_mark
670 && text_is(tk.txt, "|")) {
671 // another production for same non-term
673 err = parse_production(g, head, state);
675 err = "First production must have a head";
676 } else if (tk.num == TK_mark
677 && text_is(tk.txt, "$")) {
678 err = dollar_line(state, g, 0);
679 } else if (tk.num == TK_mark
680 && text_is(tk.txt, "$*")) {
681 err = dollar_line(state, g, 1);
682 } else if (tk.num == TK_mark
683 && text_is(tk.txt, "//")) {
684 while (tk.num != TK_newline &&
686 tk = token_next(state);
688 err = "Unrecognised token at start of line.";
694 if (g->terminals_declared) {
697 for (s = g->syms; s; s = s->next) {
698 if (s->type != Unknown)
701 fprintf(stderr, "Token %.*s not declared\n",
702 s->name.len, s->name.txt);
705 free(g); // FIXME free content
711 fprintf(stderr, "Error at line %d: %s\n",
714 free(g); // FIXME free content
718 ## Analysing the grammar
720 The central task in analysing the grammar is to determine a set of
721 states to drive the parsing process. This will require finding various
722 sets of symbols and of "items". Some of these sets will need to attach
723 information to each element in the set, so they are more like maps.
725 Each "item" may have a 'look-ahead' or `LA` set associated with it.
726 Many of these will be the same as each other. To avoid memory wastage,
727 and to simplify some comparisons of sets, these sets will be stored in a
728 list of unique sets, each assigned a number.
730 Once we have the data structures in hand to manage these sets and lists,
731 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
732 then create the item sets which define the various states.
736 Though we don't only store symbols in these sets, they are the main
737 things we store, so they are called symbol sets or "symsets".
739 A symset has a size, an array of shorts, and an optional array of data
740 values, which are also shorts. If the array of data is not present, we
741 store an impossible pointer, as `NULL` is used to indicate that no
742 memory has been allocated yet.
747 unsigned short *syms, *data;
749 #define NO_DATA ((unsigned short *)1)
750 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
751 const struct symset INIT_DATASET = { 0, NULL, NULL };
753 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
754 using an insertion sort. We don't explicitly record the amount of
755 allocated space as it can be derived directly from the current `cnt`
756 using `((cnt - 1) | 7) + 1`.
759 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
762 int current = ((s->cnt-1) | 7) + 1;
763 if (current == s->cnt) {
765 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
766 if (s->data != NO_DATA)
767 s->data = realloc(s->data, sizeof(*s->data) * current);
770 while (i > 0 && s->syms[i-1] > key) {
771 s->syms[i] = s->syms[i-1];
772 if (s->data != NO_DATA)
773 s->data[i] = s->data[i-1];
777 if (s->data != NO_DATA)
782 Finding a symbol (or item) in a `symset` uses a simple binary search.
783 We return the index where the value was found (so data can be accessed),
784 or `-1` to indicate failure.
786 static int symset_find(struct symset *ss, unsigned short key)
793 while (lo + 1 < hi) {
794 int mid = (lo + hi) / 2;
795 if (ss->syms[mid] <= key)
800 if (ss->syms[lo] == key)
805 We will often want to form the union of two symsets. When we do, we
806 will often want to know if anything changed (as that might mean there is
807 more work to do). So `symset_union` reports whether anything was added
808 to the first set. We use a slow quadratic approach as these sets are
809 rarely large. If profiling shows this to be a problem it can be
812 static int symset_union(struct symset *a, struct symset *b)
816 for (i = 0; i < b->cnt; i++)
817 if (symset_find(a, b->syms[i]) < 0) {
818 unsigned short data = 0;
819 if (b->data != NO_DATA)
821 symset_add(a, b->syms[i], data);
827 And of course we must be able to free a symset.
829 static void symset_free(struct symset ss)
832 if (ss.data != NO_DATA)
838 Some symsets are simply stored somewhere appropriate in the `grammar`
839 data structure, others need a bit of indirection. This applies
840 particularly to "LA" sets. These will be paired with "items" in an
841 "itemset". As itemsets will be stored in a symset, the "LA" set needs
842 to be stored in the `data` field, so we need a mapping from a small
843 (short) number to an LA `symset`.
845 As mentioned earlier this involves creating a list of unique symsets.
847 For now, we just use a linear list sorted by insertion. A skiplist
848 would be more efficient and may be added later.
855 struct setlist *next;
858 ###### grammar fields
859 struct setlist *sets;
864 static int ss_cmp(struct symset a, struct symset b)
867 int diff = a.cnt - b.cnt;
870 for (i = 0; i < a.cnt; i++) {
871 diff = (int)a.syms[i] - (int)b.syms[i];
878 static int save_set(struct grammar *g, struct symset ss)
880 struct setlist **sl = &g->sets;
884 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
891 s = malloc(sizeof(*s));
900 Finding a set by number is currently performed by a simple linear
901 search. If this turns out to hurt performance, we can store the sets in
902 a dynamic array like the productions.
904 static struct symset set_find(struct grammar *g, int num)
906 struct setlist *sl = g->sets;
907 while (sl && sl->num != num)
912 ### Setting `nullable`
914 We set `nullable` on the head symbol for any production for which all
915 the body symbols (if any) are nullable. As this is a recursive
916 definition, any change in the `nullable` setting means that we need to
917 re-evaluate where it needs to be set.
919 We simply loop around performing the same calculations until no more
926 static void set_nullable(struct grammar *g)
929 while (check_again) {
932 for (p = 0; p < g->production_count; p++) {
933 struct production *pr = g->productions[p];
936 if (pr->head->nullable)
938 for (s = 0; s < pr->body_size; s++)
939 if (! pr->body[s]->nullable)
941 if (s == pr->body_size) {
942 pr->head->nullable = 1;
949 ### Building the `first` sets
951 When calculating what can follow a particular non-terminal, we will need
952 to know what the "first" terminal in any subsequent non-terminal might
953 be. So we calculate the `first` set for every non-terminal and store
954 them in an array. We don't bother recording the "first" set for
955 terminals as they are trivial.
957 As the "first" for one symbol might depend on the "first" of another, we
958 repeat the calculations until no changes happen, just like with
959 `set_nullable`. This makes use of the fact that `symset_union` reports
960 if any change happens.
962 The core of this, which finds the "first" of part of a production body,
963 will be reused for computing the "follow" sets, so we split that out
964 into a separate function, `add_first`, which also reports if it got all
965 the way to the end of the production without finding a non-nullable
968 ###### grammar fields
969 struct symset *first;
973 static int add_first(struct production *pr, int start,
974 struct symset *target, struct grammar *g,
979 for (s = start; s < pr->body_size; s++) {
980 struct symbol *bs = pr->body[s];
981 if (bs->type == Terminal) {
982 if (symset_find(target, bs->num) < 0) {
983 symset_add(target, bs->num, 0);
987 } else if (symset_union(target, &g->first[bs->num]))
993 *to_end = (s == pr->body_size);
997 static void build_first(struct grammar *g)
1001 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1002 for (s = 0; s < g->num_syms; s++)
1003 g->first[s] = INIT_SYMSET;
1005 while (check_again) {
1008 for (p = 0; p < g->production_count; p++) {
1009 struct production *pr = g->productions[p];
1010 struct symset *head = &g->first[pr->head->num];
1012 if (add_first(pr, 0, head, g, NULL))
1018 ### Building the `follow` sets.
1020 There are two different situations when we will want to generate
1021 "follow" sets. If we are doing an SLR analysis, we want to generate a
1022 single "follow" set for each non-terminal in the grammar. That is what
1023 is happening here. If we are doing an LALR or LR analysis we will want
1024 to generate a separate "LA" set for each item. We do that later in
1027 There are two parts to generating a "follow" set. Firstly we look at
1028 every place that any non-terminal appears in the body of any production,
1029 and we find the set of possible "first" symbols after there. This is
1030 done using `add_first` above and only needs to be done once as the
1031 "first" sets are now stable and will not change.
1033 ###### grammar fields
1034 struct symset *follow;
1038 for (p = 0; p < g->production_count; p++) {
1039 struct production *pr = g->productions[p];
1042 for (b = 0; b < pr->body_size - 1; b++) {
1043 struct symbol *bs = pr->body[b];
1044 if (bs->type == Terminal)
1046 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1050 The second part is to add the "follow" set of the head of a production
1051 to the "follow" sets of the final symbol in the production, and any
1052 other symbol which is followed only by `nullable` symbols. As this
1053 depends on "follow" itself we need to repeatedly perform the process
1054 until no further changes happen.
1056 Rather than 2 nested loops, one that repeats the whole process until
1057 there is no change, and one that iterates over the set of productions, we
1058 combine these two functions into a single loop.
1062 for (check_again = 0, p = 0;
1063 p < g->production_count;
1064 p < g->production_count-1
1065 ? p++ : check_again ? (check_again = 0, p = 0)
1067 struct production *pr = g->productions[p];
1070 for (b = pr->body_size - 1; b >= 0; b--) {
1071 struct symbol *bs = pr->body[b];
1072 if (bs->type == Terminal)
1074 if (symset_union(&g->follow[bs->num],
1075 &g->follow[pr->head->num]))
1082 We now just need to create and initialise the `follow` list to get a
1086 static void build_follow(struct grammar *g)
1088 int s, check_again, p;
1089 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1090 for (s = 0; s < g->num_syms; s++)
1091 g->follow[s] = INIT_SYMSET;
1095 ### Building itemsets and states
1097 There are three different levels of detail that can be chosen for
1098 building the itemsets and states for the LR grammar. They are:
1100 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1101 itemsets - look-ahead, if used at all, is simply the 'follow' sets
1103 2. LALR(1) where we build look-ahead sets with each item and merge
1104 the LA sets when we find two paths to the same "kernel" set of items.
1105 3. LR(1) where different look-ahead for any item in the set means
1106 a different item set must be created.
1108 ###### forward declarations
1109 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1111 We need to be able to look through existing states to see if a newly
1112 generated state already exists. For now we use a simple sorted linked
1115 An item is a pair of numbers: the production number and the position of
1116 "DOT", which is an index into the body of the production. As the
1117 numbers are not enormous we can combine them into a single "short" and
1118 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1119 production number (so 2000 productions with maximum of 31 symbols in the
1122 Comparing the itemsets will be a little different to comparing symsets
1123 as we want to do the lookup after generating the "kernel" of an
1124 itemset, so we need to ignore the offset=zero items which are added during
1127 To facilitate this, we modify the "DOT" number so that "0" sorts to
1128 the end of the list in the symset, and then only compare items before
1132 static inline unsigned short item_num(int production, int dot)
1134 return production | ((31-dot) << 11);
1136 static inline int item_prod(unsigned short item)
1138 return item & 0x7ff;
1140 static inline int item_dot(unsigned short item)
1142 return (31-(item >> 11)) & 0x1f;
1145 For LR(1) analysis we need to compare not just the itemset in a state
1146 but also the LA sets. As we assign each unique LA set a number, we
1147 can just compare the symset and the data values together.
1150 static int itemset_cmp(struct symset a, struct symset b,
1151 enum grammar_type type)
1157 i < a.cnt && i < b.cnt &&
1158 item_dot(a.syms[i]) > 0 &&
1159 item_dot(b.syms[i]) > 0;
1161 int diff = a.syms[i] - b.syms[i];
1165 diff = a.data[i] - b.data[i];
1170 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1174 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1180 if (type < LR1 || av == -1)
1183 a.data[i] - b.data[i];
1186 It will be helpful to know if an itemset has been "completed" or not,
1187 particularly for LALR where itemsets get merged, at which point they
1188 need to be considered for completion again. So a `completed` flag is
1191 And now we can build the list of itemsets. The lookup routine returns
1192 both a success flag and a pointer to where in the list an insert should
1193 happen, so we don't need to search a second time.
1197 struct itemset *next;
1199 struct symset items;
1200 struct symset go_to;
1202 unsigned short precedence;
1206 ###### grammar fields
1207 struct itemset *items;
1211 static int itemset_find(struct grammar *g, struct itemset ***where,
1212 struct symset kernel, enum grammar_type type)
1214 struct itemset **ip;
1216 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1217 struct itemset *i = *ip;
1219 diff = itemset_cmp(i->items, kernel, type);
1232 Adding an itemset may require merging the LA sets if LALR analysis is
1233 happening. If any new LA set adds any symbols that weren't in the old
1234 LA set, we clear the `completed` flag so that the dependants of this
1235 itemset will be recalculated and their LA sets updated.
1237 `add_itemset` must consume the symsets it is passed, either by adding
1238 them to a data structure, or freeing them.
1240 static int add_itemset(struct grammar *g, struct symset ss,
1241 enum assoc assoc, unsigned short precedence,
1242 enum grammar_type type)
1244 struct itemset **where, *is;
1246 int found = itemset_find(g, &where, ss, type);
1248 is = calloc(1, sizeof(*is));
1249 is->state = g->states;
1253 is->precedence = precedence;
1255 is->go_to = INIT_DATASET;
1264 for (i = 0; i < ss.cnt; i++) {
1265 struct symset temp = INIT_SYMSET, s;
1266 if (ss.data[i] == is->items.data[i])
1268 s = set_find(g, is->items.data[i]);
1269 symset_union(&temp, &s);
1270 s = set_find(g, ss.data[i]);
1271 if (symset_union(&temp, &s)) {
1272 is->items.data[i] = save_set(g, temp);
1283 To build all the itemsets, we first insert the initial itemset made from
1284 production zero, complete each itemset, and then generate new itemsets
1285 from old until no new ones can be made.
1287 Completing an itemset means finding all the items where "DOT" is
1288 followed by a nonterminal and adding "DOT=0" items for every production
1289 from that non-terminal - providing each item hasn't already been added.
1290 When we add such an item it might get inserted before the current
1291 one, so to make sure we process it we reset the loop counter to the
1294 If LA sets are needed, the LA set for each new item is found using
1295 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This
1296 may be supplemented by the LA set for the item which produced the new
1299 We also collect a set of all symbols which follow "DOT" (in `done`) as
1300 this is used in the next stage.
1302 When itemsets are created we assign a precedence to the itemset from the
1303 complete item, if there is one. We ignore the possibility of there
1304 being two and don't (currently) handle precedence in such grammars.
1305 When completing a grammar we ignore any item where DOT is followed by a
1306 terminal with a precedence lower than that for the itemset. Unless the
1307 terminal has right associativity, we also ignore items where the
1308 terminal has the same precedence. The result is that unwanted items are
1309 still in the itemset, but the terminal doesn't get into the go to set,
1310 so the item is ineffective.
1312 ###### complete itemset
1313 for (i = 0; i < is->items.cnt; i++) {
1314 int p = item_prod(is->items.syms[i]);
1315 int bs = item_dot(is->items.syms[i]);
1316 struct production *pr = g->productions[p];
1319 struct symset LA = INIT_SYMSET;
1320 unsigned short sn = 0;
1322 if (bs == pr->body_size)
1325 if (s->precedence && is->precedence &&
1326 is->precedence > s->precedence)
1327 /* This terminal has a low precedence and
1328 * shouldn't be shifted
1331 if (s->precedence && is->precedence &&
1332 is->precedence == s->precedence && s->assoc != Right)
1333 /* This terminal has a matching precedence and is
1334 * not Right-associative, so we mustn't shift it.
1337 if (symset_find(&done, s->num) < 0)
1338 symset_add(&done, s->num, 0);
1340 if (s->type != Nonterminal)
1346 add_first(pr, bs+1, &LA, g, &to_end);
1348 struct symset ss = set_find(g, is->items.data[i]);
1349 symset_union(&LA, &ss);
1351 sn = save_set(g, LA);
1352 LA = set_find(g, sn);
1355 /* Add productions for this symbol */
1356 for (p2 = s->first_production;
1357 p2 < g->production_count &&
1358 g->productions[p2]->head == s;
1360 int itm = item_num(p2, 0);
1361 int pos = symset_find(&is->items, itm);
1363 symset_add(&is->items, itm, sn);
1364 /* Will have re-ordered, so start
1365 * from beginning again */
1367 } else if (type >= LALR) {
1368 struct symset ss = set_find(g, is->items.data[pos]);
1369 struct symset tmp = INIT_SYMSET;
1370 struct symset *la = &LA;
1372 symset_union(&tmp, &ss);
1373 if (symset_union(&tmp, la)) {
1374 is->items.data[pos] = save_set(g, tmp);
1382 For each symbol we found (and placed in `done`) we collect all the items
1383 for which this symbol is next, and create a new itemset with "DOT"
1384 advanced over the symbol. This is then added to the collection of
1385 itemsets (or merged with a pre-existing itemset).
1387 ###### derive itemsets
1388 // Now we have a completed itemset, so we need to
1389 // compute all the 'next' itemsets and create them
1390 // if they don't exist.
1391 for (i = 0; i < done.cnt; i++) {
1393 unsigned short state;
1394 struct symbol *sym = g->symtab[done.syms[i]];
1395 enum assoc assoc = Non;
1396 unsigned short precedence = 0;
1397 struct symset newitemset = INIT_SYMSET;
1399 newitemset = INIT_DATASET;
1401 for (j = 0; j < is->items.cnt; j++) {
1402 int itm = is->items.syms[j];
1403 int p = item_prod(itm);
1404 int bp = item_dot(itm);
1405 struct production *pr = g->productions[p];
1406 unsigned short la = 0;
1409 if (bp == pr->body_size)
1411 if (pr->body[bp] != sym)
1416 la = is->items.data[j];
1417 if (bp == pr->body_size &&
1418 pr->precedence > 0 &&
1419 pr->precedence > precedence) {
1420 // new itemset is reducible and has a precedence.
1421 precedence = pr->precedence;
1424 pos = symset_find(&newitemset, item_num(p, bp));
1427 symset_add(&newitemset, item_num(p, bp), la);
1428 else if (type >= LALR) {
1429 // Need to merge la set.
1430 int la2 = newitemset.data[pos];
1432 struct symset ss = set_find(g, la2);
1433 struct symset LA = INIT_SYMSET;
1434 symset_union(&LA, &ss);
1435 ss = set_find(g, la);
1436 if (symset_union(&LA, &ss))
1437 newitemset.data[pos] = save_set(g, LA);
1443 state = add_itemset(g, newitemset, assoc, precedence, type);
1444 if (symset_find(&is->go_to, done.syms[i]) < 0)
1445 symset_add(&is->go_to, done.syms[i], state);
1448 All that is left is to create the initial itemset from production zero, and
1449 with `TK_eof` as the LA set.
1452 static void build_itemsets(struct grammar *g, enum grammar_type type)
1454 struct symset first = INIT_SYMSET;
1457 unsigned short la = 0;
1459 // LA set just has eof
1460 struct symset eof = INIT_SYMSET;
1461 symset_add(&eof, TK_eof, 0);
1462 la = save_set(g, eof);
1463 first = INIT_DATASET;
1465 // production 0, offset 0 (with no data)
1466 symset_add(&first, item_num(0, 0), la);
1467 add_itemset(g, first, Non, 0, type);
1468 for (check_again = 0, is = g->items;
1470 is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1472 struct symset done = INIT_SYMSET;
1483 ### Completing the analysis.
1485 The exact process of analysis depends on which level was requested. For
1486 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1487 `LR1` we need first, but not follow. For `SLR` we need both.
1489 We don't build the "action" tables that you might expect as the parser
1490 doesn't use them. They will be calculated to some extent if needed for
1493 Once we have built everything we allocate arrays for the two lists:
1494 symbols and itemsets. This allows more efficient access during
1495 reporting. The symbols are grouped as terminals, then non-terminals,
1496 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1497 Special terminals -- meaning just EOL -- are included with the
1498 non-terminals so that they are not expected by the scanner.
1500 ###### grammar fields
1501 struct symbol **symtab;
1502 struct itemset **statetab;
1505 ###### grammar_analyse
1507 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1511 int snum = TK_reserved;
1512 for (s = g->syms; s; s = s->next)
1513 if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1517 g->first_nonterm = snum;
1518 for (s = g->syms; s; s = s->next)
1519 if (s->num < 0 && s->type != Virtual) {
1523 for (s = g->syms; s; s = s->next)
1529 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1530 for (s = g->syms; s; s = s->next)
1531 g->symtab[s->num] = s;
1540 build_itemsets(g, type);
1542 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1543 for (is = g->items; is ; is = is->next)
1544 g->statetab[is->state] = is;
1547 ## Reporting on the Grammar
1549 The purpose of the report is to give the grammar developer insight into
1550 how the grammar parser will work. It is basically a structured dump of
1551 all the tables that have been generated, plus a description of any conflicts.
1553 ###### grammar_report
1554 static int grammar_report(struct grammar *g, enum grammar_type type)
1560 return report_conflicts(g, type);
1563 Firstly we have the complete list of symbols, together with the
1564 "FIRST" set if that was generated. We add a mark to each symbol to
1565 show if it is nullable (`.`).
1569 static void report_symbols(struct grammar *g)
1573 printf("SYMBOLS + FIRST:\n");
1575 printf("SYMBOLS:\n");
1577 for (n = 0; n < g->num_syms; n++) {
1578 struct symbol *s = g->symtab[n];
1582 printf(" %c%3d%c: ",
1583 s->nullable ? '.':' ',
1584 s->num, symtypes[s->type]);
1587 printf(" (%d%s)", s->precedence,
1588 assoc_names[s->assoc]);
1590 if (g->first && s->type == Nonterminal) {
1593 for (j = 0; j < g->first[n].cnt; j++) {
1596 prtxt(g->symtab[g->first[n].syms[j]]->name);
1603 Then we have the follow sets if they were computed.
1605 static void report_follow(struct grammar *g)
1608 printf("FOLLOW:\n");
1609 for (n = 0; n < g->num_syms; n++)
1610 if (g->follow[n].cnt) {
1614 prtxt(g->symtab[n]->name);
1615 for (j = 0; j < g->follow[n].cnt; j++) {
1618 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1624 And finally the item sets. These include the GO TO tables and, for
1625 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1626 it up a bit. First the items, with production number and associativity.
1628 static void report_item(struct grammar *g, int itm)
1630 int p = item_prod(itm);
1631 int dot = item_dot(itm);
1632 struct production *pr = g->productions[p];
1636 prtxt(pr->head->name);
1638 for (i = 0; i < pr->body_size; i++) {
1639 printf(" %s", (dot == i ? ". ": ""));
1640 prtxt(pr->body[i]->name);
1642 if (dot == pr->body_size)
1645 if (pr->precedence && dot == pr->body_size)
1646 printf(" (%d%s)", pr->precedence,
1647 assoc_names[pr->assoc]);
1648 if (dot < pr->body_size &&
1649 pr->body[dot]->precedence) {
1650 struct symbol *s = pr->body[dot];
1651 printf(" [%d%s]", s->precedence,
1652 assoc_names[s->assoc]);
1657 The LA sets which are (possibly) reported with each item:
1659 static void report_la(struct grammar *g, int lanum)
1661 struct symset la = set_find(g, lanum);
1665 printf(" LOOK AHEAD(%d)", lanum);
1666 for (i = 0; i < la.cnt; i++) {
1669 prtxt(g->symtab[la.syms[i]]->name);
1674 Then the go to sets:
1676 static void report_goto(struct grammar *g, struct symset gt)
1681 for (i = 0; i < gt.cnt; i++) {
1683 prtxt(g->symtab[gt.syms[i]]->name);
1684 printf(" -> %d\n", gt.data[i]);
1688 Now we can report all the item sets complete with items, LA sets, and GO TO.
1690 static void report_itemsets(struct grammar *g)
1693 printf("ITEM SETS(%d)\n", g->states);
1694 for (s = 0; s < g->states; s++) {
1696 struct itemset *is = g->statetab[s];
1697 printf(" Itemset %d:", s);
1699 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1701 for (j = 0; j < is->items.cnt; j++) {
1702 report_item(g, is->items.syms[j]);
1703 if (is->items.data != NO_DATA)
1704 report_la(g, is->items.data[j]);
1706 report_goto(g, is->go_to);
1710 ### Reporting conflicts
1712 Conflict detection varies a lot among different analysis levels. However
1713 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1714 LR1 are also similar as they have FOLLOW or LA sets.
1718 ## conflict functions
1720 static int report_conflicts(struct grammar *g, enum grammar_type type)
1723 printf("Conflicts:\n");
1725 cnt = conflicts_lr0(g, type);
1727 cnt = conflicts_slr(g, type);
1729 printf(" - no conflicts\n");
1733 LR0 conflicts are any state which has both a reducible item and
1734 a shiftable item, or two reducible items.
1736 LR05 conflicts only occur if two possible reductions exist,
1737 as shifts always over-ride reductions.
1739 ###### conflict functions
1740 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1744 for (i = 0; i < g->states; i++) {
1745 struct itemset *is = g->statetab[i];
1746 int last_reduce = -1;
1747 int prev_reduce = -1;
1748 int last_shift = -1;
1752 for (j = 0; j < is->items.cnt; j++) {
1753 int itm = is->items.syms[j];
1754 int p = item_prod(itm);
1755 int bp = item_dot(itm);
1756 struct production *pr = g->productions[p];
1758 if (bp == pr->body_size) {
1759 prev_reduce = last_reduce;
1763 if (pr->body[bp]->type == Terminal)
1766 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1767 printf(" State %d has both SHIFT and REDUCE:\n", i);
1768 report_item(g, is->items.syms[last_shift]);
1769 report_item(g, is->items.syms[last_reduce]);
1772 if (prev_reduce >= 0) {
1773 printf(" State %d has 2 (or more) reducible items\n", i);
1774 report_item(g, is->items.syms[prev_reduce]);
1775 report_item(g, is->items.syms[last_reduce]);
1782 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1783 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1784 in the source of the look ahead set.
1786 We build two datasets to reflect the "action" table: one which maps
1787 terminals to items where that terminal could be shifted and another
1788 which maps terminals to items that could be reduced when the terminal
1789 is in look-ahead. We report when we get conflicts between the two.
1791 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1796 for (i = 0; i < g->states; i++) {
1797 struct itemset *is = g->statetab[i];
1798 struct symset shifts = INIT_DATASET;
1799 struct symset reduce = INIT_DATASET;
1803 /* First collect the shifts */
1804 for (j = 0; j < is->items.cnt; j++) {
1805 unsigned short itm = is->items.syms[j];
1806 int p = item_prod(itm);
1807 int bp = item_dot(itm);
1808 struct production *pr = g->productions[p];
1811 if (bp >= pr->body_size ||
1812 pr->body[bp]->type != Terminal)
1817 if (s->precedence && is->precedence)
1818 /* Precedence resolves this, so no conflict */
1821 if (symset_find(&shifts, s->num) < 0)
1822 symset_add(&shifts, s->num, itm);
1824 /* Now look for reductions and conflicts */
1825 for (j = 0; j < is->items.cnt; j++) {
1826 unsigned short itm = is->items.syms[j];
1827 int p = item_prod(itm);
1828 int bp = item_dot(itm);
1829 struct production *pr = g->productions[p];
1831 if (bp < pr->body_size)
1836 la = g->follow[pr->head->num];
1838 la = set_find(g, is->items.data[j]);
1840 for (k = 0; k < la.cnt; k++) {
1841 int pos = symset_find(&shifts, la.syms[k]);
1843 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1845 prtxt(g->symtab[la.syms[k]]->name);
1847 report_item(g, shifts.data[pos]);
1848 report_item(g, itm);
1850 pos = symset_find(&reduce, la.syms[k]);
1852 symset_add(&reduce, la.syms[k], itm);
1855 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1856 prtxt(g->symtab[la.syms[k]]->name);
1858 report_item(g, itm);
1859 report_item(g, reduce.data[pos]);
1863 symset_free(shifts);
1864 symset_free(reduce);
1870 ## Generating the parser
1872 The exported part of the parser is the `parse_XX` function, where the name
1873 `XX` is based on the name of the parser files.
1875 This takes a `code_node`, a partially initialized `token_config`, and an
1876 optional `FILE` to send tracing to. The `token_config` gets the list of
1877 known words added and then is used with the `code_node` to initialize the
1880 `parse_XX` then calls the library function `parser_run` to actually complete
1881 the parse. This needs the `states` table and functions to call the various
1882 pieces of code provided in the grammar file, so they are generated first.
1884 ###### parser_generate
1886 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1887 struct code_node *pre_reduce)
1893 gen_reduce(f, g, file, pre_reduce);
1896 fprintf(f, "#line 0 \"gen_parser\"\n");
1897 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1900 fprintf(f, "\tstruct token_state *tokens;\n");
1901 fprintf(f, "\tconfig->words_marks = known;\n");
1902 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1903 fprintf(f, "\ttokens = token_open(code, config);\n");
1904 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1905 fprintf(f, "\ttoken_close(tokens);\n");
1906 fprintf(f, "\treturn rv;\n");
1907 fprintf(f, "}\n\n");
1910 ### Known words table
1912 The known words table is simply an array of terminal symbols.
1913 The table of nonterminals used for tracing is a similar array.
1917 static void gen_known(FILE *f, struct grammar *g)
1920 fprintf(f, "#line 0 \"gen_known\"\n");
1921 fprintf(f, "static const char *known[] = {\n");
1922 for (i = TK_reserved;
1923 i < g->num_syms && g->symtab[i]->type == Terminal;
1925 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1926 g->symtab[i]->name.txt);
1927 fprintf(f, "};\n\n");
1930 static void gen_non_term(FILE *f, struct grammar *g)
1933 fprintf(f, "#line 0 \"gen_non_term\"\n");
1934 fprintf(f, "static const char *non_term[] = {\n");
1935 for (i = TK_reserved;
1938 if (g->symtab[i]->type == Nonterminal ||
1939 g->symtab[i]->isspecial)
1940 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1941 g->symtab[i]->name.txt);
1942 fprintf(f, "};\n\n");
1945 ### States and the goto tables.
1947 For each state we record the goto table and details of the reducible
1948 production if there is one.
1949 Some of the details of the reducible production are stored in the
1950 `do_reduce` function to come later. Here we store the production
1951 number, the body size (useful for stack management), and the resulting
1952 symbol (useful for knowing how to free data later).
1954 The go to table is stored in a simple array of `sym` and corresponding
1957 ###### exported types
1965 const struct lookup * go_to;
1974 static void gen_goto(FILE *f, struct grammar *g)
1977 fprintf(f, "#line 0 \"gen_goto\"\n");
1978 for (i = 0; i < g->states; i++) {
1979 struct symset gt = g->statetab[i]->go_to;
1984 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1986 for (j = 0; j < gt.cnt; j++)
1987 fprintf(f, "\t{ %d, %d },\n",
1988 gt.syms[j], gt.data[j]);
1993 static void gen_states(FILE *f, struct grammar *g)
1996 fprintf(f, "#line 0 \"gen_states\"\n");
1997 fprintf(f, "static const struct state states[] = {\n");
1998 for (i = 0; i < g->states; i++) {
1999 struct itemset *is = g->statetab[i];
2000 int j, prod = -1, prod_len;
2002 for (j = 0; j < is->items.cnt; j++) {
2003 int itm = is->items.syms[j];
2004 int p = item_prod(itm);
2005 int bp = item_dot(itm);
2006 struct production *pr = g->productions[p];
2008 if (bp < pr->body_size)
2010 /* This is what we reduce - choose longest */
2011 if (prod < 0 || prod_len < pr->body_size) {
2013 prod_len = pr->body_size;
2017 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2018 i, is->go_to.cnt, i);
2020 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2022 struct production *pr = g->productions[prod];
2023 struct symbol *hd = pr->head;
2024 fprintf(f, "%d, %d, %d, ",
2025 prod, pr->body_size, pr->head->num);
2026 if (hd->struct_name.txt == NULL)
2027 fprintf(f, "0 },\n");
2029 fprintf(f, "sizeof(struct %.*s%s) },\n",
2030 hd->struct_name.len,
2031 hd->struct_name.txt,
2032 hd->isref ? "*" : "");
2034 fprintf(f, "-1, -1, -1, -1 },\n");
2036 fprintf(f, "};\n\n");
2039 ### The `do_reduce` function and the code
2041 When the parser engine decides to reduce a production, it calls
2042 `do_reduce` which runs the code that was included with the production,
2045 This code needs to be able to store data somewhere. Rather than
2046 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2047 buffer and have `do_reduce` return the size to be saved.
2049 In order for the code to access "global" context, we pass in the
2050 "config" pointer that was passed to the parser function. If the `struct
2051 token_config` is embedded in some larger structure, the reducing code
2052 can access the larger structure using pointer manipulation. Performing
2053 that pointer manipulation, and any other common code or variables for
2054 all reduce actions, can be provided in code node called "reduce" which
2055 is passed around in `pre_reduce` which you might have already noticed.
2057 The code fragments require translation when written out. Any `$N` needs
2058 to be converted to a reference either to that buffer (if `$0`) or to the
2059 structure returned by a previous reduction. These pointers need to be
2060 cast to the appropriate type for each access. All this is handled in
2063 `gen_code` also allows symbol references to contain a '`<`' as in
2064 '`$<2`'. This is particularly useful for references (or pointers), but
2065 can be used with structures too. The `<` implies that the value is
2066 being moved out, so the object will not be automatically freed. It is
2067 equivalent to assigning `NULL` to the pointer or filling a structure
2070 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2071 and possibly a number. A number by itself (other than zero) selects a
2072 symbol from the body of the production. A sequence of letters selects
2073 the shortest symbol in the body which contains those letters in the
2074 given order. If a number follows the letters, then a later occurrence
2075 of that symbol is chosen. So "`$AB2`" will refer to the structure
2076 attached to the second occurrence of the shortest symbol which contains
2077 an `A` followed by a `B`. If there is no unique shortest symbol, or if
2078 the number given is too large, then the symbol reference is not
2079 transformed, and will cause an error when the code is compiled.
2083 static int textchr(struct text t, char c, int s)
2086 for (i = s; i < t.len; i++)
2092 static int subseq_match(char *seq, int slen, struct text name)
2096 st = textchr(name, *seq, st);
2106 static int choose_sym(char **namep, int len, struct production *p)
2108 char *name = *namep;
2117 ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2123 while (len > 0 && (c >= '0' && c <= '9')) {
2126 n = n * 10 + (c - '0');
2130 if (name == *namep || n > p->body_size)
2136 for (i = 0; i < p->body_size; i++) {
2137 if (!subseq_match(nam, namlen, p->body[i]->name))
2139 if (slen == 0 || p->body[i]->name.len < slen) {
2141 slen = p->body[i]->name.len;
2143 if (s >= 0 && p->body[i] != p->body[s] &&
2144 p->body[i]->name.len == p->body[s]->name.len)
2145 /* not unique, so s cannot be used */
2152 for (i = 0; i < p->body_size; i++)
2153 if (p->body[i] == p->body[s]) {
2158 if (n > 0 || i == p->body_size)
2164 static void gen_code(struct production *p, FILE *f, struct grammar *g)
2167 char *used = calloc(1, p->body_size);
2170 fprintf(f, "\t\t\t");
2171 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2185 n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2194 fprintf(f, "(*(struct %.*s*%s)ret)",
2195 p->head->struct_name.len,
2196 p->head->struct_name.txt,
2197 p->head->isref ? "*":"");
2198 else if (p->body[n-1]->type == Terminal)
2199 fprintf(f, "(*(struct token *)body[%d])",
2201 else if (p->body[n-1]->struct_name.txt == NULL)
2202 fprintf(f, "$%d", n);
2204 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2205 p->body[n-1]->struct_name.len,
2206 p->body[n-1]->struct_name.txt,
2207 p->body[n-1]->isref ? "*":"", n-1);
2213 for (i = 0; i < p->body_size; i++) {
2214 if (p->body[i]->struct_name.txt &&
2216 // assume this has been copied out
2217 if (p->body[i]->isref)
2218 fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2220 fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2228 static void gen_reduce(FILE *f, struct grammar *g, char *file,
2229 struct code_node *pre_reduce)
2232 fprintf(f, "#line 1 \"gen_reduce\"\n");
2233 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2235 fprintf(f, "\tint ret_size = 0;\n");
2237 code_node_print(f, pre_reduce, file);
2239 fprintf(f, "#line 4 \"gen_reduce\"\n");
2240 fprintf(f, "\tswitch(prod) {\n");
2241 for (i = 0; i < g->production_count; i++) {
2242 struct production *p = g->productions[i];
2243 fprintf(f, "\tcase %d:\n", i);
2246 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2250 if (p->head->struct_name.txt)
2251 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2252 p->head->struct_name.len,
2253 p->head->struct_name.txt,
2254 p->head->isref ? "*":"");
2256 fprintf(f, "\t\tbreak;\n");
2258 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2263 As each non-terminal can potentially cause a different type of data
2264 structure to be allocated and filled in, we need to be able to free it when
2267 It is particularly important to have fine control over freeing during error
2268 recovery where individual stack frames might need to be freed.
2270 For this, the grammar author is required to defined a `free_XX` function for
2271 each structure that is used by a non-terminal. `do_free` will call whichever
2272 is appropriate given a symbol number, and will call `free` (as is
2273 appropriate for tokens) on any terminal symbol.
2277 static void gen_free(FILE *f, struct grammar *g)
2281 fprintf(f, "#line 0 \"gen_free\"\n");
2282 fprintf(f, "static void do_free(short sym, void *asn)\n");
2284 fprintf(f, "\tif (!asn) return;\n");
2285 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2286 /* Need to handle special terminals too */
2287 for (i = 0; i < g->num_syms; i++) {
2288 struct symbol *s = g->symtab[i];
2289 if (i >= g->first_nonterm && s->type == Terminal &&
2291 fprintf(f, " || sym == %d", s->num);
2293 fprintf(f, ") {\n");
2294 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2295 fprintf(f, "\tswitch(sym) {\n");
2297 for (i = 0; i < g->num_syms; i++) {
2298 struct symbol *s = g->symtab[i];
2300 s->type != Nonterminal ||
2301 s->struct_name.txt == NULL)
2304 fprintf(f, "\tcase %d:\n", s->num);
2306 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2308 s->struct_name.txt);
2309 fprintf(f, "\t\tfree(asn);\n");
2311 fprintf(f, "\t\tfree_%.*s(asn);\n",
2313 s->struct_name.txt);
2314 fprintf(f, "\t\tbreak;\n");
2316 fprintf(f, "\t}\n}\n\n");
2319 ## The main routine.
2321 There are three key parts to the "main" function of parsergen: processing
2322 the arguments, loading the grammar file, and dealing with the grammar.
2324 ### Argument processing.
2326 Command line options allow the selection of analysis mode, name of output
2327 file, and whether or not a report should be generated. By default we create
2328 a report only if no code output was requested.
2330 The `parse_XX` function name uses the basename of the output file which
2331 should not have a suffix (`.c`). `.c` is added to the given name for the
2332 code, and `.h` is added for the header (if header text is specified in the
2339 static const struct option long_options[] = {
2340 { "LR0", 0, NULL, '0' },
2341 { "LR05", 0, NULL, '5' },
2342 { "SLR", 0, NULL, 'S' },
2343 { "LALR", 0, NULL, 'L' },
2344 { "LR1", 0, NULL, '1' },
2345 { "tag", 1, NULL, 't' },
2346 { "report", 0, NULL, 'R' },
2347 { "output", 1, NULL, 'o' },
2348 { NULL, 0, NULL, 0 }
2350 const char *options = "05SL1t:Ro:";
2352 ###### process arguments
2354 char *outfile = NULL;
2359 enum grammar_type type = LR05;
2360 while ((opt = getopt_long(argc, argv, options,
2361 long_options, NULL)) != -1) {
2376 outfile = optarg; break;
2378 tag = optarg; break;
2380 fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2385 infile = argv[optind++];
2387 fprintf(stderr, "No input file given\n");
2390 if (outfile && report == 1)
2393 if (name && strchr(name, '/'))
2394 name = strrchr(name, '/')+1;
2396 if (optind < argc) {
2397 fprintf(stderr, "Excess command line arguments\n");
2401 ### Loading the grammar file
2403 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2406 Once we have extracted the code (with `mdcode`) we expect to find three
2407 or four sections: header, code, grammar, reduce.
2408 Anything else that is not excluded by the `--tag` option is an error.
2410 "header", "code", and "reduce" are optional, though it is hard to build
2411 a working parser without the first two. "grammar" must be provided.
2415 #include <sys/mman.h>
2420 static void pr_err(char *msg)
2423 fprintf(stderr, "%s\n", msg);
2427 struct section *table;
2431 fd = open(infile, O_RDONLY);
2433 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2434 infile, strerror(errno));
2437 len = lseek(fd, 0, 2);
2438 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2439 table = code_extract(file, file+len, pr_err);
2441 struct code_node *hdr = NULL;
2442 struct code_node *code = NULL;
2443 struct code_node *gram = NULL;
2444 struct code_node *pre_reduce = NULL;
2445 for (s = table; s; s = s->next) {
2446 struct text sec = s->section;
2447 if (tag && !strip_tag(&sec, tag))
2449 if (text_is(sec, "header"))
2451 else if (text_is(sec, "code"))
2453 else if (text_is(sec, "grammar"))
2455 else if (text_is(sec, "reduce"))
2456 pre_reduce = s->code;
2458 fprintf(stderr, "Unknown content section: %.*s\n",
2459 s->section.len, s->section.txt);
2464 ### Processing the input
2466 As we need to append an extension to a filename and then open it for
2467 writing, and we need to do this twice, it helps to have a separate function.
2471 static FILE *open_ext(char *base, char *ext)
2473 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2475 strcat(strcpy(fn, base), ext);
2481 If we can read the grammar, then we analyse and optionally report on it. If
2482 the report finds conflicts we will exit with an error status.
2484 ###### process input
2485 struct grammar *g = NULL;
2487 fprintf(stderr, "No grammar section provided\n");
2490 g = grammar_read(gram);
2492 fprintf(stderr, "Failure to parse grammar\n");
2497 grammar_analyse(g, type);
2499 if (grammar_report(g, type))
2503 If a "headers" section is defined, we write it out and include a declaration
2504 for the `parse_XX` function so it can be used from separate code.
2506 if (rv == 0 && hdr && outfile) {
2507 FILE *f = open_ext(outfile, ".h");
2509 code_node_print(f, hdr, infile);
2510 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2514 fprintf(stderr, "Cannot create %s.h\n",
2520 And if all goes well and an output file was provided, we create the `.c`
2521 file with the code section (if any) and the parser tables and function.
2523 if (rv == 0 && outfile) {
2524 FILE *f = open_ext(outfile, ".c");
2527 code_node_print(f, code, infile);
2528 gen_parser(f, g, infile, name, pre_reduce);
2531 fprintf(stderr, "Cannot create %s.c\n",
2537 And that about wraps it up. We need to set the locale so that UTF-8 is
2538 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2540 ###### File: parsergen.mk
2541 parsergen : parsergen.o libscanner.o libmdcode.o
2542 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2549 int main(int argc, char *argv[])
2554 setlocale(LC_ALL,"");
2556 ## process arguments
2563 ## The SHIFT/REDUCE parser
2565 Having analysed the grammar and generated all the tables, we only need
2566 the shift/reduce engine to bring it all together.
2568 ### Goto table lookup
2570 The parser generator has nicely provided us with goto tables sorted by
2571 symbol number. We need a binary search function to find a symbol in the
2574 ###### parser functions
2576 static int search(const struct state *l, int sym)
2579 int hi = l->go_to_cnt;
2583 while (lo + 1 < hi) {
2584 int mid = (lo + hi) / 2;
2585 if (l->go_to[mid].sym <= sym)
2590 if (l->go_to[lo].sym == sym)
2591 return l->go_to[lo].state;
2596 ### Memory allocation
2598 The `scanner` returns tokens in a local variable - we want them in allocated
2599 memory so they can live in the `asn_stack`. So we provide `tok_copy` to
2600 make an allocated copy as required.
2602 ###### parser includes
2605 ###### parser functions
2607 static struct token *tok_copy(struct token tk)
2609 struct token *new = malloc(sizeof(*new));
2614 ### The state stack.
2616 The core data structure for the parser is the stack. This tracks all
2617 the symbols that have been recognised or partially recognised.
2619 The stack usually won't grow very large - maybe a few tens of entries.
2620 So we dynamically grow an array as required but never bother to
2621 shrink it down again.
2623 We keep the stack as two separate allocations. One, `asn_stack` stores
2624 the "abstract syntax nodes" which are created by each reduction. When
2625 we call `do_reduce` we need to pass an array of the `asn`s of the body
2626 of the production, and by keeping a separate `asn` stack, we can just
2627 pass a pointer into this stack.
2629 The other allocation stores all other stack fields of which there are
2630 several. The `state` is the most important one and guides the parsing
2631 process. The `sym` is nearly unnecessary as it is implicit in the
2632 state. However when we want to free entries from the `asn_stack`, it
2633 helps to know what type they are so we can call the right freeing
2634 function. The symbol leads us to the right free function through
2637 The stack is most properly seen as alternating states and symbols -
2638 states, like the 'DOT' in items, are between symbols. Each frame in
2639 our stack holds a state and the symbol that was before it. The
2640 bottom of stack holds the start state but no symbol, as nothing came
2641 before the beginning. As we need to store some value, `TK_eof` is used
2642 to mark the beginning of the file as well as the end.
2644 Indents (IN) are sometimes shifted and sometimes only accounted.
2645 Whatever decision is made must apply equally to the matching OUT. To
2646 ensure this we keep a stack of bits in `ignored_indents` and
2647 `indent_depth`. When we process an IN, we record if it was ignored.
2648 When we see an OUT, we pop off the relevant bit and use it to decide how
2651 ###### parser functions
2667 Two operations are needed on the stack - shift (which is like push) and pop.
2669 Shift applies not only to terminals but also to non-terminals. When we
2670 reduce a production we will pop off frames corresponding to the body
2671 symbols, then push on a frame for the head of the production. This last
2672 is exactly the same process as shifting in a terminal so we use the same
2673 function for both. In both cases we provide the symbol. The state is
2674 deduced from the current top-of-stack state and the new symbol.
2676 To simplify other code we arrange for `shift` to fail if there is no `goto`
2677 state for the symbol. This is useful in basic parsing due to our design
2678 that we shift when we can, and reduce when we cannot. So the `shift`
2679 function reports if it could.
2681 `shift` is also used to push state zero onto the stack, so if the
2682 stack is empty, it always chooses zero as the next state.
2684 So `shift` finds the next state. If that succeeds it extends the
2685 allocations if needed and pushes all the information onto the stacks.
2687 An extra complication is added to `shift` by the `EOL` token. This
2688 token must be generated when a `NEWLINE` is seen, but an `EOL` is
2689 expected. When this happens, the `NEWLINE` is NOT consumed, so multiple
2690 EOL can appear before a NEWLINE. To indicate that the token was shifted
2691 but not consumed, `shift` can return the special value `2`. The token
2692 number for `EOL` cannot be statically declared, so when the parser
2693 starts we need to look through the array of non-terminals to find the
2701 while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2703 p.tk_eol += TK_reserved + config->known_count;
2706 ###### parser functions
2708 static int shift(struct parser *p,
2709 short sym, void *asn,
2710 const struct state states[])
2712 struct frame next = {0};
2714 int newstate = p->tos
2715 ? search(&states[p->stack[p->tos-1].state],
2720 else if (sym != TK_newline)
2723 // have a NEWLINE, might need an EOL
2726 ? search(&states[p->stack[p->tos-1].state],
2732 asn = tok_copy(*(struct token*)asn);
2735 if (p->tos >= p->stack_size) {
2736 p->stack_size += 10;
2737 p->stack = realloc(p->stack, p->stack_size
2738 * sizeof(p->stack[0]));
2739 p->asn_stack = realloc(p->asn_stack, p->stack_size
2740 * sizeof(p->asn_stack[0]));
2743 next.state = newstate;
2745 p->stack[p->tos] = next;
2746 p->asn_stack[p->tos] = asn;
2751 `pop` primarily moves the top of stack (`tos`) back down the required
2752 amount and frees any `asn` entries that need to be freed. It is called
2753 _after_ we reduce a production, just before we `shift` the nonterminal
2756 ###### parser functions
2758 static void pop(struct parser *p, int num,
2759 void(*do_free)(short sym, void *asn))
2764 for (i = 0; i < num; i++)
2765 do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2768 ### The heart of the parser.
2770 Now we have the parser. For each token we might shift it, trigger a
2771 reduction, or start error handling. 2D tokens (IN, OUT, NEWLINE, EOL)
2772 might also be ignored. Ignoring tokens is combined with shifting.
2776 struct parser p = { 0 };
2777 struct token *tk = NULL;
2780 ###### heart of parser
2782 shift(&p, TK_eof, NULL, states);
2783 while (!accepted && p.tos > 0) {
2784 struct frame *tos = &p.stack[p.tos-1];
2786 tk = tok_copy(token_next(tokens));
2787 parser_trace(trace, &p,
2788 tk, states, non_term, config->known_count);
2790 ## try shift or ignore
2795 Indents are ignored unless they can be shifted onto the stack
2796 immediately. The end of an indented section - the OUT token - is
2797 ignored precisely when the indent was ignored. To keep track of this we
2798 need a small stack of flags, which is easily stored as bits in an
2799 `unsigned long`. This will never overflow as the scanner only allows
2800 20 levels of indentation.
2803 unsigned long ignored_indents;
2806 NEWLINE/EOL is ignored when in an indented section of text which was not
2807 explicitly expected by the grammar. So if the most recent indent is
2808 ignored, so is any EOL token.
2810 For other tokens, we shift the next token if that is possible, otherwise
2811 we try to reduce a production.
2813 ###### try shift or ignore
2815 if ((tk->num == TK_newline || tk->num == TK_out) &&
2816 (p.ignored_indents & (1 << p.indent_depth))) {
2817 /* indented, so ignore OUT and NEWLINE */
2818 if (tk->num == TK_out)
2819 p.indent_depth -= 1;
2822 parser_trace_action(trace, "Ignore");
2826 switch (shift(&p, tk->num, tk, states)) {
2828 if (tk->num == TK_out)
2829 p.indent_depth -= 1;
2830 if (tk->num == TK_in) {
2831 p.indent_depth += 1;
2832 p.ignored_indents &= ~(1 << p.indent_depth);
2837 parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
2842 if (tk->num == TK_in) {
2843 /* No indent expected here, so ignore IN */
2846 p.indent_depth += 1;
2847 p.ignored_indents |= (1 << p.indent_depth);
2848 parser_trace_action(trace, "Ignore");
2852 We have already discussed the bulk of the handling of a "reduce" action,
2853 with the `pop()` and `shift()` functions doing much of the work. There
2854 is a little more complexity needed to manage storage for the asn (Abstract
2855 Syntax Node), and also a test of whether the reduction is permitted.
2857 When we try to shift the result of reducing production-zero, it will
2858 fail because there is no next state. In this case the asn will not have
2859 been stored on the stack, so it gets stored in the `ret` variable, and we
2860 report that the input has been accepted.
2868 if (states[tos->state].reduce_prod >= 0) {
2871 const struct state *nextstate = &states[tos->state];
2872 int prod = nextstate->reduce_prod;
2873 int size = nextstate->reduce_size;
2874 int res_size = nextstate->result_size;
2876 body = p.asn_stack + (p.tos - size);
2877 res = res_size ? calloc(1, res_size) : NULL;
2878 res_size = do_reduce(prod, body, config, res);
2879 if (res_size != nextstate->result_size)
2881 pop(&p, size, do_free);
2882 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2885 parser_trace_action(trace, "Accept");
2887 parser_trace_action(trace, "Reduce");
2891 If we can neither shift nor reduce we have an error to handle. There
2892 are two possible responses to an error: we can pop single frames off the
2893 stack until we find a frame that can shift `TK_error`, or we can discard
2894 the current look-ahead symbol.
2896 When we first see an error we do the first of these and set a flag to
2897 record that we are processing an error. If the normal shift/reduce
2898 tests fail to find that anything can be done from that state, we start
2899 dropping tokens until either we manage to shift one, or reach end-of-file.
2912 parser_trace_action(trace, "DISCARD");
2913 if (tk->num == TK_eof)
2918 struct token *err_tk;
2920 parser_trace_action(trace, "ERROR");
2922 err_tk = tok_copy(*tk);
2923 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2924 // discard this state
2925 pop(&p, 1, do_free);
2928 // no state accepted TK_error
2934 ###### parser includes
2939 void *parser_run(struct token_state *tokens,
2940 const struct state states[],
2941 int (*do_reduce)(int, void**, struct token_config*, void*),
2942 void (*do_free)(short, void*),
2943 FILE *trace, const char *non_term[],
2944 struct token_config *config)
2953 pop(&p, p.tos, do_free);
2959 ###### exported functions
2960 void *parser_run(struct token_state *tokens,
2961 const struct state states[],
2962 int (*do_reduce)(int, void**, struct token_config*, void*),
2963 void (*do_free)(short, void*),
2964 FILE *trace, const char *non_term[],
2965 struct token_config *config);
2969 Being able to visualize the parser in action can be invaluable when
2970 debugging the parser code, or trying to understand how the parse of a
2971 particular grammar progresses. The stack contains all the important
2972 state, so just printing out the stack every time around the parse loop
2973 can make it possible to see exactly what is happening.
2975 This doesn't explicitly show each SHIFT and REDUCE action. However they
2976 are easily deduced from the change between consecutive lines, and the
2977 details of each state can be found by cross referencing the states list
2978 in the stack with the "report" that parsergen can generate.
2980 For terminal symbols, we just dump the token. For non-terminals we
2981 print the name of the symbol. The look ahead token is reported at the
2982 end inside square brackets.
2984 ###### parser functions
2986 static char *reserved_words[] = {
2987 [TK_error] = "ERROR",
2990 [TK_newline] = "NEWLINE",
2994 void parser_trace(FILE *trace, struct parser *p,
2995 struct token *tk, const struct state states[],
2996 const char *non_term[], int knowns)
3001 for (i = 0; i < p->tos; i++) {
3002 struct frame *f = &p->stack[i];
3005 if (sym < TK_reserved &&
3006 reserved_words[sym] != NULL)
3007 fputs(reserved_words[sym], trace);
3008 else if (sym < TK_reserved + knowns) {
3009 struct token *t = p->asn_stack[i];
3010 text_dump(trace, t->txt, 20);
3012 fputs(non_term[sym - TK_reserved - knowns],
3016 fprintf(trace, "(%d) ", f->state);
3018 fprintf(trace, "[");
3019 if (tk->num < TK_reserved &&
3020 reserved_words[tk->num] != NULL)
3021 fputs(reserved_words[tk->num], trace);
3023 text_dump(trace, tk->txt, 20);
3024 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3027 void parser_trace_action(FILE *trace, char *action)
3030 fprintf(trace, " - %s\n", action);
3035 The obvious example for a parser is a calculator.
3037 As `scanner` provides a number parsing function using `libgmp`, it is not much
3038 work to perform arbitrary rational number calculations.
3040 This calculator takes one expression, or an equality test, per line.
3041 The results are printed and if any equality test fails, the program
3042 exits with an error.
3044 ###### File: parsergen.mk
3045 calc.c calc.h : parsergen parsergen.mdc
3046 ./parsergen --tag calc -o calc parsergen.mdc
3047 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3048 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3050 ./calc parsergen.mdc
3055 #include "parse_number.h"
3056 // what do we use for a demo-grammar? A calculator of course.
3068 #include <sys/mman.h>
3074 #include "scanner.h"
3079 static void free_number(struct number *n)
3085 static int text_is(struct text t, char *s)
3087 return (strlen(s) == t.len &&
3088 strncmp(s, t.txt, t.len) == 0);
3091 int main(int argc, char *argv[])
3093 int fd = open(argv[1], O_RDONLY);
3094 int len = lseek(fd, 0, 2);
3095 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3096 struct section *table = code_extract(file, file+len, NULL);
3098 struct token_config config = {
3099 .ignored = (1 << TK_line_comment)
3102 .number_chars = ".,_+-",
3106 for (s = table; s; s = s->next)
3107 if (text_is(s->section, "example: input"))
3108 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3110 struct section *t = table->next;
3111 code_free(table->code);
3123 Session -> Session Line
3126 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3127 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3128 gmp_printf(" or as a decimal: %Fg\n", fl);
3132 | Expression = Expression NEWLINE ${
3133 if (mpq_equal($1.val, $3.val))
3134 gmp_printf("Both equal %Qd\n", $1.val);
3136 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
3141 | NEWLINE ${ printf("Blank line\n"); }$
3142 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3145 Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3146 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3147 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3148 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3149 | Expression // Expression ${ {
3152 mpz_init(z0); mpz_init(z1); mpz_init(z2);
3153 mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3154 mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3155 mpz_tdiv_q(z0, z1, z2);
3156 mpq_set_z($0.val, z0);
3157 mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3159 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3160 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3165 3.1415926535 - 355/113
3167 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3169 1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1