ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8 Indent tokens (IN and OUT) and end-of-line tokens (EOL and NEWLINE) can
   9 be used to describe the grammar and the parser can selectively ignore
  10 these where they aren't relevant.
  11
  12 There are several distinct sections.
  13
  14 1. `grammar_read` will read a grammar from a literate-code file and
  15    store the productions, symbols, and code fragments.
  16 2. `grammar_analyse` will take that grammar and build LR parsing
  17    states and look-ahead tables.
  18 3. `grammar_report` will present the details of the analysis
  19    in a readable format and will report any conflicts.
  20 4. `parser_generate` will write out C code files with various
  21    tables and with the code fragments arranged in useful places.
  22 5. `parser_run` is a library function intended to be linked together
  23    with the generated parser tables to complete the implementation of
  24    a parser.
  25 6. Finally `calc` is a test grammar for a simple calculator.  The
  26    `parsergen` program built from the C code in this file can extract
  27    that grammar directly from this file and process it.
  28
  29 ###### File: parsergen.c
  30         #include <unistd.h>
  31         #include <stdlib.h>
  32         #include <stdio.h>
  33         ## includes
  34         ## forward declarations
  35         ## declarations
  36         ## functions
  37         ## grammar_read
  38         ## grammar_analyse
  39         ## grammar_report
  40         ## parser_generate
  41         ## main
  42 ###### File: parser.h
  43         ## exported types
  44         ## exported functions
  45 ###### File: libparser.c
  46         #include <unistd.h>
  47         #include <stdlib.h>
  48         #include <stdio.h>
  49         ## parser includes
  50         ## parser functions
  51         ## parser_run
  52 ###### File: parsergen.mk
  53         CFLAGS += -Wall -g
  54         all :: parsergen calc
  55         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  56                 ./md2c parsergen.mdc
  57
  58 ## Reading the grammar
  59
  60 The grammar must be stored in a markdown literate code file as parsed by
  61 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  62 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  63 present, will be literally copied into the generated `.c` and `.h`
  64 files.  The third is required and contains the grammar which is
  65 tokenised with "[scanner][]".  `reduce` can provide variable
  66 declarations and common intialization code for all the reduction code
  67 fragments in the grammar.
  68
  69 If the `--tag` option is given, then any top level header that doesn't
  70 start with the tag is ignored, and the tag is striped from the rest.  So
  71 `--tag Foo` means that the four sections expected are `Foo: header`,
  72 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  73 to extract the sample calculator from this file.
  74
  75 [mdcode]: mdcode.html
  76 [scanner]: scanner.html
  77
  78 ###### includes
  79         #include "mdcode.h"
  80         #include "scanner.h"
  81
  82 ###### parser includes
  83         #include "mdcode.h"
  84         #include "scanner.h"
  85
  86 The grammar contains production sets, precedence/associativity
  87 declarations, general terminal declarations, and data type declarations.
  88 These are all parsed with _ad hoc_ parsing as we don't have a parser
  89 generator yet.
  90
  91 The precedence and associativity can be set for each production, and
  92 can be inherited from symbols.  The data type (either structure or a
  93 reference to a structure) is potentially defined for each non-terminal
  94 and describes what C structure is used to store information about each
  95 symbol.
  96
  97 ###### declarations
  98         enum assoc {Left, Right, Non};
  99         char *assoc_names[] = {"Left","Right","Non"};
 100
 101         struct symbol {
 102                 struct text struct_name;
 103                 int isref;
 104                 enum assoc assoc;
 105                 unsigned short precedence;
 106                 ## symbol fields
 107         };
 108         struct production {
 109                 unsigned short precedence;
 110                 enum assoc assoc;
 111                 ## production fields
 112         };
 113         struct grammar {
 114                 ## grammar fields
 115         };
 116
 117 The strings reported by `mdcode` and `scanner` are `struct text` which
 118 have length rather than being nul terminated.  To help with printing and
 119 comparing we define `text_is` and `prtxt`, which should possibly go in
 120 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 121 strings which might contain control characters.
 122
 123 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 124 because that is what we need to detect tags.
 125
 126 ###### functions
 127         static int text_is(struct text t, char *s)
 128         {
 129                 return (strlen(s) == t.len &&
 130                         strncmp(s, t.txt, t.len) == 0);
 131         }
 132         static void prtxt(struct text t)
 133         {
 134                 printf("%.*s", t.len, t.txt);
 135         }
 136
 137         static int strip_tag(struct text *t, char *tag)
 138         {
 139                 int skip = strlen(tag) + 1;
 140                 if (skip >= t->len ||
 141                     strncmp(t->txt, tag, skip-1) != 0 ||
 142                     t->txt[skip-1] != ':')
 143                         return 0;
 144                 while (skip < t->len && t->txt[skip] == ' ')
 145                         skip++;
 146                 t->len -= skip;
 147                 t->txt += skip;
 148                 return 1;
 149         }
 150
 151 ### Symbols
 152
 153 Productions are comprised primarily of symbols - terminal and
 154 non-terminal.  We do not make a syntactic distinction between the two,
 155 though non-terminals must be identifiers while terminals can also be
 156 marks.  Non-terminal symbols are those which appear in the head of a
 157 production, terminal symbols are those which don't.  There are also
 158 "virtual" symbols used for precedence marking discussed later, and
 159 sometimes we won't know what type a symbol is yet.
 160
 161 To help with code safety it is possible to declare the terminal symbols.
 162 If this is done, then any symbol used in a production that does not
 163 appear in the head of any production and is not declared as a terminal
 164 is treated as an error.
 165
 166 ###### forward declarations
 167         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 168         char *symtypes = "UVTN";
 169 ###### symbol fields
 170         enum symtype type;
 171 ###### grammar fields
 172         int terminals_declared;
 173
 174 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 175 table of known symbols and the resulting parser will report them as
 176 `TK_reserved + N`.  A small set of identifiers are reserved for the
 177 different token types that `scanner` can report, and an even smaller set
 178 are reserved for a special token that the parser can generate (`EOL`) as
 179 will be described later.  This latter set cannot use predefined numbers,
 180 so they are marked as `isspecial` for now and will get assigned a number
 181 with the non-terminals later.
 182
 183 ###### declarations
 184
 185         static struct { int num; char *name; } reserved_words[] = {
 186                 { TK_error,        "ERROR" },
 187                 { TK_number,       "NUMBER" },
 188                 { TK_ident,        "IDENTIFIER" },
 189                 { TK_mark,         "MARK" },
 190                 { TK_string,       "STRING" },
 191                 { TK_multi_string, "MULTI_STRING" },
 192                 { TK_in,           "IN" },
 193                 { TK_out,          "OUT" },
 194                 { TK_newline,      "NEWLINE" },
 195                 { TK_eof,          "$eof" },
 196                 { -1,              "EOL" },
 197         };
 198
 199 ###### symbol fields
 200         short num;
 201         unsigned int isspecial:1;
 202
 203 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 204 recognised.  The former is automatically expected at the end of the text
 205 being parsed. The latter are always ignored by the parser.
 206
 207 All of these symbols are stored in a simple symbol table.  We use the
 208 `struct text` from `mdcode` to store the name and link them together
 209 into a sorted list using an insertion sort.
 210
 211 We don't have separate `find` and `insert` functions as any symbol we
 212 find needs to be remembered.  We simply expect `find` to always return a
 213 symbol, but its type might be `Unknown`.
 214
 215 ###### includes
 216         #include <string.h>
 217
 218 ###### symbol fields
 219         struct text name;
 220         struct symbol *next;
 221
 222 ###### grammar fields
 223         struct symbol *syms;
 224         int num_syms;
 225
 226 ###### functions
 227         static struct symbol *sym_find(struct grammar *g, struct text s)
 228         {
 229                 struct symbol **l = &g->syms;
 230                 struct symbol *n;
 231                 int cmp = 1;
 232
 233                 while (*l &&
 234                         (cmp = text_cmp((*l)->name, s)) < 0)
 235                                 l = & (*l)->next;
 236                 if (cmp == 0)
 237                         return *l;
 238                 n = calloc(1, sizeof(*n));
 239                 n->name = s;
 240                 n->type = Unknown;
 241                 n->next = *l;
 242                 n->num = -1;
 243                 *l = n;
 244                 return n;
 245         }
 246
 247         static void symbols_init(struct grammar *g)
 248         {
 249                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 250                 int i;
 251                 for (i = 0; i < entries; i++) {
 252                         struct text t;
 253                         struct symbol *s;
 254                         t.txt = reserved_words[i].name;
 255                         t.len = strlen(t.txt);
 256                         s = sym_find(g, t);
 257                         s->type = Terminal;
 258                         s->num = reserved_words[i].num;
 259                         s->isspecial = 1;
 260                 }
 261         }
 262
 263 ### Data types and precedence.
 264
 265 Data type specification, precedence specification, and declaration of
 266 terminals are all introduced by a dollar sign at the start of the line.
 267 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 268 precedence, if it is `TERM` then the line declares terminals without
 269 precedence, otherwise it specifies a data type.
 270
 271 The data type name is simply stored and applied to the head of all
 272 subsequent productions.  It must be the name of a structure optionally
 273 preceded by an asterisk which means a reference or "pointer".  So
 274 `$expression` maps to `struct expression` and `$*statement` maps to
 275 `struct statement *`.
 276
 277 Any productions given before the first data type declaration will have
 278 no data type associated with them and can carry no information.  In
 279 order to allow other non-terminals to have no type, the data type
 280 `$void` can be given.  This does *not* mean that `struct void` will be
 281 used, but rather than no type will be associated with subsequent
 282 non-terminals.
 283
 284 The precedence line must contain a list of symbols, either terminal
 285 symbols or virtual symbols.  It can only contain symbols that have not
 286 been seen yet, so precedence declaration must precede the productions
 287 that they affect.
 288
 289 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 290 carry precedence information but are not included in the grammar.  A
 291 production can declare that it inherits the precedence of a given
 292 virtual symbol.
 293
 294 This use of `$$` precludes it from being used as a symbol in the
 295 described language.  Two other symbols: `${` and `}$` are also
 296 unavailable.
 297
 298 Each new precedence line introduces a new precedence level and
 299 declares how it associates.  This level is stored in each symbol
 300 listed and may be inherited by any production which uses the symbol.  A
 301 production inherits from the last symbol which has a precedence.
 302
 303 The symbols on the first precedence line have the lowest precedence.
 304 Subsequent lines introduce symbols with higher precedence and so bind
 305 more tightly.
 306
 307 Note that a declaration line (or "dollar line") can start with either of
 308 two different marks: "$" or "$*".  The `dollar_line()` function defined
 309 here is told which was found in the `isref` argument.
 310
 311 ###### grammar fields
 312         struct text current_type;
 313         int type_isref;
 314         int prec_levels;
 315
 316 ###### declarations
 317         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 318         static const char *known[] = { "$$", "${", "}$" };
 319
 320 ###### functions
 321         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 322         {
 323                 struct token t = token_next(ts);
 324                 char *err;
 325                 enum assoc assoc;
 326                 int term = 0;
 327                 int found;
 328
 329                 if (t.num != TK_ident) {
 330                         err = "type or assoc expected after '$'";
 331                         goto abort;
 332                 }
 333                 if (text_is(t.txt, "LEFT"))
 334                         assoc = Left;
 335                 else if (text_is(t.txt, "RIGHT"))
 336                         assoc = Right;
 337                 else if (text_is(t.txt, "NON"))
 338                         assoc = Non;
 339                 else if (text_is(t.txt, "TERM")) {
 340                         term = 1;
 341                         g->terminals_declared = 1;
 342                 } else {
 343                         g->current_type = t.txt;
 344                         g->type_isref = isref;
 345                         if (text_is(t.txt, "void"))
 346                                 g->current_type.txt = NULL;
 347                         t = token_next(ts);
 348                         if (t.num != TK_newline) {
 349                                 err = "Extra tokens after type name";
 350                                 goto abort;
 351                         }
 352                         return NULL;
 353                 }
 354
 355                 if (isref) {
 356                         err = "$* cannot be followed by a precedence";
 357                         goto abort;
 358                 }
 359
 360                 // This is a precedence or TERM line, need some symbols.
 361                 found = 0;
 362                 g->prec_levels += 1;
 363                 t = token_next(ts);
 364                 while (t.num != TK_newline) {
 365                         enum symtype type = Terminal;
 366                         struct symbol *s;
 367                         if (t.num == TK_virtual) {
 368                                 type = Virtual;
 369                                 t = token_next(ts);
 370                                 if (t.num != TK_ident) {
 371                                         err = "$$ must be followed by a word";
 372                                         goto abort;
 373                                 }
 374                                 if (term) {
 375                                         err = "Virtual symbols not permitted on $TERM line";
 376                                         goto abort;
 377                                 }
 378                         } else if (t.num != TK_ident &&
 379                                    t.num != TK_mark) {
 380                                 err = "Illegal token in precedence line";
 381                                 goto abort;
 382                         }
 383                         s = sym_find(g, t.txt);
 384                         if (s->type != Unknown) {
 385                                 err = "Symbols in precedence/TERM line must not already be known.";
 386                                 goto abort;
 387                         }
 388                         s->type = type;
 389                         if (!term) {
 390                                 s->precedence = g->prec_levels;
 391                                 s->assoc = assoc;
 392                         }
 393                         found += 1;
 394                         t = token_next(ts);
 395                 }
 396                 if (found == 0) {
 397                         err = "No symbols given on precedence/TERM line";
 398                         goto abort;
 399                 }
 400                 return NULL;
 401         abort:
 402                 while (t.num != TK_newline && t.num != TK_eof)
 403                         t = token_next(ts);
 404                 return err;
 405         }
 406
 407 ### Productions
 408
 409 A production either starts with an identifier which is the head
 410 non-terminal, or a vertical bar (`|`) in which case this production
 411 uses the same head as the previous one.  The identifier must be
 412 followed by a `->` mark.  All productions for a given non-terminal must
 413 be grouped together with the `nonterminal ->` given only once.
 414
 415 After this a (possibly empty) sequence of identifiers and marks form
 416 the body of the production.  A virtual symbol may be given after the
 417 body by preceding it with `$$`.  If a virtual symbol is given, the
 418 precedence of the production is that for the virtual symbol.  If none
 419 is given, the precedence is inherited from the last symbol in the
 420 production which has a precedence specified.
 421
 422 After the optional precedence may come the `${` mark.  This indicates
 423 the start of a code fragment.  If present, this must be on the same
 424 line as the start of the production.
 425
 426 All of the text from the `${` through to the matching `}$` is
 427 collected and forms the code-fragment for the production.  It must all
 428 be in one `code_node` of the literate code.  The `}$` must be
 429 at the end of a line.
 430
 431 Text in the code fragment will undergo substitutions where `$N` or
 432 `$<N`,for some numeric `N` (or non-numeric indicator as described
 433 later), will be replaced with a variable holding the parse information
 434 for the particular symbol in the production.  `$0` is the head of the
 435 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 436 for a terminal symbol is `struct token`.  For a non-terminal, it is
 437 whatever has been declared for that symbol.  The `<` may be included and
 438 means that the value (usually a reference) is being moved out, so it
 439 will not automatically be freed.  The effect of using '<' is that the
 440 variable is cleared to all-zeros.
 441
 442 While building productions we will need to add to an array which needs to
 443 grow dynamically.
 444
 445 ###### functions
 446         static void array_add(void *varray, int *cnt, void *new)
 447         {
 448                 void ***array = varray;
 449                 int current = 0;
 450                 const int step = 8;
 451                 current = ((*cnt-1) | (step-1))+1;
 452                 if (*cnt == current) {
 453                         /* must grow */
 454                         current += step;
 455                         *array = realloc(*array, current * sizeof(void*));
 456                 }
 457                 (*array)[*cnt] = new;
 458                 (*cnt) += 1;
 459         }
 460
 461 Collecting the code fragment simply involves reading tokens until we
 462 hit the end token `}$`, and noting the character position of the start and
 463 the end.
 464
 465 ###### functions
 466         static struct text collect_code(struct token_state *state,
 467                                         struct token start)
 468         {
 469                 struct text code;
 470                 struct token t;
 471                 code.txt = start.txt.txt + start.txt.len;
 472                 do
 473                         t = token_next(state);
 474                 while (t.node == start.node &&
 475                        t.num != TK_close && t.num != TK_error &&
 476                        t.num != TK_eof);
 477                 if (t.num == TK_close && t.node == start.node)
 478                         code.len = t.txt.txt - code.txt;
 479                 else
 480                         code.txt = NULL;
 481                 return code;
 482         }
 483
 484 Now we have all the bits we need to parse a full production.
 485
 486 ###### includes
 487         #include <memory.h>
 488
 489 ###### grammar fields
 490         struct production **productions;
 491         int production_count;
 492
 493 ###### production fields
 494         struct symbol  *head;
 495         struct symbol **body;
 496         int             body_size;
 497         struct text     code;
 498         int             code_line;
 499
 500 ###### symbol fields
 501         int first_production;
 502
 503 ###### functions
 504         static char *parse_production(struct grammar *g,
 505                                       struct symbol *head,
 506                                       struct token_state *state)
 507         {
 508                 /* Head has already been parsed. */
 509                 struct token tk;
 510                 char *err;
 511                 struct production p, *pp;
 512
 513                 memset(&p, 0, sizeof(p));
 514                 p.head = head;
 515                 tk = token_next(state);
 516                 while (tk.num == TK_ident || tk.num == TK_mark) {
 517                         struct symbol *bs = sym_find(g, tk.txt);
 518                         if (bs->type == Unknown) {
 519                                 if (!g->terminals_declared)
 520                                         bs->type = Terminal;
 521                         }
 522                         if (bs->type == Virtual) {
 523                                 err = "Virtual symbol not permitted in production";
 524                                 goto abort;
 525                         }
 526                         if (bs->precedence) {
 527                                 p.precedence = bs->precedence;
 528                                 p.assoc = bs->assoc;
 529                         }
 530                         array_add(&p.body, &p.body_size, bs);
 531                         tk = token_next(state);
 532                 }
 533                 if (tk.num == TK_virtual) {
 534                         struct symbol *vs;
 535                         tk = token_next(state);
 536                         if (tk.num != TK_ident) {
 537                                 err = "word required after $$";
 538                                 goto abort;
 539                         }
 540                         vs = sym_find(g, tk.txt);
 541                         if (vs->precedence == 0) {
 542                                 err = "symbol after $$ must have precedence";
 543                                 goto abort;
 544                         } else {
 545                                 p.precedence = vs->precedence;
 546                                 p.assoc = vs->assoc;
 547                         }
 548                         tk = token_next(state);
 549                 }
 550                 if (tk.num == TK_open) {
 551                         p.code_line = tk.line;
 552                         p.code = collect_code(state, tk);
 553                         if (p.code.txt == NULL) {
 554                                 err = "code fragment not closed properly";
 555                                 goto abort;
 556                         }
 557                         tk = token_next(state);
 558                 }
 559
 560                 if (tk.num != TK_newline && tk.num != TK_eof) {
 561                         err = "stray tokens at end of line";
 562                         goto abort;
 563                 }
 564                 pp = malloc(sizeof(*pp));
 565                 *pp = p;
 566                 array_add(&g->productions, &g->production_count, pp);
 567                 return NULL;
 568         abort:
 569                 while (tk.num != TK_newline && tk.num != TK_eof)
 570                         tk = token_next(state);
 571                 free(p.body);
 572                 return err;
 573         }
 574
 575 With the ability to parse productions and dollar-lines, we have nearly all
 576 that we need to parse a grammar from a `code_node`.
 577
 578 The head of the first production will effectively be the `start` symbol
 579 of the grammar.  However it won't _actually_ be so.  Processing the
 580 grammar is greatly simplified if the real start symbol only has a single
 581 production, and expects `$eof` as the final terminal.  So when we find
 582 the first explicit production we insert an extra implicit production as
 583 production zero which looks like
 584
 585 ###### Example: production 0
 586         $start -> START $eof
 587
 588 where `START` is the first non-terminal given.
 589
 590 ###### create production zero
 591         struct production *p = calloc(1,sizeof(*p));
 592         struct text start = {"$start",6};
 593         struct text eof = {"$eof",4};
 594         struct text code = {"$0 = $<1;", 9};
 595         p->head = sym_find(g, start);
 596         p->head->type = Nonterminal;
 597         p->head->struct_name = g->current_type;
 598         p->head->isref = g->type_isref;
 599         if (g->current_type.txt)
 600                 p->code = code;
 601         array_add(&p->body, &p->body_size, head);
 602         array_add(&p->body, &p->body_size, sym_find(g, eof));
 603         p->head->first_production = g->production_count;
 604         array_add(&g->productions, &g->production_count, p);
 605
 606 Now we are ready to read in the grammar.  We ignore comments
 607 and strings so that the marks which introduce them can be
 608 used as terminals in the grammar.  We don't ignore numbers
 609 even though we don't allow them as that causes the scanner
 610 to produce errors that the parser is better positioned to handle.
 611 We also ignore indentation, but we expect and use newlines.
 612
 613 To allow comments in the grammar, we explicitly check for "//" as the
 614 first token on the line and those lines are skipped.  "//" can still be
 615 used as a terminal anywhere that a terminal is expected.
 616
 617 ###### grammar_read
 618         static struct grammar *grammar_read(struct code_node *code)
 619         {
 620                 struct token_config conf = {
 621                         .word_start = "",
 622                         .word_cont = "",
 623                         .words_marks = known,
 624                         .known_count = sizeof(known)/sizeof(known[0]),
 625                         .number_chars = "",
 626                         .ignored = (1 << TK_line_comment)
 627                                  | (1 << TK_block_comment)
 628                                  | (0 << TK_number)
 629                                  | (1 << TK_string)
 630                                  | (1 << TK_multi_string)
 631                                  | (1 << TK_in)
 632                                  | (1 << TK_out),
 633                 };
 634
 635                 struct token_state *state = token_open(code, &conf);
 636                 struct token tk;
 637                 struct symbol *head = NULL;
 638                 struct grammar *g;
 639                 char *err = NULL;
 640
 641                 g = calloc(1, sizeof(*g));
 642                 symbols_init(g);
 643
 644                 for (tk = token_next(state); tk.num != TK_eof;
 645                      tk = token_next(state)) {
 646                         if (tk.num == TK_newline)
 647                                 continue;
 648                         if (tk.num == TK_ident) {
 649                                 // new non-terminal
 650                                 head = sym_find(g, tk.txt);
 651                                 if (head->type == Nonterminal)
 652                                         err = "This non-terminal has already be used.";
 653                                 else if (head->type == Virtual)
 654                                         err = "Virtual symbol not permitted in head of production";
 655                                 else {
 656                                         head->type = Nonterminal;
 657                                         head->struct_name = g->current_type;
 658                                         head->isref = g->type_isref;
 659                                         if (g->production_count == 0) {
 660                                                 ## create production zero
 661                                         }
 662                                         head->first_production = g->production_count;
 663                                         tk = token_next(state);
 664                                         if (tk.num == TK_mark &&
 665                                             text_is(tk.txt, "->"))
 666                                                 err = parse_production(g, head, state);
 667                                         else
 668                                                 err = "'->' missing in production";
 669                                 }
 670                         } else if (tk.num == TK_mark
 671                                    && text_is(tk.txt, "|")) {
 672                                 // another production for same non-term
 673                                 if (head)
 674                                         err = parse_production(g, head, state);
 675                                 else
 676                                         err = "First production must have a head";
 677                         } else if (tk.num == TK_mark
 678                                    && text_is(tk.txt, "$")) {
 679                                 err = dollar_line(state, g, 0);
 680                         } else if (tk.num == TK_mark
 681                                    && text_is(tk.txt, "$*")) {
 682                                 err = dollar_line(state, g, 1);
 683                         } else if (tk.num == TK_mark
 684                                    && text_is(tk.txt, "//")) {
 685                                 while (tk.num != TK_newline &&
 686                                        tk.num != TK_eof)
 687                                         tk = token_next(state);
 688                         } else {
 689                                 err = "Unrecognised token at start of line.";
 690                         }
 691                         if (err)
 692                                 goto abort;
 693                 }
 694                 token_close(state);
 695                 if (g->terminals_declared) {
 696                         struct symbol *s;
 697                         int errs = 0;
 698                         for (s = g->syms; s; s = s->next) {
 699                                 if (s->type != Unknown)
 700                                         continue;
 701                                 errs += 1;
 702                                 fprintf(stderr, "Token %.*s not declared\n",
 703                                         s->name.len, s->name.txt);
 704                         }
 705                         if (errs) {
 706                                 free(g); // FIXME free content
 707                                 g = NULL;
 708                         }
 709                 }
 710                 return g;
 711         abort:
 712                 fprintf(stderr, "Error at line %d: %s\n",
 713                         tk.line, err);
 714                 token_close(state);
 715                 free(g); // FIXME free content
 716                 return NULL;
 717         }
 718
 719 ## Analysing the grammar
 720
 721 The central task in analysing the grammar is to determine a set of
 722 states to drive the parsing process.  This will require finding various
 723 sets of symbols and of "items".  Some of these sets will need to attach
 724 information to each element in the set, so they are more like maps.
 725
 726 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 727 Many of these will be the same as each other.  To avoid memory wastage,
 728 and to simplify some comparisons of sets, these sets will be stored in a
 729 list of unique sets, each assigned a number.
 730
 731 Once we have the data structures in hand to manage these sets and lists,
 732 we can start setting the 'nullable' flag, build the 'FIRST' and 'FOLLOW'
 733 sets, and then create the item sets which define the various states.
 734
 735 ### Symbol sets.
 736
 737 Though we don't only store symbols in these sets, they are the main
 738 things we store, so they are called symbol sets or "symsets".
 739
 740 A symset has a size, an array of shorts, and an optional array of data
 741 values, which are also shorts.  If the array of data is not present, we
 742 store an impossible pointer, as `NULL` is used to indicate that no
 743 memory has been allocated yet;
 744
 745 ###### declarations
 746         struct symset {
 747                 short cnt;
 748                 unsigned short *syms, *data;
 749         };
 750         #define NO_DATA ((unsigned short *)1)
 751         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 752         const struct symset INIT_DATASET = { 0, NULL, NULL };
 753
 754 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 755 using an insertion sort.  We don't explicitly record the amount of
 756 allocated space as it can be derived directly from the current `cnt`
 757 using `((cnt - 1) | 7) + 1`.
 758
 759 ###### functions
 760         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 761         {
 762                 int i;
 763                 int current = ((s->cnt-1) | 7) + 1;
 764                 if (current == s->cnt) {
 765                         current += 8;
 766                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 767                         if (s->data != NO_DATA)
 768                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 769                 }
 770                 i = s->cnt;
 771                 while (i > 0 && s->syms[i-1] > key) {
 772                         s->syms[i] = s->syms[i-1];
 773                         if (s->data != NO_DATA)
 774                                 s->data[i] = s->data[i-1];
 775                         i--;
 776                 }
 777                 s->syms[i] = key;
 778                 if (s->data != NO_DATA)
 779                         s->data[i] = val;
 780                 s->cnt += 1;
 781         }
 782
 783 Finding a symbol (or item) in a `symset` uses a simple binary search.
 784 We return the index where the value was found (so data can be accessed),
 785 or `-1` to indicate failure.
 786
 787         static int symset_find(struct symset *ss, unsigned short key)
 788         {
 789                 int lo = 0;
 790                 int hi = ss->cnt;
 791
 792                 if (hi == 0)
 793                         return -1;
 794                 while (lo + 1 < hi) {
 795                         int mid = (lo + hi) / 2;
 796                         if (ss->syms[mid] <= key)
 797                                 lo = mid;
 798                         else
 799                                 hi = mid;
 800                 }
 801                 if (ss->syms[lo] == key)
 802                         return lo;
 803                 return -1;
 804         }
 805
 806 We will often want to form the union of two symsets.  When we do, we
 807 will often want to know if anything changed (as that might mean there is
 808 more work to do).  So `symset_union` reports whether anything was added
 809 to the first set.  We use a slow quadratic approach as these sets are
 810 rarely large.  If profiling shows this to be a problem it can be
 811 optimised later.
 812
 813         static int symset_union(struct symset *a, struct symset *b)
 814         {
 815                 int i;
 816                 int added = 0;
 817                 for (i = 0; i < b->cnt; i++)
 818                         if (symset_find(a, b->syms[i]) < 0) {
 819                                 unsigned short data = 0;
 820                                 if (b->data != NO_DATA)
 821                                         data = b->data[i];
 822                                 symset_add(a, b->syms[i], data);
 823                                 added++;
 824                         }
 825                 return added;
 826         }
 827
 828 And of course we must be able to free a symset.
 829
 830         static void symset_free(struct symset ss)
 831         {
 832                 free(ss.syms);
 833                 if (ss.data != NO_DATA)
 834                         free(ss.data);
 835         }
 836
 837 ### Symset Storage
 838
 839 Some symsets are simply stored somewhere appropriate in the `grammar`
 840 data structure, others needs a bit of indirection.  This applies
 841 particularly to "LA" sets.  These will be paired with "items" in an
 842 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 843 to be stored in the `data` field, so we need a mapping from a small
 844 (short) number to an LA `symset`.
 845
 846 As mentioned earlier this involves creating a list of unique symsets.
 847
 848 For now, we just use a linear list sorted by insertion.  A skiplist
 849 would be more efficient and may be added later.
 850
 851 ###### declarations
 852
 853         struct setlist {
 854                 struct symset ss;
 855                 int num;
 856                 struct setlist *next;
 857         };
 858
 859 ###### grammar fields
 860         struct setlist *sets;
 861         int nextset;
 862
 863 ###### functions
 864
 865         static int ss_cmp(struct symset a, struct symset b)
 866         {
 867                 int i;
 868                 int diff = a.cnt - b.cnt;
 869                 if (diff)
 870                         return diff;
 871                 for (i = 0; i < a.cnt; i++) {
 872                         diff = (int)a.syms[i] - (int)b.syms[i];
 873                         if (diff)
 874                                 return diff;
 875                 }
 876                 return 0;
 877         }
 878
 879         static int save_set(struct grammar *g, struct symset ss)
 880         {
 881                 struct setlist **sl = &g->sets;
 882                 int cmp = 1;
 883                 struct setlist *s;
 884
 885                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 886                         sl = & (*sl)->next;
 887                 if (cmp == 0) {
 888                         symset_free(ss);
 889                         return (*sl)->num;
 890                 }
 891
 892                 s = malloc(sizeof(*s));
 893                 s->ss = ss;
 894                 s->num = g->nextset;
 895                 g->nextset += 1;
 896                 s->next = *sl;
 897                 *sl = s;
 898                 return s->num;
 899         }
 900
 901 Finding a set by number is currently performed by a simple linear
 902 search.  If this turns out to hurt performance, we can store the sets in
 903 a dynamic array like the productions.
 904
 905         static struct symset set_find(struct grammar *g, int num)
 906         {
 907                 struct setlist *sl = g->sets;
 908                 while (sl && sl->num != num)
 909                         sl = sl->next;
 910                 return sl->ss;
 911         }
 912
 913 ### Setting `nullable`
 914
 915 We set `nullable` on the head symbol for any production for which all
 916 the body symbols (if any) are nullable.  As this is a recursive
 917 definition, any change in the `nullable` setting means that we need to
 918 re-evaluate where it needs to be set.
 919
 920 We simply loop around performing the same calculations until no more
 921 changes happen.
 922
 923 ###### symbol fields
 924         int nullable;
 925
 926 ###### functions
 927         static void set_nullable(struct grammar *g)
 928         {
 929                 int check_again = 1;
 930                 while (check_again) {
 931                         int p;
 932                         check_again = 0;
 933                         for (p = 0; p < g->production_count; p++) {
 934                                 struct production *pr = g->productions[p];
 935                                 int s;
 936
 937                                 if (pr->head->nullable)
 938                                         continue;
 939                                 for (s = 0; s < pr->body_size; s++)
 940                                         if (! pr->body[s]->nullable)
 941                                                 break;
 942                                 if (s == pr->body_size) {
 943                                         pr->head->nullable = 1;
 944                                         check_again = 1;
 945                                 }
 946                         }
 947                 }
 948         }
 949
 950 ### Building the `first` sets
 951
 952 When calculating what can follow a particular non-terminal, we will need
 953 to know what the "first" terminal in any subsequent non-terminal might
 954 be.  So we calculate the `first` set for every non-terminal and store
 955 them in an array.  We don't bother recording the "first" set for
 956 terminals as they are trivial.
 957
 958 As the "first" for one symbol might depend on the "first" of another, we
 959 repeat the calculations until no changes happen, just like with
 960 `set_nullable`.  This makes use of the fact that `symset_union` reports
 961 if any change happens.
 962
 963 The core of this, which finds the "first" of part of a production body,
 964 will be reused for computing the "follow" sets, so we split that out
 965 into a separate function, `add_first`, which also reports if it got all
 966 the way to the end of the production without finding a non-nullable
 967 symbol.
 968
 969 ###### grammar fields
 970         struct symset *first;
 971
 972 ###### functions
 973
 974         static int add_first(struct production *pr, int start,
 975                              struct symset *target, struct grammar *g,
 976                              int *to_end)
 977         {
 978                 int s;
 979                 int changed = 0;
 980                 for (s = start; s < pr->body_size; s++) {
 981                         struct symbol *bs = pr->body[s];
 982                         if (bs->type == Terminal) {
 983                                 if (symset_find(target, bs->num) < 0) {
 984                                         symset_add(target, bs->num, 0);
 985                                         changed = 1;
 986                                 }
 987                                 break;
 988                         } else if (symset_union(target, &g->first[bs->num]))
 989                                 changed = 1;
 990                         if (!bs->nullable)
 991                                 break;
 992                 }
 993                 if (to_end)
 994                         *to_end = (s == pr->body_size);
 995                 return changed;
 996         }
 997
 998         static void build_first(struct grammar *g)
 999         {
1000                 int check_again = 1;
1001                 int s;
1002                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
1003                 for (s = 0; s < g->num_syms; s++)
1004                         g->first[s] = INIT_SYMSET;
1005
1006                 while (check_again) {
1007                         int p;
1008                         check_again = 0;
1009                         for (p = 0; p < g->production_count; p++) {
1010                                 struct production *pr = g->productions[p];
1011                                 struct symset *head = &g->first[pr->head->num];
1012
1013                                 if (add_first(pr, 0, head, g, NULL))
1014                                         check_again = 1;
1015                         }
1016                 }
1017         }
1018
1019 ### Building the `follow` sets.
1020
1021 There are two different situations when we will want to generate
1022 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1023 single "follow" set for each non-terminal in the grammar.  That is what
1024 is happening here.  If we are doing an LALR or LR analysis we will want
1025 to generate a separate "LA" set for each item.  We do that later in
1026 state generation.
1027
1028 There are two parts to generating a "follow" set.  Firstly we look at
1029 every place that any non-terminal appears in the body of any production,
1030 and we find the set of possible "first" symbols after there.  This is
1031 done using `add_first` above and only needs to be done once as the
1032 "first" sets are now stable and will not change.
1033
1034 ###### grammar fields
1035         struct symset *follow;
1036
1037 ###### follow code
1038
1039         for (p = 0; p < g->production_count; p++) {
1040                 struct production *pr = g->productions[p];
1041                 int b;
1042
1043                 for (b = 0; b < pr->body_size - 1; b++) {
1044                         struct symbol *bs = pr->body[b];
1045                         if (bs->type == Terminal)
1046                                 continue;
1047                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1048                 }
1049         }
1050
1051 The second part is to add the "follow" set of the head of a production
1052 to the "follow" sets of the final symbol in the production, and any
1053 other symbol which is followed only by `nullable` symbols.  As this
1054 depends on "follow" itself we need to repeatedly perform the process
1055 until no further changes happen.
1056
1057 Rather than 2 nested loops, one that repeats the whole process until
1058 there is no change, and one that iterates of the set of productions, we
1059 combine these two functions into a single loop.
1060
1061 ###### follow code
1062
1063         for (check_again = 0, p = 0;
1064              p < g->production_count;
1065              p < g->production_count-1
1066                 ? p++ : check_again ? (check_again = 0, p = 0)
1067                               : p++) {
1068                 struct production *pr = g->productions[p];
1069                 int b;
1070
1071                 for (b = pr->body_size - 1; b >= 0; b--) {
1072                         struct symbol *bs = pr->body[b];
1073                         if (bs->type == Terminal)
1074                                 break;
1075                         if (symset_union(&g->follow[bs->num],
1076                                          &g->follow[pr->head->num]))
1077                                 check_again = 1;
1078                         if (!bs->nullable)
1079                                 break;
1080                 }
1081         }
1082
1083 We now just need to create and initialise the `follow` list to get a
1084 complete function.
1085
1086 ###### functions
1087         static void build_follow(struct grammar *g)
1088         {
1089                 int s, check_again, p;
1090                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1091                 for (s = 0; s < g->num_syms; s++)
1092                         g->follow[s] = INIT_SYMSET;
1093                 ## follow code
1094         }
1095
1096 ### Building itemsets and states
1097
1098 There are three different levels of detail that can be chosen for
1099 building the itemsets and states for the LR grammar.  They are:
1100
1101 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1102    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1103    already computed,
1104 2. LALR(1) where we build look-ahead sets with each item and merge
1105    the LA sets when we find two paths to the same "kernel" set of items.
1106 3. LR(1) where different look-ahead for any item in the set means
1107    a different item set must be created.
1108
1109 ###### forward declarations
1110         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1111
1112 We need to be able to look through existing states to see if a newly
1113 generated state already exists.  For now we use a simple sorted linked
1114 list.
1115
1116 An item is a pair of numbers: the production number and the position of
1117 "DOT", which is an index into the body of the production.  As the
1118 numbers are not enormous we can combine them into a single "short" and
1119 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1120 production number (so 2000 productions with maximum of 31 symbols in the
1121 body).
1122
1123 Comparing the itemsets will be a little different to comparing symsets
1124 as we want to do the lookup after generating the "kernel" of an
1125 itemset, so we need to ignore the offset=zero items which are added during
1126 completion.
1127
1128 To facilitate this, we modify the "DOT" number so that "0" sorts to
1129 the end of the list in the symset, and then only compare items before
1130 the first "0".
1131
1132 ###### declarations
1133         static inline unsigned short item_num(int production, int dot)
1134         {
1135                 return production | ((31-dot) << 11);
1136         }
1137         static inline int item_prod(unsigned short item)
1138         {
1139                 return item & 0x7ff;
1140         }
1141         static inline int item_dot(unsigned short item)
1142         {
1143                 return (31-(item >> 11)) & 0x1f;
1144         }
1145
1146 For LR(1) analysis we need to compare not just the itemset in a state
1147 but also the LA sets.  As we assign each unique LA set a number, we
1148 can just compare the symset and the data values together.
1149
1150 ###### functions
1151         static int itemset_cmp(struct symset a, struct symset b,
1152                                enum grammar_type type)
1153         {
1154                 int i;
1155                 int av, bv;
1156
1157                 for (i = 0;
1158                      i < a.cnt && i < b.cnt &&
1159                      item_dot(a.syms[i]) > 0 &&
1160                      item_dot(b.syms[i]) > 0;
1161                      i++) {
1162                         int diff = a.syms[i] - b.syms[i];
1163                         if (diff)
1164                                 return diff;
1165                         if (type == LR1) {
1166                                 diff = a.data[i] - b.data[i];
1167                                 if (diff)
1168                                         return diff;
1169                         }
1170                 }
1171                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1172                         av = -1;
1173                 else
1174                         av = a.syms[i];
1175                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1176                         bv = -1;
1177                 else
1178                         bv = b.syms[i];
1179                 if (av - bv)
1180                         return av - bv;
1181                 if (type < LR1 || av == -1)
1182                         return 0;
1183                 return
1184                         a.data[i] - b.data[i];
1185         }
1186
1187 It will be helpful to know if an itemset has been "completed" or not,
1188 particularly for LALR where itemsets get merged, at which point they
1189 need to be consider for completion again.  So a `completed` flag is
1190 needed.
1191
1192 And now we can build the list of itemsets.  The lookup routine returns
1193 both a success flag and a pointer to where in the list an insert should
1194 happen, so we don't need to search a second time.
1195
1196 ###### declarations
1197         struct itemset {
1198                 struct itemset *next;
1199                 short state;
1200                 struct symset items;
1201                 struct symset go_to;
1202                 enum assoc assoc;
1203                 unsigned short precedence;
1204                 char completed;
1205         };
1206
1207 ###### grammar fields
1208         struct itemset *items;
1209         int states;
1210
1211 ###### functions
1212         static int itemset_find(struct grammar *g, struct itemset ***where,
1213                                 struct symset kernel, enum grammar_type type)
1214         {
1215                 struct itemset **ip;
1216
1217                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1218                         struct itemset *i = *ip;
1219                         int diff;
1220                         diff = itemset_cmp(i->items, kernel, type);
1221                         if (diff < 0)
1222                                 continue;
1223                         if (diff > 0)
1224                                 break;
1225                         /* found */
1226                         *where = ip;
1227                         return 1;
1228                 }
1229                 *where = ip;
1230                 return 0;
1231         }
1232
1233 Adding an itemset may require merging the LA sets if LALR analysis is
1234 happening.  If any new LA set adds any symbols that weren't in the old
1235 LA set, we clear the `completed` flag so that the dependants of this
1236 itemset will be recalculated and their LA sets updated.
1237
1238 `add_itemset` must consume the symsets it is passed, either by adding
1239 them to a data structure, of freeing them.
1240
1241         static int add_itemset(struct grammar *g, struct symset ss,
1242                                enum assoc assoc, unsigned short precedence,
1243                                enum grammar_type type)
1244         {
1245                 struct itemset **where, *is;
1246                 int i;
1247                 int found = itemset_find(g, &where, ss, type);
1248                 if (!found) {
1249                         is = calloc(1, sizeof(*is));
1250                         is->state = g->states;
1251                         g->states += 1;
1252                         is->items = ss;
1253                         is->assoc = assoc;
1254                         is->precedence = precedence;
1255                         is->next = *where;
1256                         is->go_to = INIT_DATASET;
1257                         *where = is;
1258                         return is->state;
1259                 }
1260                 is = *where;
1261                 if (type != LALR) {
1262                         symset_free(ss);
1263                         return is->state;
1264                 }
1265                 for (i = 0; i < ss.cnt; i++) {
1266                         struct symset temp = INIT_SYMSET, s;
1267                         if (ss.data[i] == is->items.data[i])
1268                                 continue;
1269                         s = set_find(g, is->items.data[i]);
1270                         symset_union(&temp, &s);
1271                         s = set_find(g, ss.data[i]);
1272                         if (symset_union(&temp, &s)) {
1273                                 is->items.data[i] = save_set(g, temp);
1274                                 is->completed = 0;
1275                         } else
1276                                 symset_free(temp);
1277                 }
1278                 symset_free(ss);
1279                 return is->state;
1280         }
1281
1282 #### The build
1283
1284 To build all the itemsets, we first insert the initial itemset made from
1285 production zero, complete each itemset, and then generate new itemsets
1286 from old until no new ones can be made.
1287
1288 Completing an itemset means finding all the items where "DOT" is
1289 followed by a nonterminal and adding "DOT=0" items for every production
1290 from that non-terminal - providing each item hasn't already been added.
1291 When we add such an item it might get inserted before the current
1292 one, so to make sure we process it we reset the loop counter to the
1293 start.
1294
1295 If LA sets are needed, the LA set for each new item is found using
1296 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1297 may be supplemented by the LA set for the item which produced the new
1298 item.
1299
1300 We also collect a set of all symbols which follow "DOT" (in `done`) as
1301 this is used in the next stage.
1302
1303 When itemsets are created we assign a precedence to the itemset from the
1304 complete item, if there is one.  We ignore the possibility of there
1305 being two and don't (currently) handle precedence in such grammars.
1306 When completing a grammar we ignore any item where DOT is followed by a
1307 terminal with a precedence lower than that for the itemset.  Unless the
1308 terminal has right associativity, we also ignore items where the
1309 terminal has the same precedence.  The result is that unwanted items are
1310 still in the itemset, but the terminal doesn't get into the go to set,
1311 so the item is ineffective.
1312
1313 ###### complete itemset
1314         for (i = 0; i < is->items.cnt; i++) {
1315                 int p = item_prod(is->items.syms[i]);
1316                 int bs = item_dot(is->items.syms[i]);
1317                 struct production *pr = g->productions[p];
1318                 int p2;
1319                 struct symbol *s;
1320                 struct symset LA = INIT_SYMSET;
1321                 unsigned short sn = 0;
1322
1323                 if (bs == pr->body_size)
1324                         continue;
1325                 s = pr->body[bs];
1326                 if (s->precedence && is->precedence &&
1327                     is->precedence > s->precedence)
1328                         /* This terminal has a low precedence and
1329                          * shouldn't be shifted
1330                          */
1331                         continue;
1332                 if (s->precedence && is->precedence &&
1333                     is->precedence == s->precedence && s->assoc != Right)
1334                         /* This terminal has a matching precedence and is
1335                          * not Right-associative, so we mustn't shift it.
1336                          */
1337                         continue;
1338                 if (symset_find(&done, s->num) < 0)
1339                         symset_add(&done, s->num, 0);
1340
1341                 if (s->type != Nonterminal)
1342                         continue;
1343                 check_again = 1;
1344                 if (type >= LALR) {
1345                         // Need the LA set.
1346                         int to_end;
1347                         add_first(pr, bs+1, &LA, g, &to_end);
1348                         if (to_end) {
1349                                 struct symset ss = set_find(g, is->items.data[i]);
1350                                 symset_union(&LA, &ss);
1351                         }
1352                         sn = save_set(g, LA);
1353                         LA = set_find(g, sn);
1354                 }
1355
1356                 /* Add productions for this symbol */
1357                 for (p2 = s->first_production;
1358                      p2 < g->production_count &&
1359                       g->productions[p2]->head == s;
1360                      p2++) {
1361                         int itm = item_num(p2, 0);
1362                         int pos = symset_find(&is->items, itm);
1363                         if (pos < 0) {
1364                                 symset_add(&is->items, itm, sn);
1365                                 /* Will have re-ordered, so start
1366                                  * from beginning again */
1367                                 i = -1;
1368                         } else if (type >= LALR) {
1369                                 struct symset ss = set_find(g, is->items.data[pos]);
1370                                 struct symset tmp = INIT_SYMSET;
1371                                 struct symset *la = &LA;
1372
1373                                 symset_union(&tmp, &ss);
1374                                 if (symset_union(&tmp, la)) {
1375                                         is->items.data[pos] = save_set(g, tmp);
1376                                         i = -1;
1377                                 } else
1378                                         symset_free(tmp);
1379                         }
1380                 }
1381         }
1382
1383 For each symbol we found (and placed in `done`) we collect all the items
1384 for which this symbol is next, and create a new itemset with "DOT"
1385 advanced over the symbol.  This is then added to the collection of
1386 itemsets (or merged with a pre-existing itemset).
1387
1388 ###### derive itemsets
1389         // Now we have a completed itemset, so we need to
1390         // compute all the 'next' itemsets and create them
1391         // if they don't exist.
1392         for (i = 0; i < done.cnt; i++) {
1393                 int j;
1394                 unsigned short state;
1395                 struct symbol *sym = g->symtab[done.syms[i]];
1396                 enum assoc assoc = Non;
1397                 unsigned short precedence = 0;
1398                 struct symset newitemset = INIT_SYMSET;
1399                 if (type >= LALR)
1400                         newitemset = INIT_DATASET;
1401
1402                 for (j = 0; j < is->items.cnt; j++) {
1403                         int itm = is->items.syms[j];
1404                         int p = item_prod(itm);
1405                         int bp = item_dot(itm);
1406                         struct production *pr = g->productions[p];
1407                         unsigned short la = 0;
1408                         int pos;
1409
1410                         if (bp == pr->body_size)
1411                                 continue;
1412                         if (pr->body[bp] != sym)
1413                                 continue;
1414
1415                         bp += 1;
1416                         if (type >= LALR)
1417                                 la = is->items.data[j];
1418                         if (bp == pr->body_size &&
1419                             pr->precedence > 0 &&
1420                             pr->precedence > precedence) {
1421                                 // new itemset is reducible and has a precedence.
1422                                 precedence = pr->precedence;
1423                                 assoc = pr->assoc;
1424                         }
1425                         pos = symset_find(&newitemset, item_num(p, bp));
1426
1427                         if (pos < 0)
1428                                 symset_add(&newitemset, item_num(p, bp), la);
1429                         else if (type >= LALR) {
1430                                 // Need to merge la set.
1431                                 int la2 = newitemset.data[pos];
1432                                 if (la != la2) {
1433                                         struct symset ss = set_find(g, la2);
1434                                         struct symset LA = INIT_SYMSET;
1435                                         symset_union(&LA, &ss);
1436                                         ss = set_find(g, la);
1437                                         if (symset_union(&LA, &ss))
1438                                                 newitemset.data[pos] = save_set(g, LA);
1439                                         else
1440                                                 symset_free(LA);
1441                                 }
1442                         }
1443                 }
1444                 state = add_itemset(g, newitemset, assoc, precedence, type);
1445                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1446                         symset_add(&is->go_to, done.syms[i], state);
1447         }
1448
1449 All that is left is to create the initial itemset from production zero, and
1450 with `TK_eof` as the LA set.
1451
1452 ###### functions
1453         static void build_itemsets(struct grammar *g, enum grammar_type type)
1454         {
1455                 struct symset first = INIT_SYMSET;
1456                 struct itemset *is;
1457                 int check_again;
1458                 unsigned short la = 0;
1459                 if (type >= LALR) {
1460                         // LA set just has eof
1461                         struct symset eof = INIT_SYMSET;
1462                         symset_add(&eof, TK_eof, 0);
1463                         la = save_set(g, eof);
1464                         first = INIT_DATASET;
1465                 }
1466                 // production 0, offset 0 (with no data)
1467                 symset_add(&first, item_num(0, 0), la);
1468                 add_itemset(g, first, Non, 0, type);
1469                 for (check_again = 0, is = g->items;
1470                      is;
1471                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1472                         int i;
1473                         struct symset done = INIT_SYMSET;
1474                         if (is->completed)
1475                                 continue;
1476                         is->completed = 1;
1477                         check_again = 1;
1478                         ## complete itemset
1479                         ## derive itemsets
1480                         symset_free(done);
1481                 }
1482         }
1483
1484 ### Completing the analysis.
1485
1486 The exact process of analysis depends on which level was requested.  For
1487 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1488 `LR1` we need first, but not follow.  For `SLR` we need both.
1489
1490 We don't build the "action" tables that you might expect as the parser
1491 doesn't use them.  They will be calculated to some extent if needed for
1492 a report.
1493
1494 Once we have built everything we allocate arrays for the two lists:
1495 symbols and itemsets.  This allows more efficient access during
1496 reporting.  The symbols are grouped as terminals, then non-terminals,
1497 then virtual, with the start of non-terminals recorded as `first_nonterm`.
1498 Special terminals -- meaning just EOL -- are included with the
1499 non-terminals so that they are not expected by the scanner.
1500
1501 ###### grammar fields
1502         struct symbol **symtab;
1503         struct itemset **statetab;
1504         int first_nonterm;
1505
1506 ###### grammar_analyse
1507
1508         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1509         {
1510                 struct symbol *s;
1511                 struct itemset *is;
1512                 int snum = TK_reserved;
1513                 for (s = g->syms; s; s = s->next)
1514                         if (s->num < 0 && s->type == Terminal && !s->isspecial) {
1515                                 s->num = snum;
1516                                 snum++;
1517                         }
1518                 g->first_nonterm = snum;
1519                 for (s = g->syms; s; s = s->next)
1520                         if (s->num < 0 && s->type != Virtual) {
1521                                 s->num = snum;
1522                                 snum++;
1523                         }
1524                 for (s = g->syms; s; s = s->next)
1525                         if (s->num < 0) {
1526                                 s->num = snum;
1527                                 snum++;
1528                         }
1529                 g->num_syms = snum;
1530                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1531                 for (s = g->syms; s; s = s->next)
1532                         g->symtab[s->num] = s;
1533
1534                 set_nullable(g);
1535                 if (type >= SLR)
1536                         build_first(g);
1537
1538                 if (type == SLR)
1539                         build_follow(g);
1540
1541                 build_itemsets(g, type);
1542
1543                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1544                 for (is = g->items; is ; is = is->next)
1545                         g->statetab[is->state] = is;
1546         }
1547
1548 ## Reporting on the Grammar
1549
1550 The purpose of the report is to give the grammar developer insight into
1551 how the grammar parser will work.  It is basically a structured dump of
1552 all the tables that have been generated, plus a description of any conflicts.
1553
1554 ###### grammar_report
1555         static int grammar_report(struct grammar *g, enum grammar_type type)
1556         {
1557                 report_symbols(g);
1558                 if (g->follow)
1559                         report_follow(g);
1560                 report_itemsets(g);
1561                 return report_conflicts(g, type);
1562         }
1563
1564 Firstly we have the complete list of symbols, together with the
1565 "FIRST" set if that was generated.  We add a mark to each symbol to
1566 show if it is nullable (`.`).
1567
1568 ###### functions
1569
1570         static void report_symbols(struct grammar *g)
1571         {
1572                 int n;
1573                 if (g->first)
1574                         printf("SYMBOLS + FIRST:\n");
1575                 else
1576                         printf("SYMBOLS:\n");
1577
1578                 for (n = 0; n < g->num_syms; n++) {
1579                         struct symbol *s = g->symtab[n];
1580                         if (!s)
1581                                 continue;
1582
1583                         printf(" %c%3d%c: ",
1584                                s->nullable ? '.':' ',
1585                                s->num, symtypes[s->type]);
1586                         prtxt(s->name);
1587                         if (s->precedence)
1588                                 printf(" (%d%s)", s->precedence,
1589                                        assoc_names[s->assoc]);
1590
1591                         if (g->first && s->type == Nonterminal) {
1592                                 int j;
1593                                 char c = ':';
1594                                 for (j = 0; j < g->first[n].cnt; j++) {
1595                                         printf("%c ", c);
1596                                         c = ',';
1597                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1598                                 }
1599                         }
1600                         printf("\n");
1601                 }
1602         }
1603
1604 Then we have the follow sets if they were computed.
1605
1606         static void report_follow(struct grammar *g)
1607         {
1608                 int n;
1609                 printf("FOLLOW:\n");
1610                 for (n = 0; n < g->num_syms; n++)
1611                         if (g->follow[n].cnt) {
1612                                 int j;
1613                                 char c = ':';
1614                                 printf("  ");
1615                                 prtxt(g->symtab[n]->name);
1616                                 for (j = 0; j < g->follow[n].cnt; j++) {
1617                                         printf("%c ", c);
1618                                         c = ',';
1619                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1620                                 }
1621                                 printf("\n");
1622                         }
1623         }
1624
1625 And finally the item sets.  These include the GO TO tables and, for
1626 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1627 it up a bit.  First the items, with production number and associativity.
1628
1629         static void report_item(struct grammar *g, int itm)
1630         {
1631                 int p = item_prod(itm);
1632                 int dot = item_dot(itm);
1633                 struct production *pr = g->productions[p];
1634                 int i;
1635
1636                 printf("    ");
1637                 prtxt(pr->head->name);
1638                 printf(" ->");
1639                 for (i = 0; i < pr->body_size; i++) {
1640                         printf(" %s", (dot == i ? ". ": ""));
1641                         prtxt(pr->body[i]->name);
1642                 }
1643                 if (dot == pr->body_size)
1644                         printf(" .");
1645                 printf(" [%d]", p);
1646                 if (pr->precedence && dot == pr->body_size)
1647                         printf(" (%d%s)", pr->precedence,
1648                                assoc_names[pr->assoc]);
1649                 if (dot < pr->body_size &&
1650                     pr->body[dot]->precedence) {
1651                         struct symbol *s = pr->body[dot];
1652                         printf(" [%d%s]", s->precedence,
1653                                assoc_names[s->assoc]);
1654                 }
1655                 printf("\n");
1656         }
1657
1658 The LA sets which are (possibly) reported with each item:
1659
1660         static void report_la(struct grammar *g, int lanum)
1661         {
1662                 struct symset la = set_find(g, lanum);
1663                 int i;
1664                 char c = ':';
1665
1666                 printf("        LOOK AHEAD(%d)", lanum);
1667                 for (i = 0; i < la.cnt; i++) {
1668                         printf("%c ", c);
1669                         c = ',';
1670                         prtxt(g->symtab[la.syms[i]]->name);
1671                 }
1672                 printf("\n");
1673         }
1674
1675 Then the go to sets:
1676
1677         static void report_goto(struct grammar *g, struct symset gt)
1678         {
1679                 int i;
1680                 printf("    GOTO:\n");
1681
1682                 for (i = 0; i < gt.cnt; i++) {
1683                         printf("      ");
1684                         prtxt(g->symtab[gt.syms[i]]->name);
1685                         printf(" -> %d\n", gt.data[i]);
1686                 }
1687         }
1688
1689 Now we can report all the item sets complete with items, LA sets, and GO TO.
1690
1691         static void report_itemsets(struct grammar *g)
1692         {
1693                 int s;
1694                 printf("ITEM SETS(%d)\n", g->states);
1695                 for (s = 0; s < g->states; s++) {
1696                         int j;
1697                         struct itemset *is = g->statetab[s];
1698                         printf("  Itemset %d:", s);
1699                         if (is->precedence)
1700                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1701                         printf("\n");
1702                         for (j = 0; j < is->items.cnt; j++) {
1703                                 report_item(g, is->items.syms[j]);
1704                                 if (is->items.data != NO_DATA)
1705                                         report_la(g, is->items.data[j]);
1706                         }
1707                         report_goto(g, is->go_to);
1708                 }
1709         }
1710
1711 ### Reporting conflicts
1712
1713 Conflict detection varies a lot among different analysis levels.  However
1714 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1715 LR1 are also similar as they have FOLLOW or LA sets.
1716
1717 ###### functions
1718
1719         ## conflict functions
1720
1721         static int report_conflicts(struct grammar *g, enum grammar_type type)
1722         {
1723                 int cnt = 0;
1724                 printf("Conflicts:\n");
1725                 if (type < SLR)
1726                         cnt = conflicts_lr0(g, type);
1727                 else
1728                         cnt = conflicts_slr(g, type);
1729                 if (cnt == 0)
1730                         printf(" - no conflicts\n");
1731                 return cnt;
1732         }
1733
1734 LR0 conflicts are any state which have both a reducible item and
1735 a shiftable item, or two reducible items.
1736
1737 LR05 conflicts only occur if two possible reductions exist,
1738 as shifts always over-ride reductions.
1739
1740 ###### conflict functions
1741         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1742         {
1743                 int i;
1744                 int cnt = 0;
1745                 for (i = 0; i < g->states; i++) {
1746                         struct itemset *is = g->statetab[i];
1747                         int last_reduce = -1;
1748                         int prev_reduce = -1;
1749                         int last_shift = -1;
1750                         int j;
1751                         if (!is)
1752                                 continue;
1753                         for (j = 0; j < is->items.cnt; j++) {
1754                                 int itm = is->items.syms[j];
1755                                 int p = item_prod(itm);
1756                                 int bp = item_dot(itm);
1757                                 struct production *pr = g->productions[p];
1758
1759                                 if (bp == pr->body_size) {
1760                                         prev_reduce = last_reduce;
1761                                         last_reduce = j;
1762                                         continue;
1763                                 }
1764                                 if (pr->body[bp]->type == Terminal)
1765                                         last_shift = j;
1766                         }
1767                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1768                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1769                                 report_item(g, is->items.syms[last_shift]);
1770                                 report_item(g, is->items.syms[last_reduce]);
1771                                 cnt++;
1772                         }
1773                         if (prev_reduce >= 0) {
1774                                 printf("  State %d has 2 (or more) reducible items\n", i);
1775                                 report_item(g, is->items.syms[prev_reduce]);
1776                                 report_item(g, is->items.syms[last_reduce]);
1777                                 cnt++;
1778                         }
1779                 }
1780                 return cnt;
1781         }
1782
1783 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1784 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1785 in the source of the look ahead set.
1786
1787 We build two datasets to reflect the "action" table: one which maps
1788 terminals to items where that terminal could be shifted and another
1789 which maps terminals to items that could be reduced when the terminal
1790 is in look-ahead.  We report when we get conflicts between the two.
1791
1792         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1793         {
1794                 int i;
1795                 int cnt = 0;
1796
1797                 for (i = 0; i < g->states; i++) {
1798                         struct itemset *is = g->statetab[i];
1799                         struct symset shifts = INIT_DATASET;
1800                         struct symset reduce = INIT_DATASET;
1801                         int j;
1802                         if (!is)
1803                                 continue;
1804                         /* First collect the shifts */
1805                         for (j = 0; j < is->items.cnt; j++) {
1806                                 unsigned short itm = is->items.syms[j];
1807                                 int p = item_prod(itm);
1808                                 int bp = item_dot(itm);
1809                                 struct production *pr = g->productions[p];
1810                                 struct symbol *s;
1811
1812                                 if (bp >= pr->body_size ||
1813                                     pr->body[bp]->type != Terminal)
1814                                         /* not shiftable */
1815                                         continue;
1816
1817                                 s = pr->body[bp];
1818                                 if (s->precedence && is->precedence)
1819                                         /* Precedence resolves this, so no conflict */
1820                                         continue;
1821
1822                                 if (symset_find(&shifts, s->num) < 0)
1823                                         symset_add(&shifts, s->num, itm);
1824                         }
1825                         /* Now look for reductions and conflicts */
1826                         for (j = 0; j < is->items.cnt; j++) {
1827                                 unsigned short itm = is->items.syms[j];
1828                                 int p = item_prod(itm);
1829                                 int bp = item_dot(itm);
1830                                 struct production *pr = g->productions[p];
1831
1832                                 if (bp < pr->body_size)
1833                                         continue;
1834                                 /* reducible */
1835                                 struct symset la;
1836                                 if (type == SLR)
1837                                         la = g->follow[pr->head->num];
1838                                 else
1839                                         la = set_find(g, is->items.data[j]);
1840                                 int k;
1841                                 for (k = 0; k < la.cnt; k++) {
1842                                         int pos = symset_find(&shifts, la.syms[k]);
1843                                         if (pos >= 0) {
1844                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1845                                                 cnt++;
1846                                                         prtxt(g->symtab[la.syms[k]]->name);
1847                                                 printf(":\n");
1848                                                 report_item(g, shifts.data[pos]);
1849                                                 report_item(g, itm);
1850                                         }
1851                                         pos = symset_find(&reduce, la.syms[k]);
1852                                         if (pos < 0) {
1853                                                 symset_add(&reduce, la.syms[k], itm);
1854                                                 continue;
1855                                         }
1856                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1857                                         prtxt(g->symtab[la.syms[k]]->name);
1858                                         printf(":\n");
1859                                         report_item(g, itm);
1860                                         report_item(g, reduce.data[pos]);
1861                                         cnt++;
1862                                 }
1863                         }
1864                         symset_free(shifts);
1865                         symset_free(reduce);
1866                 }
1867                 return cnt;
1868         }
1869
1870
1871 ## Generating the parser
1872
1873 The exported part of the parser is the `parse_XX` function, where the name
1874 `XX` is based on the name of the parser files.
1875
1876 This takes a `code_node`, a partially initialized `token_config`, and an
1877 optional `FILE` to send tracing to.  The `token_config` gets the list of
1878 known words added and then is used with the `code_node` to initialize the
1879 scanner.
1880
1881 `parse_XX` then calls the library function `parser_run` to actually complete
1882 the parse.  This needs the `states` table and functions to call the various
1883 pieces of code provided in the grammar file, so they are generated first.
1884
1885 ###### parser_generate
1886
1887         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1888                                struct code_node *pre_reduce)
1889         {
1890                 gen_known(f, g);
1891                 gen_non_term(f, g);
1892                 gen_goto(f, g);
1893                 gen_states(f, g);
1894                 gen_reduce(f, g, file, pre_reduce);
1895                 gen_free(f, g);
1896
1897                 fprintf(f, "#line 0 \"gen_parser\"\n");
1898                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1899                         name);
1900                 fprintf(f, "{\n");
1901                 fprintf(f, "\tstruct token_state *tokens;\n");
1902                 fprintf(f, "\tconfig->words_marks = known;\n");
1903                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1904                 fprintf(f, "\ttokens = token_open(code, config);\n");
1905                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1906                 fprintf(f, "\ttoken_close(tokens);\n");
1907                 fprintf(f, "\treturn rv;\n");
1908                 fprintf(f, "}\n\n");
1909         }
1910
1911 ### Known words table
1912
1913 The known words table is simply an array of terminal symbols.
1914 The table of nonterminals used for tracing is a similar array.
1915
1916 ###### functions
1917
1918         static void gen_known(FILE *f, struct grammar *g)
1919         {
1920                 int i;
1921                 fprintf(f, "#line 0 \"gen_known\"\n");
1922                 fprintf(f, "static const char *known[] = {\n");
1923                 for (i = TK_reserved;
1924                      i < g->num_syms && g->symtab[i]->type == Terminal;
1925                      i++)
1926                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1927                                 g->symtab[i]->name.txt);
1928                 fprintf(f, "};\n\n");
1929         }
1930
1931         static void gen_non_term(FILE *f, struct grammar *g)
1932         {
1933                 int i;
1934                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1935                 fprintf(f, "static const char *non_term[] = {\n");
1936                 for (i = TK_reserved;
1937                      i < g->num_syms;
1938                      i++)
1939                         if (g->symtab[i]->type == Nonterminal ||
1940                             g->symtab[i]->isspecial)
1941                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1942                                         g->symtab[i]->name.txt);
1943                 fprintf(f, "};\n\n");
1944         }
1945
1946 ### States and the goto tables.
1947
1948 For each state we record the goto table and details of the reducible
1949 production if there is one.
1950 Some of the details of the reducible production are stored in the
1951 `do_reduce` function to come later.  Here we store the production
1952 number, the body size (useful for stack management), and the resulting
1953 symbol (useful for knowing how to free data later).
1954
1955 The go to table is stored in a simple array of `sym` and corresponding
1956 `state`.
1957
1958 ###### exported types
1959
1960         struct lookup {
1961                 short sym;
1962                 short state;
1963         };
1964         struct state {
1965                 short go_to_cnt;
1966                 const struct lookup * go_to;
1967                 short reduce_prod;
1968                 short reduce_size;
1969                 short reduce_sym;
1970                 short result_size;
1971         };
1972
1973 ###### functions
1974
1975         static void gen_goto(FILE *f, struct grammar *g)
1976         {
1977                 int i;
1978                 fprintf(f, "#line 0 \"gen_goto\"\n");
1979                 for (i = 0; i < g->states; i++) {
1980                         struct symset gt = g->statetab[i]->go_to;
1981                         int j;
1982
1983                         if (gt.cnt == 0)
1984                                 continue;
1985                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1986                                 i);
1987                         for (j = 0; j < gt.cnt; j++)
1988                                 fprintf(f, "\t{ %d, %d },\n",
1989                                         gt.syms[j], gt.data[j]);
1990                         fprintf(f, "};\n");
1991                 }
1992         }
1993
1994         static void gen_states(FILE *f, struct grammar *g)
1995         {
1996                 int i;
1997                 fprintf(f, "#line 0 \"gen_states\"\n");
1998                 fprintf(f, "static const struct state states[] = {\n");
1999                 for (i = 0; i < g->states; i++) {
2000                         struct itemset *is = g->statetab[i];
2001                         int j, prod = -1, prod_len;
2002
2003                         for (j = 0; j < is->items.cnt; j++) {
2004                                 int itm = is->items.syms[j];
2005                                 int p = item_prod(itm);
2006                                 int bp = item_dot(itm);
2007                                 struct production *pr = g->productions[p];
2008
2009                                 if (bp < pr->body_size)
2010                                         continue;
2011                                 /* This is what we reduce - choose longest */
2012                                 if (prod < 0 || prod_len < pr->body_size) {
2013                                         prod = p;
2014                                         prod_len = pr->body_size;
2015                                 }
2016                         }
2017                         if (is->go_to.cnt)
2018                                 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2019                                         i, is->go_to.cnt, i);
2020                         else
2021                                 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2022                         if (prod >= 0) {
2023                                 struct production *pr = g->productions[prod];
2024                                 struct symbol *hd = pr->head;
2025                                 fprintf(f, "%d, %d, %d, ",
2026                                         prod, pr->body_size, pr->head->num);
2027                                 if (hd->struct_name.txt == NULL)
2028                                         fprintf(f, "0 },\n");
2029                                 else
2030                                         fprintf(f, "sizeof(struct %.*s%s) },\n",
2031                                                 hd->struct_name.len,
2032                                                 hd->struct_name.txt,
2033                                                 hd->isref ? "*" : "");
2034                         } else
2035                                 fprintf(f, "-1, -1, -1, -1 },\n");
2036                 }
2037                 fprintf(f, "};\n\n");
2038         }
2039
2040 ### The `do_reduce` function and the code
2041
2042 When the parser engine decides to reduce a production, it calls
2043 `do_reduce` which runs the code that was included with the production,
2044 if any.
2045
2046 This code needs to be able to store data somewhere so we record the size
2047 of the data expected with each state so it can be allocated before
2048 `do_reduce` is called.
2049
2050 In order for the code to access "global" context, we pass in the
2051 "config" pointer that was passed to the parser function.  If the `struct
2052 token_config` is embedded in some larger structure, the reducing code
2053 can access the larger structure using pointer manipulation.  Performing
2054 that pointer manipulation, and any other common code or variables for
2055 all reduce actions, can be provided in code node called "reduce" which
2056 is passed around in `pre_reduce` which you might have already noticed.
2057
2058 The code fragments require translation when written out.  Any `$N` needs
2059 to be converted to a reference either to that buffer (if `$0`) or to the
2060 structure returned by a previous reduction.  These pointers need to be
2061 cast to the appropriate type for each access.  All this is handled in
2062 `gen_code`.
2063
2064 `gen_code` also allows symbol references to contain a '`<`' as in
2065 '`$<2`'.  This is particularly useful for references (or pointers), but
2066 can be used with structures too.  The `<` implies that the value is
2067 being moved out, so the object will not be automatically freed.  It is
2068 equivalent to assigning `NULL` to the pointer or filling a structure
2069 with zeros.
2070
2071 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2072 and possibly a number.  A number by itself (other than zero) selects a
2073 symbol from the body of the production.  A sequence of letters selects
2074 the shortest symbol in the body which contains those letters in the
2075 given order.  If a number follows the letters, then a later occurrence
2076 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2077 attached to the second occurrence of the shortest symbol which contains
2078 an `A` followed by a `B`.  If there is no unique shortest system, or if
2079 the number given is too large, then the symbol reference is not
2080 transformed, and will cause an error when the code is compiled.
2081
2082 ###### functions
2083
2084         static int textchr(struct text t, char c, int s)
2085         {
2086                 int i;
2087                 for (i = s; i < t.len; i++)
2088                         if (t.txt[i] == c)
2089                                 return i;
2090                 return -1;
2091         }
2092
2093         static int subseq_match(char *seq, int slen, struct text name)
2094         {
2095                 int st = 0;
2096                 while (slen > 0) {
2097                         st = textchr(name, *seq, st);
2098                         if (st < 0)
2099                                 return 0;
2100                         slen -= 1;
2101                         seq += 1;
2102                         st += 1;
2103                 }
2104                 return 1;
2105         }
2106
2107         static int choose_sym(char **namep, int len, struct production *p)
2108         {
2109                 char *name = *namep;
2110                 char *nam = name;
2111                 int namlen;
2112                 int n = 0;
2113                 int i, s, slen;
2114                 char c;
2115
2116                 c = *name;
2117                 while (len > 0 &&
2118                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2119                         name += 1;
2120                         len -= 1;
2121                         c = *name;
2122                 }
2123                 namlen = name-nam;
2124                 while (len > 0 && (c >= '0' && c <= '9')) {
2125                         name += 1;
2126                         len -= 1;
2127                         n = n * 10 + (c - '0');
2128                         c = *name;
2129                 }
2130                 if (namlen == 0) {
2131                         if (name == *namep || n > p->body_size)
2132                                 return -1;
2133                         *namep = name;
2134                         return n;
2135                 }
2136                 slen = 0; s = -1;
2137                 for (i = 0; i < p->body_size; i++) {
2138                         if (!subseq_match(nam, namlen, p->body[i]->name))
2139                                 continue;
2140                         if (slen == 0 || p->body[i]->name.len < slen) {
2141                                 s = i;
2142                                 slen = p->body[i]->name.len;
2143                         }
2144                         if (s >= 0 && p->body[i] != p->body[s] &&
2145                             p->body[i]->name.len == p->body[s]->name.len)
2146                                 /* not unique, so s cannot be used */
2147                                 s = -1;
2148                 }
2149                 if (s < 0)
2150                         return -1;
2151                 if (n == 0)
2152                         n = 1;
2153                 for (i = 0; i < p->body_size; i++)
2154                         if (p->body[i] == p->body[s]) {
2155                                 n -= 1;
2156                                 if (n == 0)
2157                                         break;
2158                         }
2159                 if (n > 0 || i == p->body_size)
2160                         return -1;
2161                 *namep = name;
2162                 return i + 1;
2163         }
2164
2165         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2166         {
2167                 char *c;
2168                 char *used = calloc(1, p->body_size);
2169                 int i;
2170
2171                 fprintf(f, "\t\t\t");
2172                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2173                         int n;
2174                         int use = 0;
2175                         if (*c != '$') {
2176                                 fputc(*c, f);
2177                                 if (*c == '\n')
2178                                         fputs("\t\t\t", f);
2179                                 continue;
2180                         }
2181                         c++;
2182                         if (*c == '<') {
2183                                 use = 1;
2184                                 c++;
2185                         }
2186                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2187                         if (n < 0) {
2188                                 fputc('$', f);
2189                                 if (use)
2190                                         fputc('<', f);
2191                                 fputc(*c, f);
2192                                 continue;
2193                         }
2194                         if (n == 0)
2195                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2196                                         p->head->struct_name.len,
2197                                         p->head->struct_name.txt,
2198                                         p->head->isref ? "*":"");
2199                         else if (p->body[n-1]->type == Terminal)
2200                                 fprintf(f, "(*(struct token *)body[%d])",
2201                                         n-1);
2202                         else if (p->body[n-1]->struct_name.txt == NULL)
2203                                 fprintf(f, "$%d", n);
2204                         else {
2205                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2206                                         p->body[n-1]->struct_name.len,
2207                                         p->body[n-1]->struct_name.txt,
2208                                         p->body[n-1]->isref ? "*":"", n-1);
2209                                 used[n-1] = use;
2210                         }
2211                         c -= 1;
2212                 }
2213                 fputs("\n", f);
2214                 for (i = 0; i < p->body_size; i++) {
2215                         if (p->body[i]->struct_name.txt &&
2216                             used[i]) {
2217                                 // assume this has been copied out
2218                                 if (p->body[i]->isref)
2219                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2220                                 else
2221                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2222                         }
2223                 }
2224                 free(used);
2225         }
2226
2227 ###### functions
2228
2229         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2230                                struct code_node *pre_reduce)
2231         {
2232                 int i;
2233                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2234                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2235                 fprintf(f, "{\n");
2236                 fprintf(f, "\tint ret_size = 0;\n");
2237                 if (pre_reduce)
2238                         code_node_print(f, pre_reduce, file);
2239
2240                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2241                 fprintf(f, "\tswitch(prod) {\n");
2242                 for (i = 0; i < g->production_count; i++) {
2243                         struct production *p = g->productions[i];
2244                         fprintf(f, "\tcase %d:\n", i);
2245
2246                         if (p->code.txt) {
2247                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2248                                 gen_code(p, f, g);
2249                         }
2250
2251                         if (p->head->struct_name.txt)
2252                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2253                                         p->head->struct_name.len,
2254                                         p->head->struct_name.txt,
2255                                         p->head->isref ? "*":"");
2256
2257                         fprintf(f, "\t\tbreak;\n");
2258                 }
2259                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2260         }
2261
2262 ### `do_free`
2263
2264 As each non-terminal can potentially cause a different type of data
2265 structure to be allocated and filled in, we need to be able to free it when
2266 done.
2267
2268 It is particularly important to have fine control over freeing during error
2269 recovery where individual stack frames might need to be freed.
2270
2271 For this, the grammar author is required to defined a `free_XX` function for
2272 each structure that is used by a non-terminal.  `do_free` will call whichever
2273 is appropriate given a symbol number, and will call `free` (as is
2274 appropriate for tokens) on any terminal symbol.
2275
2276 ###### functions
2277
2278         static void gen_free(FILE *f, struct grammar *g)
2279         {
2280                 int i;
2281
2282                 fprintf(f, "#line 0 \"gen_free\"\n");
2283                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2284                 fprintf(f, "{\n");
2285                 fprintf(f, "\tif (!asn) return;\n");
2286                 fprintf(f, "\tif (sym < %d", g->first_nonterm);
2287                 /* Need to handle special terminals too */
2288                 for (i = 0; i < g->num_syms; i++) {
2289                         struct symbol *s = g->symtab[i];
2290                         if (i >= g->first_nonterm && s->type == Terminal &&
2291                             s->isspecial)
2292                                 fprintf(f, " || sym == %d", s->num);
2293                 }
2294                 fprintf(f, ") {\n");
2295                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2296                 fprintf(f, "\tswitch(sym) {\n");
2297
2298                 for (i = 0; i < g->num_syms; i++) {
2299                         struct symbol *s = g->symtab[i];
2300                         if (!s ||
2301                             s->type != Nonterminal ||
2302                             s->struct_name.txt == NULL)
2303                                 continue;
2304
2305                         fprintf(f, "\tcase %d:\n", s->num);
2306                         if (s->isref) {
2307                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2308                                         s->struct_name.len,
2309                                         s->struct_name.txt);
2310                                 fprintf(f, "\t\tfree(asn);\n");
2311                         } else
2312                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2313                                         s->struct_name.len,
2314                                         s->struct_name.txt);
2315                         fprintf(f, "\t\tbreak;\n");
2316                 }
2317                 fprintf(f, "\t}\n}\n\n");
2318         }
2319
2320 ## The main routine.
2321
2322 There are three key parts to the "main" function of parsergen: processing
2323 the arguments, loading the grammar file, and dealing with the grammar.
2324
2325 ### Argument processing.
2326
2327 Command line options allow the selection of analysis mode, name of output
2328 file, and whether or not a report should be generated.  By default we create
2329 a report only if no code output was requested.
2330
2331 The `parse_XX` function name uses the basename of the output file which
2332 should not have a suffix (`.c`).  `.c` is added to the given name for the
2333 code, and `.h` is added for the header (if header text is specifed in the
2334 grammar file).
2335
2336 ###### includes
2337         #include <getopt.h>
2338
2339 ###### declarations
2340         static const struct option long_options[] = {
2341                 { "LR0",        0, NULL, '0' },
2342                 { "LR05",       0, NULL, '5' },
2343                 { "SLR",        0, NULL, 'S' },
2344                 { "LALR",       0, NULL, 'L' },
2345                 { "LR1",        0, NULL, '1' },
2346                 { "tag",        1, NULL, 't' },
2347                 { "report",     0, NULL, 'R' },
2348                 { "output",     1, NULL, 'o' },
2349                 { NULL,         0, NULL, 0   }
2350         };
2351         const char *options = "05SL1t:Ro:";
2352
2353 ###### process arguments
2354         int opt;
2355         char *outfile = NULL;
2356         char *infile;
2357         char *name;
2358         char *tag = NULL;
2359         int report = 1;
2360         enum grammar_type type = LR05;
2361         while ((opt = getopt_long(argc, argv, options,
2362                                   long_options, NULL)) != -1) {
2363                 switch(opt) {
2364                 case '0':
2365                         type = LR0; break;
2366                 case '5':
2367                         type = LR05; break;
2368                 case 'S':
2369                         type = SLR; break;
2370                 case 'L':
2371                         type = LALR; break;
2372                 case '1':
2373                         type = LR1; break;
2374                 case 'R':
2375                         report = 2; break;
2376                 case 'o':
2377                         outfile = optarg; break;
2378                 case 't':
2379                         tag = optarg; break;
2380                 default:
2381                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2382                         exit(1);
2383                 }
2384         }
2385         if (optind < argc)
2386                 infile = argv[optind++];
2387         else {
2388                 fprintf(stderr, "No input file given\n");
2389                 exit(1);
2390         }
2391         if (outfile && report == 1)
2392                 report = 0;
2393         name = outfile;
2394         if (name && strchr(name, '/'))
2395                 name = strrchr(name, '/')+1;
2396
2397         if (optind < argc) {
2398                 fprintf(stderr, "Excess command line arguments\n");
2399                 exit(1);
2400         }
2401
2402 ### Loading the grammar file
2403
2404 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2405 map it.
2406
2407 Once we have extracted the code (with `mdcode`) we expect to find three
2408 or four sections: header, code, grammar, reduce.
2409 Anything else that is not excluded by the `--tag` option is an error.
2410
2411 "header", "code", and "reduce" are optional, though it is hard to build
2412 a working parser without the first two.  "grammar" must be provided.
2413
2414 ###### includes
2415         #include <fcntl.h>
2416         #include <sys/mman.h>
2417         #include <errno.h>
2418
2419 ###### functions
2420         static int errs;
2421         static void pr_err(char *msg)
2422         {
2423                 errs++;
2424                 fprintf(stderr, "%s\n", msg);
2425         }
2426
2427 ###### load file
2428         struct section *table;
2429         int fd;
2430         int len;
2431         char *file;
2432         fd = open(infile, O_RDONLY);
2433         if (fd < 0) {
2434                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2435                         infile, strerror(errno));
2436                 exit(1);
2437         }
2438         len = lseek(fd, 0, 2);
2439         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2440         table = code_extract(file, file+len, pr_err);
2441
2442         struct code_node *hdr = NULL;
2443         struct code_node *code = NULL;
2444         struct code_node *gram = NULL;
2445         struct code_node *pre_reduce = NULL;
2446         for (s = table; s; s = s->next) {
2447                 struct text sec = s->section;
2448                 if (tag && !strip_tag(&sec, tag))
2449                         continue;
2450                 if (text_is(sec, "header"))
2451                         hdr = s->code;
2452                 else if (text_is(sec, "code"))
2453                         code = s->code;
2454                 else if (text_is(sec, "grammar"))
2455                         gram = s->code;
2456                 else if (text_is(sec, "reduce"))
2457                         pre_reduce = s->code;
2458                 else {
2459                         fprintf(stderr, "Unknown content section: %.*s\n",
2460                                 s->section.len, s->section.txt);
2461                         rv |= 2;
2462                 }
2463         }
2464
2465 ### Processing the input
2466
2467 As we need to append an extention to a filename and then open it for
2468 writing, and we need to do this twice, it helps to have a separate function.
2469
2470 ###### functions
2471
2472         static FILE *open_ext(char *base, char *ext)
2473         {
2474                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2475                 FILE *f;
2476                 strcat(strcpy(fn, base), ext);
2477                 f = fopen(fn, "w");
2478                 free(fn);
2479                 return f;
2480         }
2481
2482 If we can read the grammar, then we analyse and optionally report on it.  If
2483 the report finds conflicts we will exit with an error status.
2484
2485 ###### process input
2486         struct grammar *g = NULL;
2487         if (gram == NULL) {
2488                 fprintf(stderr, "No grammar section provided\n");
2489                 rv |= 2;
2490         } else {
2491                 g = grammar_read(gram);
2492                 if (!g) {
2493                         fprintf(stderr, "Failure to parse grammar\n");
2494                         rv |= 2;
2495                 }
2496         }
2497         if (g) {
2498                 grammar_analyse(g, type);
2499                 if (report)
2500                         if (grammar_report(g, type))
2501                                 rv |= 1;
2502         }
2503
2504 If a "headers" section is defined, we write it out and include a declaration
2505 for the `parse_XX` function so it can be used from separate code.
2506
2507         if (rv == 0 && hdr && outfile) {
2508                 FILE *f = open_ext(outfile, ".h");
2509                 if (f) {
2510                         code_node_print(f, hdr, infile);
2511                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2512                                 name);
2513                         fclose(f);
2514                 } else {
2515                         fprintf(stderr, "Cannot create %s.h\n",
2516                                 outfile);
2517                         rv |= 4;
2518                 }
2519         }
2520
2521 And if all goes well and an output file was provided, we create the `.c`
2522 file with the code section (if any) and the parser tables and function.
2523
2524         if (rv == 0 && outfile) {
2525                 FILE *f = open_ext(outfile, ".c");
2526                 if (f) {
2527                         if (code)
2528                                 code_node_print(f, code, infile);
2529                         gen_parser(f, g, infile, name, pre_reduce);
2530                         fclose(f);
2531                 } else {
2532                         fprintf(stderr, "Cannot create %s.c\n",
2533                                 outfile);
2534                         rv |= 4;
2535                 }
2536         }
2537
2538 And that about wraps it up.  We need to set the locale so that UTF-8 is
2539 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2540
2541 ###### File: parsergen.mk
2542         parsergen : parsergen.o libscanner.o libmdcode.o
2543                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2544
2545 ###### includes
2546         #include <locale.h>
2547
2548 ###### main
2549
2550         int main(int argc, char *argv[])
2551         {
2552                 struct section *s;
2553                 int rv = 0;
2554
2555                 setlocale(LC_ALL,"");
2556
2557                 ## process arguments
2558                 ## load file
2559                 ## process input
2560
2561                 return rv;
2562         }
2563
2564 ## The SHIFT/REDUCE parser
2565
2566 Having analysed the grammar and generated all the tables, we only need
2567 the shift/reduce engine to bring it all together.
2568
2569 ### Goto table lookup
2570
2571 The parser generator has nicely provided us with goto tables sorted by
2572 symbol number.  We need a binary search function to find a symbol in the
2573 table.
2574
2575 ###### parser functions
2576
2577         static int search(const struct state *l, int sym)
2578         {
2579                 int lo = 0;
2580                 int hi = l->go_to_cnt;
2581
2582                 if (hi == 0)
2583                         return -1;
2584                 while (lo + 1 < hi) {
2585                         int mid = (lo + hi) / 2;
2586                         if (l->go_to[mid].sym <= sym)
2587                                 lo = mid;
2588                         else
2589                                 hi = mid;
2590                 }
2591                 if (l->go_to[lo].sym == sym)
2592                         return l->go_to[lo].state;
2593                 else
2594                         return -1;
2595         }
2596
2597 ### Memory allocation
2598
2599 The `scanner` returns tokens in a local variable - we want them in allocated
2600 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2601 make an allocated copy as required.
2602
2603 ###### parser includes
2604         #include <memory.h>
2605
2606 ###### parser functions
2607
2608         static struct token *tok_copy(struct token tk)
2609         {
2610                 struct token *new = malloc(sizeof(*new));
2611                 *new = tk;
2612                 return new;
2613         }
2614
2615 ### The state stack.
2616
2617 The core data structure for the parser is the stack.  This tracks all
2618 the symbols that have been recognised or partially recognised.
2619
2620 The stack usually won't grow very large - maybe a few tens of entries.
2621 So we dynamically grow an array as required but never bother to
2622 shrink it down again.
2623
2624 We keep the stack as two separate allocations.  One, `asn_stack`, stores
2625 the "abstract syntax nodes" which are created by each reduction.  When
2626 we call `do_reduce` we need to pass an array of the `asn`s of the body
2627 of the production, and by keeping a separate `asn` stack, we can just
2628 pass a pointer into this stack.
2629
2630 The other allocation stores all other stack fields of which there are
2631 two.  The `state` is the most important one and guides the parsing
2632 process.  The `sym` is nearly unnecessary as it is implicit in the
2633 state.  However when we want to free entries from the `asn_stack`, it
2634 helps to know what type they are so we can call the right freeing
2635 function.  The symbol leads us to the right free function through
2636 `do_free`.
2637
2638 The stack is most properly seen as alternating states and symbols -
2639 states, like the 'DOT' in items, are between symbols.  Each frame in
2640 our stack holds a state and the symbol that was before it.  The
2641 bottom of stack holds the start state but no symbol, as nothing came
2642 before the beginning.  As we need to store some value, `TK_eof` is used
2643 to mark the beginning of the file as well as the end.
2644
2645 ###### parser functions
2646
2647         struct parser {
2648                 struct frame {
2649                         short state;
2650                         short sym;
2651                 } *stack;
2652                 void **asn_stack;
2653                 int stack_size;
2654                 int tos;
2655
2656                 ## parser state
2657         };
2658
2659 #### Shift and pop
2660
2661 Two operations are needed on the stack - shift (which is like push) and pop.
2662
2663 Shift applies not only to terminals but also to non-terminals.  When we
2664 reduce a production we will pop off frames corresponding to the body
2665 symbols, then push on a frame for the head of the production.  This last
2666 is exactly the same process as shifting in a terminal so we use the same
2667 function for both.  In both cases we provide the symbol.  The state is
2668 deduced from the current top-of-stack state and the new symbol.
2669
2670 To simplify other code we arrange for `shift` to fail if there is no `goto`
2671 state for the symbol.  This is useful in basic parsing due to our design
2672 that we shift when we can, and reduce when we cannot.  So the `shift`
2673 function reports if it could.
2674
2675 `shift` is also used to push state zero onto the stack, so if the
2676 stack is empty, it always chooses zero as the next state.
2677
2678 So `shift` finds the next state.  If that succeeds it extends the
2679 allocations if needed and pushes all the information onto the stacks.
2680
2681 An extra complication is added to `shift` by the `EOL` token.  This
2682 token must be generated when a `NEWLINE` is seen, but an `EOL` is
2683 expected.  When this happens, the `NEWLINE` is NOT consumed, so multiple
2684 EOL can appear before a NEWLINE.  To indicate that the token was shifted
2685 by not consumed, `shift` can return the special value `2`.  The token
2686 number for `EOL` cannot be statically declared, so when the parser
2687 starts we need to look through the array of non-terminals to find the
2688 EOL.
2689
2690 ###### parser state
2691         int tk_eol;
2692
2693 ###### find eol
2694         p.tk_eol = 0;
2695         while (strcmp(non_term[p.tk_eol], "EOL") != 0)
2696                 p.tk_eol += 1;
2697         p.tk_eol += TK_reserved + config->known_count;
2698
2699
2700 ###### parser functions
2701
2702         static int shift(struct parser *p,
2703                          short sym, void *asn,
2704                          const struct state states[])
2705         {
2706                 struct frame next = {0};
2707                 int ret;
2708                 int newstate = p->tos
2709                         ? search(&states[p->stack[p->tos-1].state],
2710                                  sym)
2711                         : 0;
2712                 if (newstate >= 0)
2713                         ret = 1;
2714                 else if (sym != TK_newline)
2715                         return 0;
2716                 else {
2717                         // have a NEWLINE, might need an EOL
2718                         sym = p->tk_eol;
2719                         newstate = p->tos
2720                                 ? search(&states[p->stack[p->tos-1].state],
2721                                          sym)
2722                                 : 0;
2723                         if (newstate < 0)
2724                                 return 0;
2725                         ret = 2;
2726                         asn = tok_copy(*(struct token*)asn);
2727                 }
2728
2729                 if (p->tos >= p->stack_size) {
2730                         p->stack_size += 10;
2731                         p->stack = realloc(p->stack, p->stack_size
2732                                            * sizeof(p->stack[0]));
2733                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2734                                            * sizeof(p->asn_stack[0]));
2735                 }
2736                 next.sym = sym;
2737                 next.state = newstate;
2738
2739                 p->stack[p->tos] = next;
2740                 p->asn_stack[p->tos] = asn;
2741                 p->tos++;
2742                 return ret;
2743         }
2744
2745 `pop` primarily moves the top of stack (`tos`) back down the required
2746 amount and frees any `asn` entries that need to be freed.  It is called
2747 _after_ we reduce a production, just before we `shift` the nonterminal
2748 in.
2749
2750 ###### parser functions
2751
2752         static void pop(struct parser *p, int num,
2753                         void(*do_free)(short sym, void *asn))
2754         {
2755                 int i;
2756
2757                 p->tos -= num;
2758                 for (i = 0; i < num; i++)
2759                         do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2760         }
2761
2762 ### The heart of the parser.
2763
2764 Now we have the parser.  For each token we might shift it, trigger a
2765 reduction, or start error handling.  2D tokens (IN, OUT, NEWLINE, EOL)
2766 might also be ignored.  Ignoring tokens is combined with shifting.
2767
2768 ###### parser vars
2769
2770         struct parser p = { 0 };
2771         struct token *tk = NULL;
2772         int accepted = 0;
2773
2774 ###### heart of parser
2775
2776         shift(&p, TK_eof, NULL, states);
2777         while (!accepted && p.tos > 0) {
2778                 struct frame *tos = &p.stack[p.tos-1];
2779                 if (!tk)
2780                         tk = tok_copy(token_next(tokens));
2781                 parser_trace(trace, &p,
2782                              tk, states, non_term, config->known_count);
2783
2784                 ## try shift or ignore
2785                 ## try reduce
2786                 ## handle error
2787         }
2788
2789 Indents are ignored unless they can be shifted onto the stack
2790 immediately.  The end of an indented section - the OUT token - is
2791 ignored precisely when the indent was ignored.  To keep track of this we
2792 need a small stack of flags, which is easily stored as bits in an
2793 `unsigned long`.  This will never overflow and the scanner only allows
2794 20 levels of indentation.
2795
2796 ###### parser state
2797         unsigned long ignored_indents;
2798
2799 NEWLINE/EOL is ignored when in an indented section of text which was not
2800 explicitly expected by the grammar.  So if the most recent indent is
2801 ignored, so is any EOL token.
2802
2803 For other tokens, we shift the next token if that is possible, otherwise
2804 we try to reduce a production.
2805
2806 ###### try shift or ignore
2807
2808         if ((tk->num == TK_newline || tk->num == TK_out) &&
2809             (p.ignored_indents & 1)) {
2810                 /* indented, so ignore OUT and NEWLINE */
2811                 if (tk->num == TK_out)
2812                         p.ignored_indents >>= 1;
2813                 free(tk);
2814                 tk = NULL;
2815                 parser_trace_action(trace, "Ignore");
2816                 continue;
2817         }
2818
2819         switch (shift(&p, tk->num, tk, states)) {
2820         case 1:
2821                 if (tk->num == TK_out)
2822                         p.ignored_indents >>= 1;
2823                 if (tk->num == TK_in)
2824                         p.ignored_indents <<= 1;
2825
2826                 tk = NULL;
2827                 /* fallthrough */
2828         case 2:
2829                 parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
2830                 ## did shift
2831                 continue;
2832         }
2833
2834         if (tk->num == TK_in) {
2835                 /* No indent expected here, so ignore IN */
2836                 free(tk);
2837                 tk = NULL;
2838                 p.ignored_indents <<= 1;
2839                 p.ignored_indents |= 1;
2840                 parser_trace_action(trace, "Ignore");
2841                 continue;
2842         }
2843
2844 We have already discussed the bulk of the handling of a "reduce" action,
2845 with the `pop()` and `shift()` functions doing much of the work.  There
2846 is a little more complexity needed to manage storage for the asn (Abstract
2847 Syntax Node), and also a test of whether the reduction is permitted.
2848
2849 When we try to shift the result of reducing production-zero, it will
2850 fail because there is no next state.  In this case the asn will not have
2851 been stored on the stack, so it get stored in the `ret` variable, and we
2852 report that that input has been accepted.
2853
2854 ###### parser vars
2855
2856         void *ret = NULL;
2857
2858 ###### try reduce
2859
2860         if (states[tos->state].reduce_prod >= 0) {
2861                 void **body;
2862                 void *res;
2863                 const struct state *nextstate = &states[tos->state];
2864                 int prod = nextstate->reduce_prod;
2865                 int size = nextstate->reduce_size;
2866                 int res_size = nextstate->result_size;
2867
2868                 body = p.asn_stack + (p.tos - size);
2869                 res = res_size ? calloc(1, res_size) : NULL;
2870                 res_size = do_reduce(prod, body, config, res);
2871                 if (res_size != nextstate->result_size)
2872                         abort();
2873                 pop(&p, size, do_free);
2874                 if (!shift(&p, nextstate->reduce_sym, res, states)) {
2875                         accepted = 1;
2876                         ret = res;
2877                         parser_trace_action(trace, "Accept");
2878                 } else
2879                         parser_trace_action(trace, "Reduce");
2880                 continue;
2881         }
2882
2883 If we can neither shift nor reduce we have an error to handle.  There
2884 are two possible responses to an error: we can pop single frames off the
2885 stack until we find a frame that can shift `TK_error`, or we can discard
2886 the current look-ahead symbol.
2887
2888 When we first see an error we do the first of these and set a flag to
2889 record that we are processing an error.  If the normal shift/reduce
2890 tests fail to find that anything can be done from that state, we start
2891 dropping tokens until either we manage to shift one, or reach end-of-file.
2892
2893 ###### parser vars
2894
2895         int in_err = 0;
2896
2897 ###### did shift
2898
2899         in_err = 0;
2900
2901 ###### handle error
2902
2903         if (in_err) {
2904                 parser_trace_action(trace, "DISCARD");
2905                 if (tk->num == TK_eof)
2906                         break;
2907                 free(tk);
2908                 tk = NULL;
2909         } else {
2910                 struct token *err_tk;
2911
2912                 parser_trace_action(trace, "ERROR");
2913
2914                 err_tk = tok_copy(*tk);
2915                 while (p.tos > 0 && shift(&p, TK_error, err_tk, states) == 0)
2916                         // discard this state
2917                         pop(&p, 1, do_free);
2918                 if (p.tos == 0) {
2919                         free(err_tk);
2920                         // no state accepted TK_error
2921                         break;
2922                 }
2923                 in_err = 1;
2924         }
2925
2926 ###### parser includes
2927         #include "parser.h"
2928
2929 ###### parser_run
2930
2931         void *parser_run(struct token_state *tokens,
2932                          const struct state states[],
2933                          int (*do_reduce)(int, void**, struct token_config*, void*),
2934                          void (*do_free)(short, void*),
2935                          FILE *trace, const char *non_term[],
2936                          struct token_config *config)
2937         {
2938                 ## parser vars
2939
2940                 ## find eol
2941
2942                 ## heart of parser
2943
2944                 free(tk);
2945                 pop(&p, p.tos, do_free);
2946                 free(p.asn_stack);
2947                 free(p.stack);
2948                 return ret;
2949         }
2950
2951 ###### exported functions
2952         void *parser_run(struct token_state *tokens,
2953                          const struct state states[],
2954                          int (*do_reduce)(int, void**, struct token_config*, void*),
2955                          void (*do_free)(short, void*),
2956                          FILE *trace, const char *non_term[],
2957                          struct token_config *config);
2958
2959 ### Tracing
2960
2961 Being able to visualize the parser in action can be invaluable when
2962 debugging the parser code, or trying to understand how the parse of a
2963 particular grammar progresses.  The stack contains all the important
2964 state, so just printing out the stack every time around the parse loop
2965 can make it possible to see exactly what is happening.
2966
2967 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2968 are easily deduced from the change between consecutive lines, and the
2969 details of each state can be found by cross referencing the states list
2970 in the stack with the "report" that parsergen can generate.
2971
2972 For terminal symbols, we just dump the token.  For non-terminals we
2973 print the name of the symbol.  The look ahead token is reported at the
2974 end inside square brackets.
2975
2976 ###### parser functions
2977
2978         static char *reserved_words[] = {
2979                 [TK_error]        = "ERROR",
2980                 [TK_in]           = "IN",
2981                 [TK_out]          = "OUT",
2982                 [TK_newline]      = "NEWLINE",
2983                 [TK_eof]          = "$eof",
2984         };
2985
2986         void parser_trace(FILE *trace, struct parser *p,
2987                           struct token *tk, const struct state states[],
2988                           const char *non_term[], int knowns)
2989         {
2990                 int i;
2991                 if (!trace)
2992                         return;
2993                 for (i = 0; i < p->tos; i++) {
2994                         struct frame *f = &p->stack[i];
2995                         if (i) {
2996                                 int sym = f->sym;
2997                                 if (sym < TK_reserved &&
2998                                     reserved_words[sym] != NULL)
2999                                         fputs(reserved_words[sym], trace);
3000                                 else if (sym < TK_reserved + knowns) {
3001                                         struct token *t = p->asn_stack[i];
3002                                         text_dump(trace, t->txt, 20);
3003                                 } else
3004                                         fputs(non_term[sym - TK_reserved - knowns],
3005                                               trace);
3006                                 fputs(" ", trace);
3007                         }
3008                         fprintf(trace, "(%d) ", f->state);
3009                 }
3010                 fprintf(trace, "[");
3011                 if (tk->num < TK_reserved &&
3012                     reserved_words[tk->num] != NULL)
3013                         fputs(reserved_words[tk->num], trace);
3014                 else
3015                         text_dump(trace, tk->txt, 20);
3016                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
3017         }
3018
3019         void parser_trace_action(FILE *trace, char *action)
3020         {
3021                 if (trace)
3022                         fprintf(trace, " - %s\n", action);
3023         }
3024
3025 # A Worked Example
3026
3027 The obvious example for a parser is a calculator.
3028
3029 As `scanner` provides number parsing function using `libgmp` it is not much
3030 work to perform arbitrary rational number calculations.
3031
3032 This calculator takes one expression, or an equality test, per line.
3033 The results are printed and if any equality test fails, the program
3034 exits with an error.
3035
3036 ###### File: parsergen.mk
3037         calc.c calc.h : parsergen parsergen.mdc
3038                 ./parsergen --tag calc -o calc parsergen.mdc
3039         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
3040                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3041         calctest : calc
3042                 ./calc parsergen.mdc
3043         demos :: calctest
3044
3045 # calc: header
3046
3047         #include "parse_number.h"
3048         // what do we use for a demo-grammar?  A calculator of course.
3049         struct number {
3050                 mpq_t val;
3051                 char tail[2];
3052                 int err;
3053         };
3054
3055 # calc: code
3056
3057         #include <stdlib.h>
3058         #include <unistd.h>
3059         #include <fcntl.h>
3060         #include <sys/mman.h>
3061         #include <stdio.h>
3062         #include <malloc.h>
3063         #include <gmp.h>
3064         #include <string.h>
3065         #include "mdcode.h"
3066         #include "scanner.h"
3067         #include "parser.h"
3068
3069         #include "calc.h"
3070
3071         static void free_number(struct number *n)
3072         {
3073                 mpq_clear(n->val);
3074                 free(n);
3075         }
3076
3077         static int text_is(struct text t, char *s)
3078         {
3079                 return (strlen(s) == t.len &&
3080                         strncmp(s, t.txt, t.len) == 0);
3081         }
3082
3083         int main(int argc, char *argv[])
3084         {
3085                 int fd = open(argv[1], O_RDONLY);
3086                 int len = lseek(fd, 0, 2);
3087                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3088                 struct section *table = code_extract(file, file+len, NULL);
3089                 struct section *s;
3090                 struct token_config config = {
3091                         .ignored = (1 << TK_line_comment)
3092                                  | (1 << TK_in)
3093                                  | (1 << TK_out),
3094                         .number_chars = ".,_+-",
3095                         .word_start = "",
3096                         .word_cont = "",
3097                 };
3098                 for (s = table; s; s = s->next)
3099                         if (text_is(s->section, "example: input"))
3100                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3101                 while (table) {
3102                         struct section *t = table->next;
3103                         code_free(table->code);
3104                         free(table);
3105                         table = t;
3106                 }
3107                 exit(0);
3108         }
3109
3110 # calc: grammar
3111
3112         $LEFT + -
3113         $LEFT * / //
3114
3115         Session -> Session Line
3116                 | Line
3117
3118         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3119                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3120                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3121                                         mpf_clear(fl);
3122                                         }
3123                                      }$
3124                 | Expression = Expression NEWLINE ${
3125                         if (mpq_equal($1.val, $3.val))
3126                                 gmp_printf("Both equal %Qd\n", $1.val);
3127                         else {
3128                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3129                                         $1.val, $3.val);
3130                                 exit(1);
3131                         }
3132                 }$
3133                 | NEWLINE ${ printf("Blank line\n"); }$
3134                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3135
3136         $number
3137         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3138                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3139                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3140                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3141                 | Expression // Expression ${ {
3142                         mpz_t z0, z1, z2;
3143                         mpq_init($0.val);
3144                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3145                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3146                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3147                         mpz_tdiv_q(z0, z1, z2);
3148                         mpq_set_z($0.val, z0);
3149                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3150                 } }$
3151                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3152                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3153
3154 # example: input
3155
3156         355/113
3157         3.1415926535 - 355/113
3158         2 + 4 * 5
3159         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3160         10 * 9 / 2
3161         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3162
3163         355//113
3164
3165         error