ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator with 2D support #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 "2D support" means that indentation and line breaks can be significant.
   8
   9 There are several distinct sections.
  10
  11 1. `grammar_read` will read a grammar from a literate-code file and
  12    store the productions, symbols, and code fragments.
  13 2. `grammar_analyse` will take that grammar and build LR parsing
  14    states and look-ahead tables.
  15 3. `grammar_report` will present the details of the analysis
  16    in a readable format and will report any conflicts.
  17 4. `parser_generate` will write out C code files with various
  18    tables and with the code fragments arranged in useful places.
  19 5. `parser_run` is a library function intended to be linked together
  20    with the generated parser tables to complete the implementation of
  21    a parser.
  22 6. Finally `calc` is a test grammar for a simple calculator.  The
  23    `parsergen` program built from the C code in this file can extract
  24    that grammar directly from this file and process it.
  25
  26 ###### File: parsergen.c
  27         #include <unistd.h>
  28         #include <stdlib.h>
  29         #include <stdio.h>
  30         ## includes
  31         ## forward declarations
  32         ## declarations
  33         ## functions
  34         ## grammar_read
  35         ## grammar_analyse
  36         ## grammar_report
  37         ## parser_generate
  38         ## main
  39 ###### File: parser.h
  40         ## exported types
  41         ## exported functions
  42 ###### File: libparser.c
  43         #include <unistd.h>
  44         #include <stdlib.h>
  45         #include <stdio.h>
  46         ## parser includes
  47         ## parser functions
  48         ## parser_run
  49 ###### File: parsergen.mk
  50         CFLAGS += -Wall -g
  51         all :: parsergen calc
  52         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  53                 ./md2c parsergen.mdc
  54
  55 ## Reading the grammar
  56
  57 The grammar must be stored in a markdown literate code file as parsed by
  58 "[mdcode][]".  It may have up to four top level (i.e.  unreferenced)
  59 sections: `header`, `code`, `grammar` and `reduce`.  The first two, if
  60 present, will be literally copied into the generated `.c` and `.h`
  61 files.  The third is required and contains the grammar which is
  62 tokenised with "[scanner][]".  `reduce` can provide variable
  63 declarations and common intialization code for all the reduction code
  64 fragments in the grammar.
  65
  66 If the `--tag` option is given, then any top level header that doesn't
  67 start with the tag is ignored, and the tag is striped from the rest.  So
  68 `--tag Foo` means that the four sections expected are `Foo: header`,
  69 `Foo: code`, `Foo: grammar` and `Foo: reduce`.  The tag `calc` is used
  70 to extract the sample calculator from this file.
  71
  72 [mdcode]: mdcode.html
  73 [scanner]: scanner.html
  74
  75 ###### includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 ###### parser includes
  80         #include "mdcode.h"
  81         #include "scanner.h"
  82
  83 The grammar contains production sets, precedence/associativity
  84 declarations, general terminal declarations, and data type declarations.
  85 These are all parsed with _ad hoc_ parsing as we don't have a parser
  86 generator yet.
  87
  88 The precedence and associativity can be set for each production, and
  89 can be inherited from symbols.  The data type (either structure or a
  90 reference to a structure) is potentially defined for each non-terminal
  91 and describes what C structure is used to store information about each
  92 symbol.
  93
  94 ###### declarations
  95         enum assoc {Left, Right, Non};
  96         char *assoc_names[] = {"Left","Right","Non"};
  97
  98         struct symbol {
  99                 struct text struct_name;
 100                 int isref;
 101                 enum assoc assoc;
 102                 unsigned short precedence;
 103                 ## symbol fields
 104         };
 105         struct production {
 106                 unsigned short precedence;
 107                 enum assoc assoc;
 108                 ## production fields
 109         };
 110         struct grammar {
 111                 ## grammar fields
 112         };
 113
 114 The strings reported by `mdcode` and `scanner` are `struct text` which
 115 have length rather than being nul terminated.  To help with printing and
 116 comparing we define `text_is` and `prtxt`, which should possibly go in
 117 `mdcode`.  `scanner` does provide `text_dump` which is useful for
 118 strings which might contain control characters.
 119
 120 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 121 because that is what we need to detect tags.
 122
 123 ###### functions
 124         static int text_is(struct text t, char *s)
 125         {
 126                 return (strlen(s) == t.len &&
 127                         strncmp(s, t.txt, t.len) == 0);
 128         }
 129         static void prtxt(struct text t)
 130         {
 131                 printf("%.*s", t.len, t.txt);
 132         }
 133
 134         static int strip_tag(struct text *t, char *tag)
 135         {
 136                 int skip = strlen(tag) + 1;
 137                 if (skip >= t->len ||
 138                     strncmp(t->txt, tag, skip-1) != 0 ||
 139                     t->txt[skip-1] != ':')
 140                         return 0;
 141                 while (skip < t->len && t->txt[skip] == ' ')
 142                         skip++;
 143                 t->len -= skip;
 144                 t->txt += skip;
 145                 return 1;
 146         }
 147
 148 ### Symbols
 149
 150 Productions are comprised primarily of symbols - terminal and
 151 non-terminal.  We do not make a syntactic distinction between the two,
 152 though non-terminals must be identifiers.  Non-terminal symbols are
 153 those which appear in the head of a production, terminal symbols are
 154 those which don't.  There are also "virtual" symbols used for precedence
 155 marking discussed later, and sometimes we won't know what type a symbol
 156 is yet.
 157
 158 To help with code safety it is possible to declare the terminal symbols.
 159 If this is done, then any symbol used in a production that does not
 160 appear in the head of any production and is not declared as a terminal
 161 is treated as an error.
 162
 163 ###### forward declarations
 164         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 165         char *symtypes = "UVTN";
 166 ###### symbol fields
 167         enum symtype type;
 168 ###### grammar fields
 169         int terminals_declared;
 170
 171 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 172 table of known symbols and the resulting parser will report them as
 173 `TK_reserved + N`.  A small set of identifiers are reserved for the
 174 different token types that `scanner` can report.
 175
 176 ###### declarations
 177
 178         static struct { int num; char *name; } reserved_words[] = {
 179                 { TK_error,        "ERROR" },
 180                 { TK_number,       "NUMBER" },
 181                 { TK_ident,        "IDENTIFIER" },
 182                 { TK_mark,         "MARK" },
 183                 { TK_string,       "STRING" },
 184                 { TK_multi_string, "MULTI_STRING" },
 185                 { TK_in,           "IN" },
 186                 { TK_out,          "OUT" },
 187                 { TK_newline,      "NEWLINE" },
 188                 { TK_eof,          "$eof" },
 189         };
 190 ###### symbol fields
 191         short num;
 192
 193 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 194 recognised.  The former is automatically expected at the end of the text
 195 being parsed. The latter are always ignored by the parser.
 196
 197 All of these symbols are stored in a simple symbol table.  We use the
 198 `struct text` from `mdcode` to store the name and link them together
 199 into a sorted list using an insertion sort.
 200
 201 We don't have separate `find` and `insert` functions as any symbol we
 202 find needs to be remembered.  We simply expect `find` to always return a
 203 symbol, but its type might be `Unknown`.
 204
 205 ###### includes
 206         #include <string.h>
 207
 208 ###### symbol fields
 209         struct text name;
 210         struct symbol *next;
 211
 212 ###### grammar fields
 213         struct symbol *syms;
 214         int num_syms;
 215
 216 ###### functions
 217         static struct symbol *sym_find(struct grammar *g, struct text s)
 218         {
 219                 struct symbol **l = &g->syms;
 220                 struct symbol *n;
 221                 int cmp = 1;
 222
 223                 while (*l &&
 224                         (cmp = text_cmp((*l)->name, s)) < 0)
 225                                 l = & (*l)->next;
 226                 if (cmp == 0)
 227                         return *l;
 228                 n = calloc(1, sizeof(*n));
 229                 n->name = s;
 230                 n->type = Unknown;
 231                 n->next = *l;
 232                 n->num = -1;
 233                 *l = n;
 234                 return n;
 235         }
 236
 237         static void symbols_init(struct grammar *g)
 238         {
 239                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 240                 int i;
 241                 for (i = 0; i < entries; i++) {
 242                         struct text t;
 243                         struct symbol *s;
 244                         t.txt = reserved_words[i].name;
 245                         t.len = strlen(t.txt);
 246                         s = sym_find(g, t);
 247                         s->type = Terminal;
 248                         s->num = reserved_words[i].num;
 249                 }
 250         }
 251
 252 ### Data types and precedence.
 253
 254 Data type specification, precedence specification, and declaration of
 255 terminals are all introduced by a dollar sign at the start of the line.
 256 If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 257 precedence, if it is `TERM` then the line declares terminals without
 258 precedence, otherwise it specifies a data type.
 259
 260 The data type name is simply stored and applied to the head of all
 261 subsequent productions.  It must be the name of a structure optionally
 262 preceded by an asterisk which means a reference or "pointer".  So
 263 `$expression` maps to `struct expression` and `$*statement` maps to
 264 `struct statement *`.
 265
 266 Any productions given before the first data type declaration will have
 267 no data type associated with them and can carry no information.  In
 268 order to allow other non-terminals to have no type, the data type
 269 `$void` can be given.  This does *not* mean that `struct void` will be
 270 used, but rather than no type will be associated with subsequent
 271 non-terminals.
 272
 273 The precedence line must contain a list of symbols, either terminal
 274 symbols or virtual symbols.  It can only contain symbols that have not
 275 been seen yet, so precedence declaration must precede the productions
 276 that they affect.
 277
 278 A "virtual" symbol is an identifier preceded by `$$`.  Virtual symbols
 279 carry precedence information but are not included in the grammar.  A
 280 production can declare that it inherits the precedence of a given
 281 virtual symbol.
 282
 283 This use for `$$` precludes it from being used as a symbol in the
 284 described language.  Two other symbols: `${` and `}$` are also
 285 unavailable.
 286
 287 Each new precedence line introduces a new precedence level and
 288 declares how it associates.  This level is stored in each symbol
 289 listed and may be inherited by any production which uses the symbol.  A
 290 production inherits from the last symbol which has a precedence.
 291
 292 The symbols on the first precedence line have the lowest precedence.
 293 Subsequent lines introduce symbols with higher precedence and so bind
 294 more tightly.
 295
 296 Note that a declaration line (or "dollar line") can start with either of
 297 two different marks: "$" or "$*".  The `dollar_line()` function defined
 298 here is told which was found in the `isref` argument.
 299
 300 ###### grammar fields
 301         struct text current_type;
 302         int type_isref;
 303         int prec_levels;
 304
 305 ###### declarations
 306         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 307         static const char *known[] = { "$$", "${", "}$" };
 308
 309 ###### functions
 310         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 311         {
 312                 struct token t = token_next(ts);
 313                 char *err;
 314                 enum assoc assoc;
 315                 int term = 0;
 316                 int found;
 317
 318                 if (t.num != TK_ident) {
 319                         err = "type or assoc expected after '$'";
 320                         goto abort;
 321                 }
 322                 if (text_is(t.txt, "LEFT"))
 323                         assoc = Left;
 324                 else if (text_is(t.txt, "RIGHT"))
 325                         assoc = Right;
 326                 else if (text_is(t.txt, "NON"))
 327                         assoc = Non;
 328                 else if (text_is(t.txt, "TERM")) {
 329                         term = 1;
 330                         g->terminals_declared = 1;
 331                 } else {
 332                         g->current_type = t.txt;
 333                         g->type_isref = isref;
 334                         if (text_is(t.txt, "void"))
 335                                 g->current_type.txt = NULL;
 336                         t = token_next(ts);
 337                         if (t.num != TK_newline) {
 338                                 err = "Extra tokens after type name";
 339                                 goto abort;
 340                         }
 341                         return NULL;
 342                 }
 343
 344                 if (isref) {
 345                         err = "$* cannot be followed by a precedence";
 346                         goto abort;
 347                 }
 348
 349                 // This is a precedence or TERM line, need some symbols.
 350                 found = 0;
 351                 g->prec_levels += 1;
 352                 t = token_next(ts);
 353                 while (t.num != TK_newline) {
 354                         enum symtype type = Terminal;
 355                         struct symbol *s;
 356                         if (t.num == TK_virtual) {
 357                                 type = Virtual;
 358                                 t = token_next(ts);
 359                                 if (t.num != TK_ident) {
 360                                         err = "$$ must be followed by a word";
 361                                         goto abort;
 362                                 }
 363                                 if (term) {
 364                                         err = "Virtual symbols not permitted on $TERM line";
 365                                         goto abort;
 366                                 }
 367                         } else if (t.num != TK_ident &&
 368                                    t.num != TK_mark) {
 369                                 err = "Illegal token in precedence line";
 370                                 goto abort;
 371                         }
 372                         s = sym_find(g, t.txt);
 373                         if (s->type != Unknown) {
 374                                 err = "Symbols in precedence/TERM line must not already be known.";
 375                                 goto abort;
 376                         }
 377                         s->type = type;
 378                         if (!term) {
 379                                 s->precedence = g->prec_levels;
 380                                 s->assoc = assoc;
 381                         }
 382                         found += 1;
 383                         t = token_next(ts);
 384                 }
 385                 if (found == 0)
 386                         err = "No symbols given on precedence/TERM line";
 387                         goto abort;
 388                 return NULL;
 389         abort:
 390                 while (t.num != TK_newline && t.num != TK_eof)
 391                         t = token_next(ts);
 392                 return err;
 393         }
 394
 395 ### Productions
 396
 397 A production either starts with an identifier which is the head
 398 non-terminal, or a vertical bar (`|`) in which case this production
 399 uses the same head as the previous one.  The identifier must be
 400 followed by a `->` mark.  All productions for a given non-terminal must
 401 be grouped together with the `nonterminal ->` given only once.
 402
 403 After this a (possibly empty) sequence of identifiers and marks form
 404 the body of the production.  A virtual symbol may be given after the
 405 body by preceding it with `$$`.  If a virtual symbol is given, the
 406 precedence of the production is that for the virtual symbol.  If none
 407 is given, the precedence is inherited from the last symbol in the
 408 production which has a precedence specified.
 409
 410 After the optional precedence may come the `${` mark.  This indicates
 411 the start of a code fragment.  If present, this must be on the same
 412 line as the start of the production.
 413
 414 All of the text from the `${` through to the matching `}$` is
 415 collected and forms the code-fragment for the production.  It must all
 416 be in one `code_node` of the literate code.  The `}$` must be
 417 at the end of a line.
 418
 419 Text in the code fragment will undergo substitutions where `$N` or
 420 `$<N`,for some numeric `N` (or non-numeric indicator as described
 421 later), will be replaced with a variable holding the parse information
 422 for the particular symbol in the production.  `$0` is the head of the
 423 production, `$1` is the first symbol of the body, etc.  The type of `$N`
 424 for a terminal symbol is `struct token`.  For a non-terminal, it is
 425 whatever has been declared for that symbol.  The `<` may be included and
 426 means that the value (usually a reference) is being moved out, so it
 427 will not automatically be freed.  The effect of using '<' is that the
 428 variable is cleared to all-zeros.
 429
 430 While building productions we will need to add to an array which needs to
 431 grow dynamically.
 432
 433 ###### functions
 434         static void array_add(void *varray, int *cnt, void *new)
 435         {
 436                 void ***array = varray;
 437                 int current = 0;
 438                 const int step = 8;
 439                 current = ((*cnt-1) | (step-1))+1;
 440                 if (*cnt == current) {
 441                         /* must grow */
 442                         current += step;
 443                         *array = realloc(*array, current * sizeof(void*));
 444                 }
 445                 (*array)[*cnt] = new;
 446                 (*cnt) += 1;
 447         }
 448
 449 Collecting the code fragment simply involves reading tokens until we
 450 hit the end token `}$`, and noting the character position of the start and
 451 the end.
 452
 453 ###### functions
 454         static struct text collect_code(struct token_state *state,
 455                                         struct token start)
 456         {
 457                 struct text code;
 458                 struct token t;
 459                 code.txt = start.txt.txt + start.txt.len;
 460                 do
 461                         t = token_next(state);
 462                 while (t.node == start.node &&
 463                        t.num != TK_close && t.num != TK_error &&
 464                        t.num != TK_eof);
 465                 if (t.num == TK_close && t.node == start.node)
 466                         code.len = t.txt.txt - code.txt;
 467                 else
 468                         code.txt = NULL;
 469                 return code;
 470         }
 471
 472 Now we have all the bits we need to parse a full production.
 473
 474 ###### includes
 475         #include <memory.h>
 476
 477 ###### grammar fields
 478         struct production **productions;
 479         int production_count;
 480
 481 ###### production fields
 482         struct symbol  *head;
 483         struct symbol **body;
 484         int             body_size;
 485         struct text     code;
 486         int             code_line;
 487
 488 ###### symbol fields
 489         int first_production;
 490
 491 ###### functions
 492         static char *parse_production(struct grammar *g,
 493                                       struct symbol *head,
 494                                       struct token_state *state)
 495         {
 496                 /* Head has already been parsed. */
 497                 struct token tk;
 498                 char *err;
 499                 struct production p, *pp;
 500
 501                 memset(&p, 0, sizeof(p));
 502                 p.head = head;
 503                 tk = token_next(state);
 504                 while (tk.num == TK_ident || tk.num == TK_mark) {
 505                         struct symbol *bs = sym_find(g, tk.txt);
 506                         if (bs->type == Unknown) {
 507                                 if (!g->terminals_declared)
 508                                         bs->type = Terminal;
 509                         }
 510                         if (bs->type == Virtual) {
 511                                 err = "Virtual symbol not permitted in production";
 512                                 goto abort;
 513                         }
 514                         if (bs->precedence) {
 515                                 p.precedence = bs->precedence;
 516                                 p.assoc = bs->assoc;
 517                         }
 518                         array_add(&p.body, &p.body_size, bs);
 519                         tk = token_next(state);
 520                 }
 521                 if (tk.num == TK_virtual) {
 522                         struct symbol *vs;
 523                         tk = token_next(state);
 524                         if (tk.num != TK_ident) {
 525                                 err = "word required after $$";
 526                                 goto abort;
 527                         }
 528                         vs = sym_find(g, tk.txt);
 529                         if (vs->precedence == 0) {
 530                                 err = "symbol after $$ must have precedence";
 531                                 goto abort;
 532                         } else {
 533                                 p.precedence = vs->precedence;
 534                                 p.assoc = vs->assoc;
 535                         }
 536                         tk = token_next(state);
 537                 }
 538                 if (tk.num == TK_open) {
 539                         p.code_line = tk.line;
 540                         p.code = collect_code(state, tk);
 541                         if (p.code.txt == NULL) {
 542                                 err = "code fragment not closed properly";
 543                                 goto abort;
 544                         }
 545                         tk = token_next(state);
 546                 }
 547
 548                 if (tk.num != TK_newline && tk.num != TK_eof) {
 549                         err = "stray tokens at end of line";
 550                         goto abort;
 551                 }
 552                 pp = malloc(sizeof(*pp));
 553                 *pp = p;
 554                 array_add(&g->productions, &g->production_count, pp);
 555                 return NULL;
 556         abort:
 557                 while (tk.num != TK_newline && tk.num != TK_eof)
 558                         tk = token_next(state);
 559                 free(p.body);
 560                 return err;
 561         }
 562
 563 With the ability to parse productions and dollar-lines, we have nearly all
 564 that we need to parse a grammar from a `code_node`.
 565
 566 The head of the first production will effectively be the `start` symbol
 567 of the grammar.  However it won't _actually_ be so.  Processing the
 568 grammar is greatly simplified if the real start symbol only has a single
 569 production, and expects `$eof` as the final terminal.  So when we find
 570 the first explicit production we insert an extra implicit production as
 571 production zero which looks like
 572
 573 ###### Example: production 0
 574         $start -> START $eof
 575
 576 where `START` is the first non-terminal given.
 577
 578 ###### create production zero
 579         struct production *p = calloc(1,sizeof(*p));
 580         struct text start = {"$start",6};
 581         struct text eof = {"$eof",4};
 582         struct text code = {"$0 = $<1;", 9};
 583         p->head = sym_find(g, start);
 584         p->head->type = Nonterminal;
 585         p->head->struct_name = g->current_type;
 586         p->head->isref = g->type_isref;
 587         if (g->current_type.txt)
 588                 p->code = code;
 589         array_add(&p->body, &p->body_size, head);
 590         array_add(&p->body, &p->body_size, sym_find(g, eof));
 591         p->head->first_production = g->production_count;
 592         array_add(&g->productions, &g->production_count, p);
 593
 594 Now we are ready to read in the grammar.  We ignore comments
 595 and strings so that the marks which introduce them can be
 596 used as terminals in the grammar.  We don't ignore numbers
 597 even though we don't allow them as that causes the scanner
 598 to produce errors that the parser is better positioned to handle.
 599 We also ignore indentation, but we expect and use newlines.
 600
 601 To allow comments in the grammar, we explicitly check for "//" as the
 602 first token on the line and those lines are skipped.  "//" can still be
 603 used as a terminal anywhere that a terminal is expected.
 604
 605 ###### grammar_read
 606         static struct grammar *grammar_read(struct code_node *code)
 607         {
 608                 struct token_config conf = {
 609                         .word_start = "",
 610                         .word_cont = "",
 611                         .words_marks = known,
 612                         .known_count = sizeof(known)/sizeof(known[0]),
 613                         .number_chars = "",
 614                         .ignored = (1 << TK_line_comment)
 615                                  | (1 << TK_block_comment)
 616                                  | (0 << TK_number)
 617                                  | (1 << TK_string)
 618                                  | (1 << TK_multi_string)
 619                                  | (1 << TK_in)
 620                                  | (1 << TK_out),
 621                 };
 622
 623                 struct token_state *state = token_open(code, &conf);
 624                 struct token tk;
 625                 struct symbol *head = NULL;
 626                 struct grammar *g;
 627                 char *err = NULL;
 628
 629                 g = calloc(1, sizeof(*g));
 630                 symbols_init(g);
 631
 632                 for (tk = token_next(state); tk.num != TK_eof;
 633                      tk = token_next(state)) {
 634                         if (tk.num == TK_newline)
 635                                 continue;
 636                         if (tk.num == TK_ident) {
 637                                 // new non-terminal
 638                                 head = sym_find(g, tk.txt);
 639                                 if (head->type == Nonterminal)
 640                                         err = "This non-terminal has already be used.";
 641                                 else if (head->type == Virtual)
 642                                         err = "Virtual symbol not permitted in head of production";
 643                                 else {
 644                                         head->type = Nonterminal;
 645                                         head->struct_name = g->current_type;
 646                                         head->isref = g->type_isref;
 647                                         if (g->production_count == 0) {
 648                                                 ## create production zero
 649                                         }
 650                                         head->first_production = g->production_count;
 651                                         tk = token_next(state);
 652                                         if (tk.num == TK_mark &&
 653                                             text_is(tk.txt, "->"))
 654                                                 err = parse_production(g, head, state);
 655                                         else
 656                                                 err = "'->' missing in production";
 657                                 }
 658                         } else if (tk.num == TK_mark
 659                                    && text_is(tk.txt, "|")) {
 660                                 // another production for same non-term
 661                                 if (head)
 662                                         err = parse_production(g, head, state);
 663                                 else
 664                                         err = "First production must have a head";
 665                         } else if (tk.num == TK_mark
 666                                    && text_is(tk.txt, "$")) {
 667                                 err = dollar_line(state, g, 0);
 668                         } else if (tk.num == TK_mark
 669                                    && text_is(tk.txt, "$*")) {
 670                                 err = dollar_line(state, g, 1);
 671                         } else if (tk.num == TK_mark
 672                                    && text_is(tk.txt, "//")) {
 673                                 while (tk.num != TK_newline &&
 674                                        tk.num != TK_eof)
 675                                         tk = token_next(state);
 676                         } else {
 677                                 err = "Unrecognised token at start of line.";
 678                         }
 679                         if (err)
 680                                 goto abort;
 681                 }
 682                 token_close(state);
 683                 if (g->terminals_declared) {
 684                         struct symbol *s;
 685                         int errs = 0;
 686                         for (s = g->syms; s; s = s->next) {
 687                                 if (s->type != Unknown)
 688                                         continue;
 689                                 errs += 1;
 690                                 fprintf(stderr, "Token %.*s not declared\n",
 691                                         s->name.len, s->name.txt);
 692                         }
 693                         if (errs) {
 694                                 free(g); // FIXME free content
 695                                 g = NULL;
 696                         }
 697                 }
 698                 return g;
 699         abort:
 700                 fprintf(stderr, "Error at line %d: %s\n",
 701                         tk.line, err);
 702                 token_close(state);
 703                 free(g); // FIXME free content
 704                 return NULL;
 705         }
 706
 707 ## Analysing the grammar
 708
 709 The central task in analysing the grammar is to determine a set of
 710 states to drive the parsing process.  This will require finding various
 711 sets of symbols and of "items".  Some of these sets will need to attach
 712 information to each element in the set, so they are more like maps.
 713
 714 Each "item" may have a 'look-ahead' or `LA` set associated with it.
 715 Many of these will be the same as each other.  To avoid memory wastage,
 716 and to simplify some comparisons of sets, these sets will be stored in a
 717 list of unique sets, each assigned a number.
 718
 719 Once we have the data structures in hand to manage these sets and lists,
 720 we can start setting the 'nullable' flag, build the 'FIRST' sets, and
 721 then create the item sets which define the various states.
 722
 723 ### Symbol sets.
 724
 725 Though we don't only store symbols in these sets, they are the main
 726 things we store, so they are called symbol sets or "symsets".
 727
 728 A symset has a size, an array of shorts, and an optional array of data
 729 values, which are also shorts.  If the array of data is not present, we
 730 store an impossible pointer, as `NULL` is used to indicate that no
 731 memory has been allocated yet;
 732
 733 ###### declarations
 734         struct symset {
 735                 short cnt;
 736                 unsigned short *syms, *data;
 737         };
 738         #define NO_DATA ((unsigned short *)1)
 739         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 740         const struct symset INIT_DATASET = { 0, NULL, NULL };
 741
 742 The arrays of shorts are allocated in blocks of 8 and are kept sorted by
 743 using an insertion sort.  We don't explicitly record the amount of
 744 allocated space as it can be derived directly from the current `cnt`
 745 using `((cnt - 1) | 7) + 1`.
 746
 747 ###### functions
 748         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 749         {
 750                 int i;
 751                 int current = ((s->cnt-1) | 7) + 1;
 752                 if (current == s->cnt) {
 753                         current += 8;
 754                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 755                         if (s->data != NO_DATA)
 756                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 757                 }
 758                 i = s->cnt;
 759                 while (i > 0 && s->syms[i-1] > key) {
 760                         s->syms[i] = s->syms[i-1];
 761                         if (s->data != NO_DATA)
 762                                 s->data[i] = s->data[i-1];
 763                         i--;
 764                 }
 765                 s->syms[i] = key;
 766                 if (s->data != NO_DATA)
 767                         s->data[i] = val;
 768                 s->cnt += 1;
 769         }
 770
 771 Finding a symbol (or item) in a `symset` uses a simple binary search.
 772 We return the index where the value was found (so data can be accessed),
 773 or `-1` to indicate failure.
 774
 775         static int symset_find(struct symset *ss, unsigned short key)
 776         {
 777                 int lo = 0;
 778                 int hi = ss->cnt;
 779
 780                 if (hi == 0)
 781                         return -1;
 782                 while (lo + 1 < hi) {
 783                         int mid = (lo + hi) / 2;
 784                         if (ss->syms[mid] <= key)
 785                                 lo = mid;
 786                         else
 787                                 hi = mid;
 788                 }
 789                 if (ss->syms[lo] == key)
 790                         return lo;
 791                 return -1;
 792         }
 793
 794 We will often want to form the union of two symsets.  When we do, we
 795 will often want to know if anything changed (as that might mean there is
 796 more work to do).  So `symset_union` reports whether anything was added
 797 to the first set.  We use a slow quadratic approach as these sets are
 798 rarely large.  If profiling shows this to be a problem it can be
 799 optimised later.
 800
 801         static int symset_union(struct symset *a, struct symset *b)
 802         {
 803                 int i;
 804                 int added = 0;
 805                 for (i = 0; i < b->cnt; i++)
 806                         if (symset_find(a, b->syms[i]) < 0) {
 807                                 unsigned short data = 0;
 808                                 if (b->data != NO_DATA)
 809                                         data = b->data[i];
 810                                 symset_add(a, b->syms[i], data);
 811                                 added++;
 812                         }
 813                 return added;
 814         }
 815
 816 And of course we must be able to free a symset.
 817
 818         static void symset_free(struct symset ss)
 819         {
 820                 free(ss.syms);
 821                 if (ss.data != NO_DATA)
 822                         free(ss.data);
 823         }
 824
 825 ### Symset Storage
 826
 827 Some symsets are simply stored somewhere appropriate in the `grammar`
 828 data structure, others needs a bit of indirection.  This applies
 829 particularly to "LA" sets.  These will be paired with "items" in an
 830 "itemset".  As itemsets will be stored in a symset, the "LA" set needs
 831 to be stored in the `data` field, so we need a mapping from a small
 832 (short) number to an LA `symset`.
 833
 834 As mentioned earlier this involves creating a list of unique symsets.
 835
 836 For now, we just use a linear list sorted by insertion.  A skiplist
 837 would be more efficient and may be added later.
 838
 839 ###### declarations
 840
 841         struct setlist {
 842                 struct symset ss;
 843                 int num;
 844                 struct setlist *next;
 845         };
 846
 847 ###### grammar fields
 848         struct setlist *sets;
 849         int nextset;
 850
 851 ###### functions
 852
 853         static int ss_cmp(struct symset a, struct symset b)
 854         {
 855                 int i;
 856                 int diff = a.cnt - b.cnt;
 857                 if (diff)
 858                         return diff;
 859                 for (i = 0; i < a.cnt; i++) {
 860                         diff = (int)a.syms[i] - (int)b.syms[i];
 861                         if (diff)
 862                                 return diff;
 863                 }
 864                 return 0;
 865         }
 866
 867         static int save_set(struct grammar *g, struct symset ss)
 868         {
 869                 struct setlist **sl = &g->sets;
 870                 int cmp = 1;
 871                 struct setlist *s;
 872
 873                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 874                         sl = & (*sl)->next;
 875                 if (cmp == 0) {
 876                         symset_free(ss);
 877                         return (*sl)->num;
 878                 }
 879
 880                 s = malloc(sizeof(*s));
 881                 s->ss = ss;
 882                 s->num = g->nextset;
 883                 g->nextset += 1;
 884                 s->next = *sl;
 885                 *sl = s;
 886                 return s->num;
 887         }
 888
 889 Finding a set by number is currently performed by a simple linear
 890 search.  If this turns out to hurt performance, we can store the sets in
 891 a dynamic array like the productions.
 892
 893         static struct symset set_find(struct grammar *g, int num)
 894         {
 895                 struct setlist *sl = g->sets;
 896                 while (sl && sl->num != num)
 897                         sl = sl->next;
 898                 return sl->ss;
 899         }
 900
 901 ### Setting `nullable`
 902
 903 We set `nullable` on the head symbol for any production for which all
 904 the body symbols (if any) are nullable.  As this is a recursive
 905 definition, any change in the `nullable` setting means that we need to
 906 re-evaluate where it needs to be set.
 907
 908 We simply loop around performing the same calculations until no more
 909 changes happen.
 910
 911 ###### symbol fields
 912         int nullable;
 913
 914 ###### functions
 915         static void set_nullable(struct grammar *g)
 916         {
 917                 int check_again = 1;
 918                 while (check_again) {
 919                         int p;
 920                         check_again = 0;
 921                         for (p = 0; p < g->production_count; p++) {
 922                                 struct production *pr = g->productions[p];
 923                                 int s;
 924
 925                                 if (pr->head->nullable)
 926                                         continue;
 927                                 for (s = 0; s < pr->body_size; s++)
 928                                         if (! pr->body[s]->nullable)
 929                                                 break;
 930                                 if (s == pr->body_size) {
 931                                         pr->head->nullable = 1;
 932                                         check_again = 1;
 933                                 }
 934                         }
 935                 }
 936         }
 937
 938 ### Building the `first` sets
 939
 940 When calculating what can follow a particular non-terminal, we will need
 941 to know what the "first" terminal in any subsequent non-terminal might
 942 be.  So we calculate the `first` set for every non-terminal and store
 943 them in an array.  We don't bother recording the "first" set for
 944 terminals as they are trivial.
 945
 946 As the "first" for one symbol might depend on the "first" of another, we
 947 repeat the calculations until no changes happen, just like with
 948 `set_nullable`.  This makes use of the fact that `symset_union` reports
 949 if any change happens.
 950
 951 The core of this, which finds the "first" of part of a production body,
 952 will be reused for computing the "follow" sets, so we split that out
 953 into a separate function, `add_first`, which also reports if it got all
 954 the way to the end of the production without finding a non-nullable
 955 symbol.
 956
 957 ###### grammar fields
 958         struct symset *first;
 959
 960 ###### functions
 961
 962         static int add_first(struct production *pr, int start,
 963                              struct symset *target, struct grammar *g,
 964                              int *to_end)
 965         {
 966                 int s;
 967                 int changed = 0;
 968                 for (s = start; s < pr->body_size; s++) {
 969                         struct symbol *bs = pr->body[s];
 970                         if (bs->type == Terminal) {
 971                                 if (symset_find(target, bs->num) < 0) {
 972                                         symset_add(target, bs->num, 0);
 973                                         changed = 1;
 974                                 }
 975                                 break;
 976                         } else if (symset_union(target, &g->first[bs->num]))
 977                                 changed = 1;
 978                         if (!bs->nullable)
 979                                 break;
 980                 }
 981                 if (to_end)
 982                         *to_end = (s == pr->body_size);
 983                 return changed;
 984         }
 985
 986         static void build_first(struct grammar *g)
 987         {
 988                 int check_again = 1;
 989                 int s;
 990                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 991                 for (s = 0; s < g->num_syms; s++)
 992                         g->first[s] = INIT_SYMSET;
 993
 994                 while (check_again) {
 995                         int p;
 996                         check_again = 0;
 997                         for (p = 0; p < g->production_count; p++) {
 998                                 struct production *pr = g->productions[p];
 999                                 struct symset *head = &g->first[pr->head->num];
1000
1001                                 if (add_first(pr, 0, head, g, NULL))
1002                                         check_again = 1;
1003                         }
1004                 }
1005         }
1006
1007 ### Building the `follow` sets.
1008
1009 There are two different situations when we will want to generate
1010 "follow" sets.  If we are doing an SLR analysis, we want to generate a
1011 single "follow" set for each non-terminal in the grammar.  That is what
1012 is happening here.  If we are doing an LALR or LR analysis we will want
1013 to generate a separate "LA" set for each item.  We do that later in
1014 state generation.
1015
1016 There are two parts to generating a "follow" set.  Firstly we look at
1017 every place that any non-terminal appears in the body of any production,
1018 and we find the set of possible "first" symbols after there.  This is
1019 done using `add_first` above and only needs to be done once as the
1020 "first" sets are now stable and will not change.
1021
1022 ###### grammar fields
1023         struct symset *follow;
1024
1025 ###### follow code
1026
1027         for (p = 0; p < g->production_count; p++) {
1028                 struct production *pr = g->productions[p];
1029                 int b;
1030
1031                 for (b = 0; b < pr->body_size - 1; b++) {
1032                         struct symbol *bs = pr->body[b];
1033                         if (bs->type == Terminal)
1034                                 continue;
1035                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1036                 }
1037         }
1038
1039 The second part is to add the "follow" set of the head of a production
1040 to the "follow" sets of the final symbol in the production, and any
1041 other symbol which is followed only by `nullable` symbols.  As this
1042 depends on "follow" itself we need to repeatedly perform the process
1043 until no further changes happen.
1044
1045 Rather than 2 nested loops, one that repeats the whole process until
1046 there is no change, and one that iterates of the set of productions, we
1047 combine these two functions into a single loop.
1048
1049 ###### follow code
1050
1051         for (check_again = 0, p = 0;
1052              p < g->production_count;
1053              p < g->production_count-1
1054                 ? p++ : check_again ? (check_again = 0, p = 0)
1055                               : p++) {
1056                 struct production *pr = g->productions[p];
1057                 int b;
1058
1059                 for (b = pr->body_size - 1; b >= 0; b--) {
1060                         struct symbol *bs = pr->body[b];
1061                         if (bs->type == Terminal)
1062                                 break;
1063                         if (symset_union(&g->follow[bs->num],
1064                                          &g->follow[pr->head->num]))
1065                                 check_again = 1;
1066                         if (!bs->nullable)
1067                                 break;
1068                 }
1069         }
1070
1071 We now just need to create and initialise the `follow` list to get a
1072 complete function.
1073
1074 ###### functions
1075         static void build_follow(struct grammar *g)
1076         {
1077                 int s, check_again, p;
1078                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1079                 for (s = 0; s < g->num_syms; s++)
1080                         g->follow[s] = INIT_SYMSET;
1081                 ## follow code
1082         }
1083
1084 ### Building itemsets and states
1085
1086 There are three different levels of detail that can be chosen for
1087 building the itemsets and states for the LR grammar.  They are:
1088
1089 1. LR(0), LR(0.5), or SLR(1), where no look-ahead is included in the
1090    itemsets - look-ahead, if used at all, is simply the 'follow' sets
1091    already computed,
1092 2. LALR(1) where we build look-ahead sets with each item and merge
1093    the LA sets when we find two paths to the same "kernel" set of items.
1094 3. LR(1) where different look-ahead for any item in the set means
1095    a different item set must be created.
1096
1097 ###### forward declarations
1098         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1099
1100 We need to be able to look through existing states to see if a newly
1101 generated state already exists.  For now we use a simple sorted linked
1102 list.
1103
1104 An item is a pair of numbers: the production number and the position of
1105 "DOT", which is an index into the body of the production.  As the
1106 numbers are not enormous we can combine them into a single "short" and
1107 store them in a `symset` - 5 bits for "DOT" and 11 bits for the
1108 production number (so 2000 productions with maximum of 31 symbols in the
1109 body).
1110
1111 Comparing the itemsets will be a little different to comparing symsets
1112 as we want to do the lookup after generating the "kernel" of an
1113 itemset, so we need to ignore the offset=zero items which are added during
1114 completion.
1115
1116 To facilitate this, we modify the "DOT" number so that "0" sorts to
1117 the end of the list in the symset, and then only compare items before
1118 the first "0".
1119
1120 ###### declarations
1121         static inline unsigned short item_num(int production, int dot)
1122         {
1123                 return production | ((31-dot) << 11);
1124         }
1125         static inline int item_prod(unsigned short item)
1126         {
1127                 return item & 0x7ff;
1128         }
1129         static inline int item_dot(unsigned short item)
1130         {
1131                 return (31-(item >> 11)) & 0x1f;
1132         }
1133
1134 For LR(1) analysis we need to compare not just the itemset in a state
1135 but also the LA sets.  As we assign each unique LA set a number, we
1136 can just compare the symset and the data values together.
1137
1138 ###### functions
1139         static int itemset_cmp(struct symset a, struct symset b,
1140                                enum grammar_type type)
1141         {
1142                 int i;
1143                 int av, bv;
1144
1145                 for (i = 0;
1146                      i < a.cnt && i < b.cnt &&
1147                      item_dot(a.syms[i]) > 0 &&
1148                      item_dot(b.syms[i]) > 0;
1149                      i++) {
1150                         int diff = a.syms[i] - b.syms[i];
1151                         if (diff)
1152                                 return diff;
1153                         if (type == LR1) {
1154                                 diff = a.data[i] - b.data[i];
1155                                 if (diff)
1156                                         return diff;
1157                         }
1158                 }
1159                 if (i == a.cnt || item_dot(a.syms[i]) == 0)
1160                         av = -1;
1161                 else
1162                         av = a.syms[i];
1163                 if (i == b.cnt || item_dot(b.syms[i]) == 0)
1164                         bv = -1;
1165                 else
1166                         bv = b.syms[i];
1167                 if (av - bv)
1168                         return av - bv;
1169                 if (type < LR1 || av == -1)
1170                         return 0;
1171                 return
1172                         a.data[i] - b.data[i];
1173         }
1174
1175 It will be helpful to know if an itemset has been "completed" or not,
1176 particularly for LALR where itemsets get merged, at which point they
1177 need to be consider for completion again.  So a `completed` flag is
1178 needed.
1179
1180 And now we can build the list of itemsets.  The lookup routine returns
1181 both a success flag and a pointer to where in the list an insert should
1182 happen, so we don't need to search a second time.
1183
1184 ###### declarations
1185         struct itemset {
1186                 struct itemset *next;
1187                 short state;
1188                 struct symset items;
1189                 struct symset go_to;
1190                 enum assoc assoc;
1191                 unsigned short precedence;
1192                 char completed;
1193         };
1194
1195 ###### grammar fields
1196         struct itemset *items;
1197         int states;
1198
1199 ###### functions
1200         static int itemset_find(struct grammar *g, struct itemset ***where,
1201                                 struct symset kernel, enum grammar_type type)
1202         {
1203                 struct itemset **ip;
1204
1205                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1206                         struct itemset *i = *ip;
1207                         int diff;
1208                         diff = itemset_cmp(i->items, kernel, type);
1209                         if (diff < 0)
1210                                 continue;
1211                         if (diff > 0)
1212                                 break;
1213                         /* found */
1214                         *where = ip;
1215                         return 1;
1216                 }
1217                 *where = ip;
1218                 return 0;
1219         }
1220
1221 Adding an itemset may require merging the LA sets if LALR analysis is
1222 happening.  If any new LA set adds any symbols that weren't in the old
1223 LA set, we clear the `completed` flag so that the dependants of this
1224 itemset will be recalculated and their LA sets updated.
1225
1226 `add_itemset` must consume the symsets it is passed, either by adding
1227 them to a data structure, of freeing them.
1228
1229         static int add_itemset(struct grammar *g, struct symset ss,
1230                                enum assoc assoc, unsigned short precedence,
1231                                enum grammar_type type)
1232         {
1233                 struct itemset **where, *is;
1234                 int i;
1235                 int found = itemset_find(g, &where, ss, type);
1236                 if (!found) {
1237                         is = calloc(1, sizeof(*is));
1238                         is->state = g->states;
1239                         g->states += 1;
1240                         is->items = ss;
1241                         is->assoc = assoc;
1242                         is->precedence = precedence;
1243                         is->next = *where;
1244                         is->go_to = INIT_DATASET;
1245                         *where = is;
1246                         return is->state;
1247                 }
1248                 is = *where;
1249                 if (type != LALR) {
1250                         symset_free(ss);
1251                         return is->state;
1252                 }
1253                 for (i = 0; i < ss.cnt; i++) {
1254                         struct symset temp = INIT_SYMSET, s;
1255                         if (ss.data[i] == is->items.data[i])
1256                                 continue;
1257                         s = set_find(g, is->items.data[i]);
1258                         symset_union(&temp, &s);
1259                         s = set_find(g, ss.data[i]);
1260                         if (symset_union(&temp, &s)) {
1261                                 is->items.data[i] = save_set(g, temp);
1262                                 is->completed = 0;
1263                         } else
1264                                 symset_free(temp);
1265                 }
1266                 symset_free(ss);
1267                 return is->state;
1268         }
1269
1270 #### The build
1271
1272 To build all the itemsets, we first insert the initial itemset made from
1273 production zero, complete each itemset, and then generate new itemsets
1274 from old until no new ones can be made.
1275
1276 Completing an itemset means finding all the items where "DOT" is
1277 followed by a nonterminal and adding "DOT=0" items for every production
1278 from that non-terminal - providing each item hasn't already been added.
1279 When we add such an item it might get inserted before the current
1280 one, so to make sure we process it we reset the loop counter to the
1281 start.
1282
1283 If LA sets are needed, the LA set for each new item is found using
1284 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This
1285 may be supplemented by the LA set for the item which produced the new
1286 item.
1287
1288 We also collect a set of all symbols which follow "DOT" (in `done`) as
1289 this is used in the next stage.
1290
1291 When itemsets are created we assign a precedence to the itemset from the
1292 complete item, if there is one.  We ignore the possibility of there
1293 being two and don't (currently) handle precedence in such grammars.
1294 When completing a grammar we ignore any item where DOT is followed by a
1295 terminal with a precedence lower than that for the itemset.  Unless the
1296 terminal has right associativity, we also ignore items where the
1297 terminal has the same precedence.  The result is that unwanted items are
1298 still in the itemset, but the terminal doesn't get into the go to set,
1299 so the item is ineffective.
1300
1301 ###### complete itemset
1302         for (i = 0; i < is->items.cnt; i++) {
1303                 int p = item_prod(is->items.syms[i]);
1304                 int bs = item_dot(is->items.syms[i]);
1305                 struct production *pr = g->productions[p];
1306                 int p2;
1307                 struct symbol *s;
1308                 struct symset LA = INIT_SYMSET;
1309                 unsigned short sn = 0;
1310
1311                 if (bs == pr->body_size)
1312                         continue;
1313                 s = pr->body[bs];
1314                 if (s->precedence && is->precedence &&
1315                     is->precedence > s->precedence)
1316                         /* This terminal has a low precedence and
1317                          * shouldn't be shifted
1318                          */
1319                         continue;
1320                 if (s->precedence && is->precedence &&
1321                     is->precedence == s->precedence && s->assoc != Right)
1322                         /* This terminal has a matching precedence and is
1323                          * not Right-associative, so we mustn't shift it.
1324                          */
1325                         continue;
1326                 if (symset_find(&done, s->num) < 0)
1327                         symset_add(&done, s->num, 0);
1328
1329                 if (s->type != Nonterminal)
1330                         continue;
1331                 check_again = 1;
1332                 if (type >= LALR) {
1333                         // Need the LA set.
1334                         int to_end;
1335                         add_first(pr, bs+1, &LA, g, &to_end);
1336                         if (to_end) {
1337                                 struct symset ss = set_find(g, is->items.data[i]);
1338                                 symset_union(&LA, &ss);
1339                         }
1340                         sn = save_set(g, LA);
1341                         LA = set_find(g, sn);
1342                 }
1343
1344                 /* Add productions for this symbol */
1345                 for (p2 = s->first_production;
1346                      p2 < g->production_count &&
1347                       g->productions[p2]->head == s;
1348                      p2++) {
1349                         int itm = item_num(p2, 0);
1350                         int pos = symset_find(&is->items, itm);
1351                         if (pos < 0) {
1352                                 symset_add(&is->items, itm, sn);
1353                                 /* Will have re-ordered, so start
1354                                  * from beginning again */
1355                                 i = -1;
1356                         } else if (type >= LALR) {
1357                                 struct symset ss = set_find(g, is->items.data[pos]);
1358                                 struct symset tmp = INIT_SYMSET;
1359                                 struct symset *la = &LA;
1360
1361                                 symset_union(&tmp, &ss);
1362                                 if (symset_union(&tmp, la)) {
1363                                         is->items.data[pos] = save_set(g, tmp);
1364                                         i = -1;
1365                                 } else
1366                                         symset_free(tmp);
1367                         }
1368                 }
1369         }
1370
1371 For each symbol we found (and placed in `done`) we collect all the items
1372 for which this symbol is next, and create a new itemset with "DOT"
1373 advanced over the symbol.  This is then added to the collection of
1374 itemsets (or merged with a pre-existing itemset).
1375
1376 ###### derive itemsets
1377         // Now we have a completed itemset, so we need to
1378         // compute all the 'next' itemsets and create them
1379         // if they don't exist.
1380         for (i = 0; i < done.cnt; i++) {
1381                 int j;
1382                 unsigned short state;
1383                 struct symbol *sym = g->symtab[done.syms[i]];
1384                 enum assoc assoc = Non;
1385                 unsigned short precedence = 0;
1386                 struct symset newitemset = INIT_SYMSET;
1387                 if (type >= LALR)
1388                         newitemset = INIT_DATASET;
1389
1390                 for (j = 0; j < is->items.cnt; j++) {
1391                         int itm = is->items.syms[j];
1392                         int p = item_prod(itm);
1393                         int bp = item_dot(itm);
1394                         struct production *pr = g->productions[p];
1395                         unsigned short la = 0;
1396                         int pos;
1397
1398                         if (bp == pr->body_size)
1399                                 continue;
1400                         if (pr->body[bp] != sym)
1401                                 continue;
1402
1403                         bp += 1;
1404                         if (type >= LALR)
1405                                 la = is->items.data[j];
1406                         if (bp == pr->body_size &&
1407                             pr->precedence > 0 &&
1408                             pr->precedence > precedence) {
1409                                 // new itemset is reducible and has a precedence.
1410                                 precedence = pr->precedence;
1411                                 assoc = pr->assoc;
1412                         }
1413                         pos = symset_find(&newitemset, item_num(p, bp));
1414
1415                         if (pos < 0)
1416                                 symset_add(&newitemset, item_num(p, bp), la);
1417                         else if (type >= LALR) {
1418                                 // Need to merge la set.
1419                                 int la2 = newitemset.data[pos];
1420                                 if (la != la2) {
1421                                         struct symset ss = set_find(g, la2);
1422                                         struct symset LA = INIT_SYMSET;
1423                                         symset_union(&LA, &ss);
1424                                         ss = set_find(g, la);
1425                                         if (symset_union(&LA, &ss))
1426                                                 newitemset.data[pos] = save_set(g, LA);
1427                                         else
1428                                                 symset_free(LA);
1429                                 }
1430                         }
1431                 }
1432                 state = add_itemset(g, newitemset, assoc, precedence, type);
1433                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1434                         symset_add(&is->go_to, done.syms[i], state);
1435         }
1436
1437 All that is left is to create the initial itemset from production zero, and
1438 with `TK_eof` as the LA set.
1439
1440 ###### functions
1441         static void build_itemsets(struct grammar *g, enum grammar_type type)
1442         {
1443                 struct symset first = INIT_SYMSET;
1444                 struct itemset *is;
1445                 int check_again;
1446                 unsigned short la = 0;
1447                 if (type >= LALR) {
1448                         // LA set just has eof
1449                         struct symset eof = INIT_SYMSET;
1450                         symset_add(&eof, TK_eof, 0);
1451                         la = save_set(g, eof);
1452                         first = INIT_DATASET;
1453                 }
1454                 // production 0, offset 0 (with no data)
1455                 symset_add(&first, item_num(0, 0), la);
1456                 add_itemset(g, first, Non, 0, type);
1457                 for (check_again = 0, is = g->items;
1458                      is;
1459                      is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
1460                         int i;
1461                         struct symset done = INIT_SYMSET;
1462                         if (is->completed)
1463                                 continue;
1464                         is->completed = 1;
1465                         check_again = 1;
1466                         ## complete itemset
1467                         ## derive itemsets
1468                         symset_free(done);
1469                 }
1470         }
1471
1472 ### Completing the analysis.
1473
1474 The exact process of analysis depends on which level was requested.  For
1475 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1476 `LR1` we need first, but not follow.  For `SLR` we need both.
1477
1478 We don't build the "action" tables that you might expect as the parser
1479 doesn't use them.  They will be calculated to some extent if needed for
1480 a report.
1481
1482 Once we have built everything we allocate arrays for the two lists:
1483 symbols and itemsets.  This allows more efficient access during
1484 reporting.  The symbols are grouped as terminals and then non-terminals,
1485 and we record the changeover point in `first_nonterm`.
1486
1487 ###### grammar fields
1488         struct symbol **symtab;
1489         struct itemset **statetab;
1490         int first_nonterm;
1491
1492 ###### grammar_analyse
1493
1494         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1495         {
1496                 struct symbol *s;
1497                 struct itemset *is;
1498                 int snum = TK_reserved;
1499                 for (s = g->syms; s; s = s->next)
1500                         if (s->num < 0 && s->type == Terminal) {
1501                                 s->num = snum;
1502                                 snum++;
1503                         }
1504                 g->first_nonterm = snum;
1505                 for (s = g->syms; s; s = s->next)
1506                         if (s->num < 0 && s->type != Virtual) {
1507                                 s->num = snum;
1508                                 snum++;
1509                         }
1510                 for (s = g->syms; s; s = s->next)
1511                         if (s->num < 0) {
1512                                 s->num = snum;
1513                                 snum++;
1514                         }
1515                 g->num_syms = snum;
1516                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1517                 for (s = g->syms; s; s = s->next)
1518                         g->symtab[s->num] = s;
1519
1520                 set_nullable(g);
1521                 if (type >= SLR)
1522                         build_first(g);
1523
1524                 if (type == SLR)
1525                         build_follow(g);
1526
1527                 build_itemsets(g, type);
1528
1529                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1530                 for (is = g->items; is ; is = is->next)
1531                         g->statetab[is->state] = is;
1532         }
1533
1534 ## Reporting on the Grammar
1535
1536 The purpose of the report is to give the grammar developer insight into
1537 how the grammar parser will work.  It is basically a structured dump of
1538 all the tables that have been generated, plus a description of any conflicts.
1539
1540 ###### grammar_report
1541         static int grammar_report(struct grammar *g, enum grammar_type type)
1542         {
1543                 report_symbols(g);
1544                 if (g->follow)
1545                         report_follow(g);
1546                 report_itemsets(g);
1547                 return report_conflicts(g, type);
1548         }
1549
1550 Firstly we have the complete list of symbols, together with the
1551 "FIRST" set if that was generated.  We add a mark to each symbol to
1552 show if it is nullable (`.`).
1553
1554 ###### functions
1555
1556         static void report_symbols(struct grammar *g)
1557         {
1558                 int n;
1559                 if (g->first)
1560                         printf("SYMBOLS + FIRST:\n");
1561                 else
1562                         printf("SYMBOLS:\n");
1563
1564                 for (n = 0; n < g->num_syms; n++) {
1565                         struct symbol *s = g->symtab[n];
1566                         if (!s)
1567                                 continue;
1568
1569                         printf(" %c%3d%c: ",
1570                                s->nullable ? '.':' ',
1571                                s->num, symtypes[s->type]);
1572                         prtxt(s->name);
1573                         if (s->precedence)
1574                                 printf(" (%d%s)", s->precedence,
1575                                        assoc_names[s->assoc]);
1576
1577                         if (g->first && s->type == Nonterminal) {
1578                                 int j;
1579                                 char c = ':';
1580                                 for (j = 0; j < g->first[n].cnt; j++) {
1581                                         printf("%c ", c);
1582                                         c = ',';
1583                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1584                                 }
1585                         }
1586                         printf("\n");
1587                 }
1588         }
1589
1590 Then we have the follow sets if they were computed.
1591
1592         static void report_follow(struct grammar *g)
1593         {
1594                 int n;
1595                 printf("FOLLOW:\n");
1596                 for (n = 0; n < g->num_syms; n++)
1597                         if (g->follow[n].cnt) {
1598                                 int j;
1599                                 char c = ':';
1600                                 printf("  ");
1601                                 prtxt(g->symtab[n]->name);
1602                                 for (j = 0; j < g->follow[n].cnt; j++) {
1603                                         printf("%c ", c);
1604                                         c = ',';
1605                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1606                                 }
1607                                 printf("\n");
1608                         }
1609         }
1610
1611 And finally the item sets.  These include the GO TO tables and, for
1612 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1613 it up a bit.  First the items, with production number and associativity.
1614
1615         static void report_item(struct grammar *g, int itm)
1616         {
1617                 int p = item_prod(itm);
1618                 int dot = item_dot(itm);
1619                 struct production *pr = g->productions[p];
1620                 int i;
1621
1622                 printf("    ");
1623                 prtxt(pr->head->name);
1624                 printf(" ->");
1625                 for (i = 0; i < pr->body_size; i++) {
1626                         printf(" %s", (dot == i ? ". ": ""));
1627                         prtxt(pr->body[i]->name);
1628                 }
1629                 if (dot == pr->body_size)
1630                         printf(" .");
1631                 printf(" [%d]", p);
1632                 if (pr->precedence && dot == pr->body_size)
1633                         printf(" (%d%s)", pr->precedence,
1634                                assoc_names[pr->assoc]);
1635                 if (dot < pr->body_size &&
1636                     pr->body[dot]->precedence) {
1637                         struct symbol *s = pr->body[dot];
1638                         printf(" [%d%s]", s->precedence,
1639                                assoc_names[s->assoc]);
1640                 }
1641                 printf("\n");
1642         }
1643
1644 The LA sets which are (possibly) reported with each item:
1645
1646         static void report_la(struct grammar *g, int lanum)
1647         {
1648                 struct symset la = set_find(g, lanum);
1649                 int i;
1650                 char c = ':';
1651
1652                 printf("        LOOK AHEAD(%d)", lanum);
1653                 for (i = 0; i < la.cnt; i++) {
1654                         printf("%c ", c);
1655                         c = ',';
1656                         prtxt(g->symtab[la.syms[i]]->name);
1657                 }
1658                 printf("\n");
1659         }
1660
1661 Then the go to sets:
1662
1663         static void report_goto(struct grammar *g, struct symset gt)
1664         {
1665                 int i;
1666                 printf("    GOTO:\n");
1667
1668                 for (i = 0; i < gt.cnt; i++) {
1669                         printf("      ");
1670                         prtxt(g->symtab[gt.syms[i]]->name);
1671                         printf(" -> %d\n", gt.data[i]);
1672                 }
1673         }
1674
1675 Now we can report all the item sets complete with items, LA sets, and GO TO.
1676
1677         static void report_itemsets(struct grammar *g)
1678         {
1679                 int s;
1680                 printf("ITEM SETS(%d)\n", g->states);
1681                 for (s = 0; s < g->states; s++) {
1682                         int j;
1683                         struct itemset *is = g->statetab[s];
1684                         printf("  Itemset %d:", s);
1685                         if (is->precedence)
1686                                 printf(" %d%s", is->precedence, assoc_names[is->assoc]);
1687                         printf("\n");
1688                         for (j = 0; j < is->items.cnt; j++) {
1689                                 report_item(g, is->items.syms[j]);
1690                                 if (is->items.data != NO_DATA)
1691                                         report_la(g, is->items.data[j]);
1692                         }
1693                         report_goto(g, is->go_to);
1694                 }
1695         }
1696
1697 ### Reporting conflicts
1698
1699 Conflict detection varies a lot among different analysis levels.  However
1700 LR0 and LR0.5 are quite similar - having no follow sets - and SLR, LALR and
1701 LR1 are also similar as they have FOLLOW or LA sets.
1702
1703 ###### functions
1704
1705         ## conflict functions
1706
1707         static int report_conflicts(struct grammar *g, enum grammar_type type)
1708         {
1709                 int cnt = 0;
1710                 printf("Conflicts:\n");
1711                 if (type < SLR)
1712                         cnt = conflicts_lr0(g, type);
1713                 else
1714                         cnt = conflicts_slr(g, type);
1715                 if (cnt == 0)
1716                         printf(" - no conflicts\n");
1717                 return cnt;
1718         }
1719
1720 LR0 conflicts are any state which have both a reducible item and
1721 a shiftable item, or two reducible items.
1722
1723 LR05 conflicts only occur if two possible reductions exist,
1724 as shifts always over-ride reductions.
1725
1726 ###### conflict functions
1727         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1728         {
1729                 int i;
1730                 int cnt = 0;
1731                 for (i = 0; i < g->states; i++) {
1732                         struct itemset *is = g->statetab[i];
1733                         int last_reduce = -1;
1734                         int prev_reduce = -1;
1735                         int last_shift = -1;
1736                         int j;
1737                         if (!is)
1738                                 continue;
1739                         for (j = 0; j < is->items.cnt; j++) {
1740                                 int itm = is->items.syms[j];
1741                                 int p = item_prod(itm);
1742                                 int bp = item_dot(itm);
1743                                 struct production *pr = g->productions[p];
1744
1745                                 if (bp == pr->body_size) {
1746                                         prev_reduce = last_reduce;
1747                                         last_reduce = j;
1748                                         continue;
1749                                 }
1750                                 if (pr->body[bp]->type == Terminal)
1751                                         last_shift = j;
1752                         }
1753                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1754                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1755                                 report_item(g, is->items.syms[last_shift]);
1756                                 report_item(g, is->items.syms[last_reduce]);
1757                                 cnt++;
1758                         }
1759                         if (prev_reduce >= 0) {
1760                                 printf("  State %d has 2 (or more) reducible items\n", i);
1761                                 report_item(g, is->items.syms[prev_reduce]);
1762                                 report_item(g, is->items.syms[last_reduce]);
1763                                 cnt++;
1764                         }
1765                 }
1766                 return cnt;
1767         }
1768
1769 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1770 look ahead, or if a symbol in a look-ahead can be shifted.  They differ only
1771 in the source of the look ahead set.
1772
1773 We build two datasets to reflect the "action" table: one which maps
1774 terminals to items where that terminal could be shifted and another
1775 which maps terminals to items that could be reduced when the terminal
1776 is in look-ahead.  We report when we get conflicts between the two.
1777
1778         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1779         {
1780                 int i;
1781                 int cnt = 0;
1782
1783                 for (i = 0; i < g->states; i++) {
1784                         struct itemset *is = g->statetab[i];
1785                         struct symset shifts = INIT_DATASET;
1786                         struct symset reduce = INIT_DATASET;
1787                         int j;
1788                         if (!is)
1789                                 continue;
1790                         /* First collect the shifts */
1791                         for (j = 0; j < is->items.cnt; j++) {
1792                                 unsigned short itm = is->items.syms[j];
1793                                 int p = item_prod(itm);
1794                                 int bp = item_dot(itm);
1795                                 struct production *pr = g->productions[p];
1796                                 struct symbol *s;
1797
1798                                 if (bp >= pr->body_size ||
1799                                     pr->body[bp]->type != Terminal)
1800                                         /* not shiftable */
1801                                         continue;
1802
1803                                 s = pr->body[bp];
1804                                 if (s->precedence && is->precedence)
1805                                         /* Precedence resolves this, so no conflict */
1806                                         continue;
1807
1808                                 if (symset_find(&shifts, s->num) < 0)
1809                                         symset_add(&shifts, s->num, itm);
1810                         }
1811                         /* Now look for reductions and conflicts */
1812                         for (j = 0; j < is->items.cnt; j++) {
1813                                 unsigned short itm = is->items.syms[j];
1814                                 int p = item_prod(itm);
1815                                 int bp = item_dot(itm);
1816                                 struct production *pr = g->productions[p];
1817
1818                                 if (bp < pr->body_size)
1819                                         continue;
1820                                 /* reducible */
1821                                 struct symset la;
1822                                 if (type == SLR)
1823                                         la = g->follow[pr->head->num];
1824                                 else
1825                                         la = set_find(g, is->items.data[j]);
1826                                 int k;
1827                                 for (k = 0; k < la.cnt; k++) {
1828                                         int pos = symset_find(&shifts, la.syms[k]);
1829                                         if (pos >= 0) {
1830                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1831                                                 cnt++;
1832                                                         prtxt(g->symtab[la.syms[k]]->name);
1833                                                 printf(":\n");
1834                                                 report_item(g, shifts.data[pos]);
1835                                                 report_item(g, itm);
1836                                         }
1837                                         pos = symset_find(&reduce, la.syms[k]);
1838                                         if (pos < 0) {
1839                                                 symset_add(&reduce, la.syms[k], itm);
1840                                                 continue;
1841                                         }
1842                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1843                                         prtxt(g->symtab[la.syms[k]]->name);
1844                                         printf(":\n");
1845                                         report_item(g, itm);
1846                                         report_item(g, reduce.data[pos]);
1847                                         cnt++;
1848                                 }
1849                         }
1850                         symset_free(shifts);
1851                         symset_free(reduce);
1852                 }
1853                 return cnt;
1854         }
1855
1856
1857 ## Generating the parser
1858
1859 The exported part of the parser is the `parse_XX` function, where the name
1860 `XX` is based on the name of the parser files.
1861
1862 This takes a `code_node`, a partially initialized `token_config`, and an
1863 optional `FILE` to send tracing to.  The `token_config` gets the list of
1864 known words added and then is used with the `code_node` to initialize the
1865 scanner.
1866
1867 `parse_XX` then calls the library function `parser_run` to actually complete
1868 the parse.  This needs the `states` table and functions to call the various
1869 pieces of code provided in the grammar file, so they are generated first.
1870
1871 ###### parser_generate
1872
1873         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name,
1874                                struct code_node *pre_reduce)
1875         {
1876                 gen_known(f, g);
1877                 gen_non_term(f, g);
1878                 gen_goto(f, g);
1879                 gen_states(f, g);
1880                 gen_reduce(f, g, file, pre_reduce);
1881                 gen_free(f, g);
1882
1883                 fprintf(f, "#line 0 \"gen_parser\"\n");
1884                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1885                         name);
1886                 fprintf(f, "{\n");
1887                 fprintf(f, "\tstruct token_state *tokens;\n");
1888                 fprintf(f, "\tconfig->words_marks = known;\n");
1889                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1890                 fprintf(f, "\ttokens = token_open(code, config);\n");
1891                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
1892                 fprintf(f, "\ttoken_close(tokens);\n");
1893                 fprintf(f, "\treturn rv;\n");
1894                 fprintf(f, "}\n\n");
1895         }
1896
1897 ### Known words table
1898
1899 The known words table is simply an array of terminal symbols.
1900 The table of nonterminals used for tracing is a similar array.
1901
1902 ###### functions
1903
1904         static void gen_known(FILE *f, struct grammar *g)
1905         {
1906                 int i;
1907                 fprintf(f, "#line 0 \"gen_known\"\n");
1908                 fprintf(f, "static const char *known[] = {\n");
1909                 for (i = TK_reserved;
1910                      i < g->num_syms && g->symtab[i]->type == Terminal;
1911                      i++)
1912                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1913                                 g->symtab[i]->name.txt);
1914                 fprintf(f, "};\n\n");
1915         }
1916
1917         static void gen_non_term(FILE *f, struct grammar *g)
1918         {
1919                 int i;
1920                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1921                 fprintf(f, "static const char *non_term[] = {\n");
1922                 for (i = TK_reserved;
1923                      i < g->num_syms;
1924                      i++)
1925                         if (g->symtab[i]->type == Nonterminal)
1926                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1927                                         g->symtab[i]->name.txt);
1928                 fprintf(f, "};\n\n");
1929         }
1930
1931 ### States and the goto tables.
1932
1933 For each state we record the goto table and details of the reducible
1934 production if there is one.
1935 Some of the details of the reducible production are stored in the
1936 `do_reduce` function to come later.  Here we store the production
1937 number, the body size (useful for stack management), and the resulting
1938 symbol (useful for knowing how to free data later).
1939
1940 The go to table is stored in a simple array of `sym` and corresponding
1941 `state`.
1942
1943 ###### exported types
1944
1945         struct lookup {
1946                 short sym;
1947                 short state;
1948         };
1949         struct state {
1950                 short go_to_cnt;
1951                 const struct lookup * go_to;
1952                 short reduce_prod;
1953                 short reduce_size;
1954                 short reduce_sym;
1955                 short result_size;
1956         };
1957
1958 ###### functions
1959
1960         static void gen_goto(FILE *f, struct grammar *g)
1961         {
1962                 int i;
1963                 fprintf(f, "#line 0 \"gen_goto\"\n");
1964                 for (i = 0; i < g->states; i++) {
1965                         struct symset gt = g->statetab[i]->go_to;
1966                         int j;
1967
1968                         if (gt.cnt == 0)
1969                                 continue;
1970                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1971                                 i);
1972                         for (j = 0; j < gt.cnt; j++)
1973                                 fprintf(f, "\t{ %d, %d },\n",
1974                                         gt.syms[j], gt.data[j]);
1975                         fprintf(f, "};\n");
1976                 }
1977         }
1978
1979         static void gen_states(FILE *f, struct grammar *g)
1980         {
1981                 int i;
1982                 fprintf(f, "#line 0 \"gen_states\"\n");
1983                 fprintf(f, "static const struct state states[] = {\n");
1984                 for (i = 0; i < g->states; i++) {
1985                         struct itemset *is = g->statetab[i];
1986                         int j, prod = -1, prod_len;
1987
1988                         for (j = 0; j < is->items.cnt; j++) {
1989                                 int itm = is->items.syms[j];
1990                                 int p = item_prod(itm);
1991                                 int bp = item_dot(itm);
1992                                 struct production *pr = g->productions[p];
1993
1994                                 if (bp < pr->body_size)
1995                                         continue;
1996                                 /* This is what we reduce - choose longest */
1997                                 if (prod < 0 || prod_len < pr->body_size) {
1998                                         prod = p;
1999                                         prod_len = pr->body_size;
2000                                 }
2001                         }
2002                         if (is->go_to.cnt)
2003                                 fprintf(f, "\t[%d] = { %d, goto_%d, ",
2004                                         i, is->go_to.cnt, i);
2005                         else
2006                                 fprintf(f, "\t[%d] = { 0, NULL, ", i);
2007                         if (prod >= 0) {
2008                                 struct production *pr = g->productions[prod];
2009                                 struct symbol *hd = pr->head;
2010                                 fprintf(f, "%d, %d, %d, ",
2011                                         prod, pr->body_size, pr->head->num);
2012                                 if (hd->struct_name.txt == NULL)
2013                                         fprintf(f, "0 },\n");
2014                                 else
2015                                         fprintf(f, "sizeof(struct %.*s%s) },\n",
2016                                                 hd->struct_name.len,
2017                                                 hd->struct_name.txt,
2018                                                 hd->isref ? "*" : "");
2019                         } else
2020                                 fprintf(f, "-1, -1, -1, -1 },\n");
2021                 }
2022                 fprintf(f, "};\n\n");
2023         }
2024
2025 ### The `do_reduce` function and the code
2026
2027 When the parser engine decides to reduce a production, it calls
2028 `do_reduce` which runs the code that was included with the production,
2029 if any.
2030
2031 This code needs to be able to store data somewhere.  Rather than
2032 requiring `do_reduce` to `malloc` that "somewhere", we pass in a large
2033 buffer and have `do_reduce` return the size to be saved.
2034
2035 In order for the code to access "global" context, we pass in the
2036 "config" pointer that was passed to the parser function.  If the `struct
2037 token_config` is embedded in some larger structure, the reducing code
2038 can access the larger structure using pointer manipulation.  Performing
2039 that pointer manipulation, and any other common code or variables for
2040 all reduce actions, can be provided in code node called "reduce" which
2041 is passed around in `pre_reduce` which you might have already noticed.
2042
2043 The code fragments require translation when written out.  Any `$N` needs
2044 to be converted to a reference either to that buffer (if `$0`) or to the
2045 structure returned by a previous reduction.  These pointers need to be
2046 cast to the appropriate type for each access.  All this is handled in
2047 `gen_code`.
2048
2049 `gen_code` also allows symbol references to contain a '`<`' as in
2050 '`$<2`'.  This is particularly useful for references (or pointers), but
2051 can be used with structures too.  The `<` implies that the value is
2052 being moved out, so the object will not be automatically freed.  It is
2053 equivalent to assigning `NULL` to the pointer or filling a structure
2054 with zeros.
2055
2056 Instead of a number `N`, the `$` or `$<` can be followed by some letters
2057 and possibly a number.  A number by itself (other than zero) selects a
2058 symbol from the body of the production.  A sequence of letters selects
2059 the shortest symbol in the body which contains those letters in the
2060 given order.  If a number follows the letters, then a later occurrence
2061 of that symbol is chosen.  So "`$AB2`" will refer to the structure
2062 attached to the second occurrence of the shortest symbol which contains
2063 an `A` followed by a `B`.  If there is no unique shortest system, or if
2064 the number given is too large, then the symbol reference is not
2065 transformed, and will cause an error when the code is compiled.
2066
2067 ###### functions
2068
2069         static int textchr(struct text t, char c, int s)
2070         {
2071                 int i;
2072                 for (i = s; i < t.len; i++)
2073                         if (t.txt[i] == c)
2074                                 return i;
2075                 return -1;
2076         }
2077
2078         static int subseq_match(char *seq, int slen, struct text name)
2079         {
2080                 int st = 0;
2081                 while (slen > 0) {
2082                         st = textchr(name, *seq, st);
2083                         if (st < 0)
2084                                 return 0;
2085                         slen -= 1;
2086                         seq += 1;
2087                         st += 1;
2088                 }
2089                 return 1;
2090         }
2091
2092         static int choose_sym(char **namep, int len, struct production *p)
2093         {
2094                 char *name = *namep;
2095                 char *nam = name;
2096                 int namlen;
2097                 int n = 0;
2098                 int i, s, slen;
2099                 char c;
2100
2101                 c = *name;
2102                 while (len > 0 &&
2103                        ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
2104                         name += 1;
2105                         len -= 1;
2106                         c = *name;
2107                 }
2108                 namlen = name-nam;
2109                 while (len > 0 && (c >= '0' && c <= '9')) {
2110                         name += 1;
2111                         len -= 1;
2112                         n = n * 10 + (c - '0');
2113                         c = *name;
2114                 }
2115                 if (namlen == 0) {
2116                         if (name == *namep || n > p->body_size)
2117                                 return -1;
2118                         *namep = name;
2119                         return n;
2120                 }
2121                 slen = 0; s = -1;
2122                 for (i = 0; i < p->body_size; i++) {
2123                         if (!subseq_match(nam, namlen, p->body[i]->name))
2124                                 continue;
2125                         if (slen == 0 || p->body[i]->name.len < slen) {
2126                                 s = i;
2127                                 slen = p->body[i]->name.len;
2128                         }
2129                         if (s >= 0 && p->body[i] != p->body[s] &&
2130                             p->body[i]->name.len == p->body[s]->name.len)
2131                                 /* not unique, so s cannot be used */
2132                                 s = -1;
2133                 }
2134                 if (s < 0)
2135                         return -1;
2136                 if (n == 0)
2137                         n = 1;
2138                 for (i = 0; i < p->body_size; i++)
2139                         if (p->body[i] == p->body[s]) {
2140                                 n -= 1;
2141                                 if (n == 0)
2142                                         break;
2143                         }
2144                 if (n > 0 || i == p->body_size)
2145                         return -1;
2146                 *namep = name;
2147                 return i + 1;
2148         }
2149
2150         static void gen_code(struct production *p, FILE *f, struct grammar *g)
2151         {
2152                 char *c;
2153                 char *used = calloc(1, p->body_size);
2154                 int i;
2155
2156                 fprintf(f, "\t\t\t");
2157                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
2158                         int n;
2159                         int use = 0;
2160                         if (*c != '$') {
2161                                 fputc(*c, f);
2162                                 if (*c == '\n')
2163                                         fputs("\t\t\t", f);
2164                                 continue;
2165                         }
2166                         c++;
2167                         if (*c == '<') {
2168                                 use = 1;
2169                                 c++;
2170                         }
2171                         n = choose_sym(&c, p->code.txt + p->code.len - c, p);
2172                         if (n < 0) {
2173                                 fputc('$', f);
2174                                 if (use)
2175                                         fputc('<', f);
2176                                 fputc(*c, f);
2177                                 continue;
2178                         }
2179                         if (n == 0)
2180                                 fprintf(f, "(*(struct %.*s*%s)ret)",
2181                                         p->head->struct_name.len,
2182                                         p->head->struct_name.txt,
2183                                         p->head->isref ? "*":"");
2184                         else if (p->body[n-1]->type == Terminal)
2185                                 fprintf(f, "(*(struct token *)body[%d])",
2186                                         n-1);
2187                         else if (p->body[n-1]->struct_name.txt == NULL)
2188                                 fprintf(f, "$%d", n);
2189                         else {
2190                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2191                                         p->body[n-1]->struct_name.len,
2192                                         p->body[n-1]->struct_name.txt,
2193                                         p->body[n-1]->isref ? "*":"", n-1);
2194                                 used[n-1] = use;
2195                         }
2196                         c -= 1;
2197                 }
2198                 fputs("\n", f);
2199                 for (i = 0; i < p->body_size; i++) {
2200                         if (p->body[i]->struct_name.txt &&
2201                             used[i]) {
2202                                 // assume this has been copied out
2203                                 if (p->body[i]->isref)
2204                                         fprintf(f, "\t\t*(void**)body[%d] = NULL;\n", i);
2205                                 else
2206                                         fprintf(f, "\t\tmemset(body[%d], 0, sizeof(struct %.*s));\n", i, p->body[i]->struct_name.len, p->body[i]->struct_name.txt);
2207                         }
2208                 }
2209                 free(used);
2210         }
2211
2212 ###### functions
2213
2214         static void gen_reduce(FILE *f, struct grammar *g, char *file,
2215                                struct code_node *pre_reduce)
2216         {
2217                 int i;
2218                 fprintf(f, "#line 1 \"gen_reduce\"\n");
2219                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
2220                 fprintf(f, "{\n");
2221                 fprintf(f, "\tint ret_size = 0;\n");
2222                 if (pre_reduce)
2223                         code_node_print(f, pre_reduce, file);
2224
2225                 fprintf(f, "#line 4 \"gen_reduce\"\n");
2226                 fprintf(f, "\tswitch(prod) {\n");
2227                 for (i = 0; i < g->production_count; i++) {
2228                         struct production *p = g->productions[i];
2229                         fprintf(f, "\tcase %d:\n", i);
2230
2231                         if (p->code.txt) {
2232                                 fprintf(f, "#line %d \"%s\"\n", p->code_line, file);
2233                                 gen_code(p, f, g);
2234                         }
2235
2236                         if (p->head->struct_name.txt)
2237                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2238                                         p->head->struct_name.len,
2239                                         p->head->struct_name.txt,
2240                                         p->head->isref ? "*":"");
2241
2242                         fprintf(f, "\t\tbreak;\n");
2243                 }
2244                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2245         }
2246
2247 ### `do_free`
2248
2249 As each non-terminal can potentially cause a different type of data
2250 structure to be allocated and filled in, we need to be able to free it when
2251 done.
2252
2253 It is particularly important to have fine control over freeing during error
2254 recovery where individual stack frames might need to be freed.
2255
2256 For this, the grammar author is required to defined a `free_XX` function for
2257 each structure that is used by a non-terminal.  `do_free` will call whichever
2258 is appropriate given a symbol number, and will call `free` (as is
2259 appropriate for tokens) on any terminal symbol.
2260
2261 ###### functions
2262
2263         static void gen_free(FILE *f, struct grammar *g)
2264         {
2265                 int i;
2266
2267                 fprintf(f, "#line 0 \"gen_free\"\n");
2268                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2269                 fprintf(f, "{\n");
2270                 fprintf(f, "\tif (!asn) return;\n");
2271                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2272                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2273                 fprintf(f, "\tswitch(sym) {\n");
2274
2275                 for (i = 0; i < g->num_syms; i++) {
2276                         struct symbol *s = g->symtab[i];
2277                         if (!s ||
2278                             s->type != Nonterminal ||
2279                             s->struct_name.txt == NULL)
2280                                 continue;
2281
2282                         fprintf(f, "\tcase %d:\n", s->num);
2283                         if (s->isref) {
2284                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2285                                         s->struct_name.len,
2286                                         s->struct_name.txt);
2287                                 fprintf(f, "\t\tfree(asn);\n");
2288                         } else
2289                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2290                                         s->struct_name.len,
2291                                         s->struct_name.txt);
2292                         fprintf(f, "\t\tbreak;\n");
2293                 }
2294                 fprintf(f, "\t}\n}\n\n");
2295         }
2296
2297 ## The main routine.
2298
2299 There are three key parts to the "main" function of parsergen: processing
2300 the arguments, loading the grammar file, and dealing with the grammar.
2301
2302 ### Argument processing.
2303
2304 Command line options allow the selection of analysis mode, name of output
2305 file, and whether or not a report should be generated.  By default we create
2306 a report only if no code output was requested.
2307
2308 The `parse_XX` function name uses the basename of the output file which
2309 should not have a suffix (`.c`).  `.c` is added to the given name for the
2310 code, and `.h` is added for the header (if header text is specifed in the
2311 grammar file).
2312
2313 ###### includes
2314         #include <getopt.h>
2315
2316 ###### declarations
2317         static const struct option long_options[] = {
2318                 { "LR0",        0, NULL, '0' },
2319                 { "LR05",       0, NULL, '5' },
2320                 { "SLR",        0, NULL, 'S' },
2321                 { "LALR",       0, NULL, 'L' },
2322                 { "LR1",        0, NULL, '1' },
2323                 { "tag",        1, NULL, 't' },
2324                 { "report",     0, NULL, 'R' },
2325                 { "output",     1, NULL, 'o' },
2326                 { NULL,         0, NULL, 0   }
2327         };
2328         const char *options = "05SL1t:Ro:";
2329
2330 ###### process arguments
2331         int opt;
2332         char *outfile = NULL;
2333         char *infile;
2334         char *name;
2335         char *tag = NULL;
2336         int report = 1;
2337         enum grammar_type type = LR05;
2338         while ((opt = getopt_long(argc, argv, options,
2339                                   long_options, NULL)) != -1) {
2340                 switch(opt) {
2341                 case '0':
2342                         type = LR0; break;
2343                 case '5':
2344                         type = LR05; break;
2345                 case 'S':
2346                         type = SLR; break;
2347                 case 'L':
2348                         type = LALR; break;
2349                 case '1':
2350                         type = LR1; break;
2351                 case 'R':
2352                         report = 2; break;
2353                 case 'o':
2354                         outfile = optarg; break;
2355                 case 't':
2356                         tag = optarg; break;
2357                 default:
2358                         fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
2359                         exit(1);
2360                 }
2361         }
2362         if (optind < argc)
2363                 infile = argv[optind++];
2364         else {
2365                 fprintf(stderr, "No input file given\n");
2366                 exit(1);
2367         }
2368         if (outfile && report == 1)
2369                 report = 0;
2370         name = outfile;
2371         if (name && strchr(name, '/'))
2372                 name = strrchr(name, '/')+1;
2373
2374         if (optind < argc) {
2375                 fprintf(stderr, "Excess command line arguments\n");
2376                 exit(1);
2377         }
2378
2379 ### Loading the grammar file
2380
2381 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2382 map it.
2383
2384 Once we have extracted the code (with `mdcode`) we expect to find three
2385 or four sections: header, code, grammar, reduce.
2386 Anything else that is not excluded by the `--tag` option is an error.
2387
2388 "header", "code", and "reduce" are optional, though it is hard to build
2389 a working parser without the first two.  "grammar" must be provided.
2390
2391 ###### includes
2392         #include <fcntl.h>
2393         #include <sys/mman.h>
2394         #include <errno.h>
2395
2396 ###### functions
2397         static int errs;
2398         static void pr_err(char *msg)
2399         {
2400                 errs++;
2401                 fprintf(stderr, "%s\n", msg);
2402         }
2403
2404 ###### load file
2405         struct section *table;
2406         int fd;
2407         int len;
2408         char *file;
2409         fd = open(infile, O_RDONLY);
2410         if (fd < 0) {
2411                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2412                         infile, strerror(errno));
2413                 exit(1);
2414         }
2415         len = lseek(fd, 0, 2);
2416         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2417         table = code_extract(file, file+len, pr_err);
2418
2419         struct code_node *hdr = NULL;
2420         struct code_node *code = NULL;
2421         struct code_node *gram = NULL;
2422         struct code_node *pre_reduce = NULL;
2423         for (s = table; s; s = s->next) {
2424                 struct text sec = s->section;
2425                 if (tag && !strip_tag(&sec, tag))
2426                         continue;
2427                 if (text_is(sec, "header"))
2428                         hdr = s->code;
2429                 else if (text_is(sec, "code"))
2430                         code = s->code;
2431                 else if (text_is(sec, "grammar"))
2432                         gram = s->code;
2433                 else if (text_is(sec, "reduce"))
2434                         pre_reduce = s->code;
2435                 else {
2436                         fprintf(stderr, "Unknown content section: %.*s\n",
2437                                 s->section.len, s->section.txt);
2438                         rv |= 2;
2439                 }
2440         }
2441
2442 ### Processing the input
2443
2444 As we need to append an extention to a filename and then open it for
2445 writing, and we need to do this twice, it helps to have a separate function.
2446
2447 ###### functions
2448
2449         static FILE *open_ext(char *base, char *ext)
2450         {
2451                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2452                 FILE *f;
2453                 strcat(strcpy(fn, base), ext);
2454                 f = fopen(fn, "w");
2455                 free(fn);
2456                 return f;
2457         }
2458
2459 If we can read the grammar, then we analyse and optionally report on it.  If
2460 the report finds conflicts we will exit with an error status.
2461
2462 ###### process input
2463         struct grammar *g = NULL;
2464         if (gram == NULL) {
2465                 fprintf(stderr, "No grammar section provided\n");
2466                 rv |= 2;
2467         } else {
2468                 g = grammar_read(gram);
2469                 if (!g) {
2470                         fprintf(stderr, "Failure to parse grammar\n");
2471                         rv |= 2;
2472                 }
2473         }
2474         if (g) {
2475                 grammar_analyse(g, type);
2476                 if (report)
2477                         if (grammar_report(g, type))
2478                                 rv |= 1;
2479         }
2480
2481 If a "headers" section is defined, we write it out and include a declaration
2482 for the `parse_XX` function so it can be used from separate code.
2483
2484         if (rv == 0 && hdr && outfile) {
2485                 FILE *f = open_ext(outfile, ".h");
2486                 if (f) {
2487                         code_node_print(f, hdr, infile);
2488                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2489                                 name);
2490                         fclose(f);
2491                 } else {
2492                         fprintf(stderr, "Cannot create %s.h\n",
2493                                 outfile);
2494                         rv |= 4;
2495                 }
2496         }
2497
2498 And if all goes well and an output file was provided, we create the `.c`
2499 file with the code section (if any) and the parser tables and function.
2500
2501         if (rv == 0 && outfile) {
2502                 FILE *f = open_ext(outfile, ".c");
2503                 if (f) {
2504                         if (code)
2505                                 code_node_print(f, code, infile);
2506                         gen_parser(f, g, infile, name, pre_reduce);
2507                         fclose(f);
2508                 } else {
2509                         fprintf(stderr, "Cannot create %s.c\n",
2510                                 outfile);
2511                         rv |= 4;
2512                 }
2513         }
2514
2515 And that about wraps it up.  We need to set the locale so that UTF-8 is
2516 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2517
2518 ###### File: parsergen.mk
2519         parsergen : parsergen.o libscanner.o libmdcode.o
2520                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2521
2522 ###### includes
2523         #include <locale.h>
2524
2525 ###### main
2526
2527         int main(int argc, char *argv[])
2528         {
2529                 struct section *s;
2530                 int rv = 0;
2531
2532                 setlocale(LC_ALL,"");
2533
2534                 ## process arguments
2535                 ## load file
2536                 ## process input
2537
2538                 return rv;
2539         }
2540
2541 ## The SHIFT/REDUCE parser
2542
2543 Having analysed the grammar and generated all the tables, we only need
2544 the shift/reduce engine to bring it all together.
2545
2546 ### Goto table lookup
2547
2548 The parser generator has nicely provided us with goto tables sorted by
2549 symbol number.  We need a binary search function to find a symbol in the
2550 table.
2551
2552 ###### parser functions
2553
2554         static int search(const struct state *l, int sym)
2555         {
2556                 int lo = 0;
2557                 int hi = l->go_to_cnt;
2558
2559                 if (hi == 0)
2560                         return -1;
2561                 while (lo + 1 < hi) {
2562                         int mid = (lo + hi) / 2;
2563                         if (l->go_to[mid].sym <= sym)
2564                                 lo = mid;
2565                         else
2566                                 hi = mid;
2567                 }
2568                 if (l->go_to[lo].sym == sym)
2569                         return l->go_to[lo].state;
2570                 else
2571                         return -1;
2572         }
2573
2574 ### Memory allocation
2575
2576 The `scanner` returns tokens in a local variable - we want them in allocated
2577 memory so they can live in the `asn_stack`.  So we provide `tok_copy` to
2578 make an allocated copy as required.
2579
2580 ###### parser includes
2581         #include <memory.h>
2582
2583 ###### parser functions
2584
2585         static struct token *tok_copy(struct token tk)
2586         {
2587                 struct token *new = malloc(sizeof(*new));
2588                 *new = tk;
2589                 return new;
2590         }
2591
2592 ### The state stack.
2593
2594 The core data structure for the parser is the stack.  This tracks all
2595 the symbols that have been recognised or partially recognised.
2596
2597 The stack usually won't grow very large - maybe a few tens of entries.
2598 So we dynamically grow an array as required but never bother to
2599 shrink it down again.
2600
2601 We keep the stack as two separate allocations.  One, `asn_stack` stores
2602 the "abstract syntax nodes" which are created by each reduction.  When
2603 we call `do_reduce` we need to pass an array of the `asn`s of the body
2604 of the production, and by keeping a separate `asn` stack, we can just
2605 pass a pointer into this stack.
2606
2607 The other allocation stores all other stack fields of which there are
2608 several.  The `state` is the most important one and guides the parsing
2609 process.  The `sym` is nearly unnecessary as it is implicit in the
2610 state.  However when we want to free entries from the `asn_stack`, it
2611 helps to know what type they are so we can call the right freeing
2612 function.  The symbol leads us to the right free function through
2613 `do_free`.
2614
2615 The stack is most properly seen as alternating states and symbols -
2616 states, like the 'DOT' in items, are between symbols.  Each frame in
2617 our stack holds a state and the symbol that was before it.  The
2618 bottom of stack holds the start state but no symbol, as nothing came
2619 before the beginning.  As we need to store some value, `TK_eof` is used
2620 to mark the beginning of the file as well as the end.
2621
2622 ###### parser functions
2623
2624         struct parser {
2625                 struct frame {
2626                         short state;
2627                         short sym;
2628                 } *stack;
2629                 void **asn_stack;
2630                 int stack_size;
2631                 int tos;
2632         };
2633
2634 #### Shift and pop
2635
2636 Two operations are needed on the stack - shift (which is like push) and pop.
2637
2638 Shift applies not only to terminals but also to non-terminals.  When we
2639 reduce a production we will pop off frames corresponding to the body
2640 symbols, then push on a frame for the head of the production.  This last
2641 is exactly the same process as shifting in a terminal so we use the same
2642 function for both.  In both cases we provide the symbol.  The state is
2643 deduced from the current top-of-stack state and the new symbol.
2644
2645 To simplify other code we arrange for `shift` to fail if there is no `goto`
2646 state for the symbol.  This is useful in basic parsing due to our design
2647 that we shift when we can, and reduce when we cannot.  So the `shift`
2648 function reports if it could.
2649
2650 `shift` is also used to push state zero onto the stack, so if the
2651 stack is empty, it always chooses zero as the next state.
2652
2653 So `shift` finds the next state.  If that succeeds it extends the
2654 allocations if needed and pushes all the information onto the stacks.
2655
2656 ###### parser functions
2657
2658         static int shift(struct parser *p,
2659                          short sym, void *asn,
2660                          const struct state states[])
2661         {
2662                 struct frame next = {0};
2663                 int newstate = p->tos
2664                         ? search(&states[p->stack[p->tos-1].state],
2665                                  sym)
2666                         : 0;
2667                 if (newstate < 0)
2668                         return 0;
2669                 if (p->tos >= p->stack_size) {
2670                         p->stack_size += 10;
2671                         p->stack = realloc(p->stack, p->stack_size
2672                                            * sizeof(p->stack[0]));
2673                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2674                                            * sizeof(p->asn_stack[0]));
2675                 }
2676                 next.sym = sym;
2677                 next.state = newstate;
2678
2679                 p->stack[p->tos] = next;
2680                 p->asn_stack[p->tos] = asn;
2681                 p->tos++;
2682                 return 1;
2683         }
2684
2685 `pop` primarily moves the top of stack (`tos`) back down the required
2686 amount and frees any `asn` entries that need to be freed.  It is called
2687 _after_ we reduce a production, just before we `shift` the nonterminal
2688 in.
2689
2690 ###### parser functions
2691
2692         static void pop(struct parser *p, int num,
2693                         void(*do_free)(short sym, void *asn))
2694         {
2695                 int i;
2696
2697                 p->tos -= num;
2698                 for (i = 0; i < num; i++)
2699                         do_free(p->stack[p->tos+i].sym, p->asn_stack[p->tos+i]);
2700         }
2701
2702 ### The heart of the parser.
2703
2704 Now we have the parser.  For each token we might shift it, trigger a
2705 reduction, or start error handling.  2D tokens (IN, OUT, EOL) also need
2706 to be handled.
2707
2708 We return whatever `asn` was returned by reducing production zero.
2709
2710 If we can neither shift nor reduce we have an error to handle.  We pop
2711 single entries off the stack until we can shift the `TK_error` symbol,
2712 then drop input tokens until we find one we can shift into the new error
2713 state.  We need to ensure that something is dropped or shifted after an
2714 error, or we could get into an infinite loop, only shifting in
2715 `TK_error`, then reducing.  So we track if there has been a shift since
2716 the last error, and if not the next error always discards one token.
2717
2718 When we find `TK_in` and `TK_out` tokens which report indents we need
2719 to handle them directly as the grammar cannot express what we want to
2720 do with them.
2721
2722 `TK_in` tokens are easy: we simply update indent count in the top stack frame to
2723 record how many indents there are following the previous token.
2724
2725 `TK_out` tokens must be canceled against an indent count
2726 within the stack.  If we can reduce some symbols that are all since
2727 the most recent indent, then we do that first.  If the minimum prefix
2728 of the current state then extends back before the most recent indent,
2729 that indent can be cancelled.  If the minimum prefix is shorter then
2730 the indent had ended prematurely and we must start error handling, which
2731 is still a work-in-progress.
2732
2733 `TK_newline` tokens are ignored unless the top stack frame records
2734 that they are permitted.  In that case they will not be considered for
2735 shifting if it is possible to reduce some symbols that are all since
2736 the most recent start of line.  This is how a newline forcibly
2737 terminates any line-like structure - we try to reduce down to at most
2738 one symbol for each line where newlines are allowed.
2739 A consequence of this is that a rule like
2740
2741 ###### Example: newlines - broken
2742
2743         Newlines ->
2744                 | NEWLINE Newlines
2745         IfStatement -> Newlines if ....
2746
2747 cannot work, as the NEWLINE will never be shifted as the empty string
2748 will be reduced first.  Optional sets of newlines need to be include
2749 in the thing that preceed:
2750
2751 ###### Example: newlines - works
2752
2753         If -> if
2754                 | NEWLINE If
2755         IfStatement -> If ....
2756
2757 Here the NEWLINE will be shifted because nothing can be reduced until
2758 the `if` is seen.
2759
2760 When during error handling we discard tokens read in, we want to keep
2761 discarding until we see one that is recognised.  If we had a full set
2762 of LR(1) grammar states, this would mean looking in the look-ahead set,
2763 but we don't keep a full look-ahead set.  We only record the subset
2764 that leads to SHIFT.  We can, however, deduce the look-ahead set by
2765 looking at the SHIFT subsets for all states that we can get to by
2766 reducing zero or more times.  So we need a little function which
2767 checks if a given token is in any of these look-ahead sets.
2768
2769 ###### parser includes
2770         #include "parser.h"
2771
2772 ###### parser_run
2773
2774         static int in_lookahead(struct token *tk, const struct state *states, int state)
2775         {
2776                 while (state >= 0) {
2777                         if (search(&states[state], tk->num) >= 0)
2778                                 return 1;
2779                         if (states[state].reduce_prod < 0)
2780                                 return 0;
2781                         state = search(&states[state], states[state].reduce_sym);
2782                 }
2783                 return 0;
2784         }
2785
2786         void *parser_run(struct token_state *tokens,
2787                          const struct state states[],
2788                          int (*do_reduce)(int, void**, struct token_config*, void*),
2789                          void (*do_free)(short, void*),
2790                          FILE *trace, const char *non_term[],
2791                          struct token_config *config)
2792         {
2793                 struct parser p = { 0 };
2794                 struct token *tk = NULL;
2795                 int accepted = 0;
2796                 int shift_since_err = 1;
2797                 void *ret = NULL;
2798
2799                 shift(&p, TK_eof, NULL, states);
2800                 while (!accepted && p.tos > 0) {
2801                         struct token *err_tk;
2802                         struct frame *tos = &p.stack[p.tos-1];
2803                         if (!tk)
2804                                 tk = tok_copy(token_next(tokens));
2805                         parser_trace(trace, &p,
2806                                      tk, states, non_term, config->known_count);
2807
2808                         if (tk->num == TK_in) {
2809                                 free(tk);
2810                                 tk = NULL;
2811                                 parser_trace_action(trace, "Record");
2812                                 continue;
2813                         }
2814                         if (tk->num == TK_out) {
2815                                 if (1) {
2816                                         // OK to cancel
2817
2818                                         free(tk);
2819                                         tk = NULL;
2820                                         parser_trace_action(trace, "Cancel");
2821                                         continue;
2822                                 }
2823                                 // fall through to error handling as both SHIFT and REDUCE
2824                                 // will fail.
2825                         }
2826                         if (tk->num == TK_newline) {
2827                                 if (1) {
2828                                         free(tk);
2829                                         tk = NULL;
2830                                         parser_trace_action(trace, "Discard");
2831                                         continue;
2832                                 }
2833                         }
2834                         if (shift(&p, tk->num, tk, states)) {
2835                                 shift_since_err = 1;
2836                                 tk = NULL;
2837                                 parser_trace_action(trace, "Shift");
2838                                 continue;
2839                         }
2840
2841                         if (states[tos->state].reduce_prod >= 0) {
2842                                 void **body;
2843                                 void *res;
2844                                 const struct state *nextstate = &states[tos->state];
2845                                 int prod = nextstate->reduce_prod;
2846                                 int size = nextstate->reduce_size;
2847                                 int res_size = nextstate->result_size;
2848
2849                                 body = p.asn_stack + (p.tos - size);
2850                                 res = res_size ? calloc(1, res_size) : NULL;
2851                                 res_size = do_reduce(prod, body, config, res);
2852                                 if (res_size != nextstate->result_size)
2853                                         abort();
2854
2855                                 pop(&p, size, do_free);
2856
2857                                 if (!shift(&p, nextstate->reduce_sym,
2858                                            res, states)) {
2859                                         if (prod != 0) abort();
2860                                         accepted = 1;
2861                                         ret = res;
2862                                 }
2863                                 parser_trace_action(trace, "Reduce");
2864                                 continue;
2865                         }
2866                         /* Error. We walk up the stack until we
2867                          * find a state which will accept TK_error.
2868                          * We then shift in TK_error and see what state
2869                          * that takes us too.
2870                          * Then we discard input tokens until
2871                          * we find one that is acceptable.
2872                          */
2873                         parser_trace_action(trace, "ERROR");
2874
2875                         err_tk = tok_copy(*tk);
2876                         while (p.tos > 0 &&
2877                                shift(&p, TK_error, err_tk, states) == 0)
2878                                 // discard this state
2879                                 pop(&p, 1, do_free);
2880                         if (p.tos == 0) {
2881                                 free(err_tk);
2882                                 // no state accepted TK_error
2883                                 break;
2884                         }
2885                         if (!shift_since_err) {
2886                                 /* must discard at least one token to avoid
2887                                  * infinite loop.
2888                                  */
2889                                 if (tk->num == TK_eof)
2890                                         break;
2891                                 free(tk);
2892                                 tk = tok_copy(token_next(tokens));
2893                         }
2894                         shift_since_err = 0;
2895                         tos = &p.stack[p.tos-1];
2896                         while (!in_lookahead(tk, states, tos->state) &&
2897                                tk->num != TK_eof) {
2898                                 free(tk);
2899                                 tk = tok_copy(token_next(tokens));
2900                                 shift_since_err = 1;
2901                         }
2902                 }
2903                 free(tk);
2904                 pop(&p, p.tos, do_free);
2905                 free(p.asn_stack);
2906                 free(p.stack);
2907                 return ret;
2908         }
2909
2910 ###### exported functions
2911         void *parser_run(struct token_state *tokens,
2912                          const struct state states[],
2913                          int (*do_reduce)(int, void**, struct token_config*, void*),
2914                          void (*do_free)(short, void*),
2915                          FILE *trace, const char *non_term[],
2916                          struct token_config *config);
2917
2918 ### Tracing
2919
2920 Being able to visualize the parser in action can be invaluable when
2921 debugging the parser code, or trying to understand how the parse of a
2922 particular grammar progresses.  The stack contains all the important
2923 state, so just printing out the stack every time around the parse loop
2924 can make it possible to see exactly what is happening.
2925
2926 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2927 are easily deduced from the change between consecutive lines, and the
2928 details of each state can be found by cross referencing the states list
2929 in the stack with the "report" that parsergen can generate.
2930
2931 For terminal symbols, we just dump the token.  For non-terminals we
2932 print the name of the symbol.  The look ahead token is reported at the
2933 end inside square brackets.
2934
2935 ###### parser functions
2936
2937         static char *reserved_words[] = {
2938                 [TK_error]        = "ERROR",
2939                 [TK_in]           = "IN",
2940                 [TK_out]          = "OUT",
2941                 [TK_newline]      = "NEWLINE",
2942                 [TK_eof]          = "$eof",
2943         };
2944
2945         void parser_trace(FILE *trace, struct parser *p,
2946                           struct token *tk, const struct state states[],
2947                           const char *non_term[], int knowns)
2948         {
2949                 int i;
2950                 if (!trace)
2951                         return;
2952                 for (i = 0; i < p->tos; i++) {
2953                         struct frame *f = &p->stack[i];
2954                         if (i) {
2955                                 int sym = f->sym;
2956                                 if (sym < TK_reserved &&
2957                                     reserved_words[sym] != NULL)
2958                                         fputs(reserved_words[sym], trace);
2959                                 else if (sym < TK_reserved + knowns) {
2960                                         struct token *t = p->asn_stack[i];
2961                                         text_dump(trace, t->txt, 20);
2962                                 } else
2963                                         fputs(non_term[sym - TK_reserved - knowns],
2964                                               trace);
2965                                 fputs(" ", trace);
2966                         }
2967                         fprintf(trace, "(%d) ", f->state);
2968                 }
2969                 fprintf(trace, "[");
2970                 if (tk->num < TK_reserved &&
2971                     reserved_words[tk->num] != NULL)
2972                         fputs(reserved_words[tk->num], trace);
2973                 else
2974                         text_dump(trace, tk->txt, 20);
2975                 fprintf(trace, ":%d:%d]", tk->line, tk->col);
2976         }
2977
2978         void parser_trace_action(FILE *trace, char *action)
2979         {
2980                 if (trace)
2981                         fprintf(trace, " - %s\n", action);
2982         }
2983
2984 # A Worked Example
2985
2986 The obvious example for a parser is a calculator.
2987
2988 As `scanner` provides number parsing function using `libgmp` it is not much
2989 work to perform arbitrary rational number calculations.
2990
2991 This calculator takes one expression, or an equality test, per line.
2992 The results are printed and if any equality test fails, the program
2993 exits with an error.
2994
2995 ###### File: parsergen.mk
2996         calc.c calc.h : parsergen parsergen.mdc
2997                 ./parsergen --tag calc -o calc parsergen.mdc
2998         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2999                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
3000         calctest : calc
3001                 ./calc parsergen.mdc
3002         demos :: calctest
3003
3004 # calc: header
3005
3006         #include "parse_number.h"
3007         // what do we use for a demo-grammar?  A calculator of course.
3008         struct number {
3009                 mpq_t val;
3010                 char tail[2];
3011                 int err;
3012         };
3013
3014 # calc: code
3015
3016         #include <stdlib.h>
3017         #include <unistd.h>
3018         #include <fcntl.h>
3019         #include <sys/mman.h>
3020         #include <stdio.h>
3021         #include <malloc.h>
3022         #include <gmp.h>
3023         #include <string.h>
3024         #include "mdcode.h"
3025         #include "scanner.h"
3026         #include "parser.h"
3027
3028         #include "calc.h"
3029
3030         static void free_number(struct number *n)
3031         {
3032                 mpq_clear(n->val);
3033                 free(n);
3034         }
3035
3036         static int text_is(struct text t, char *s)
3037         {
3038                 return (strlen(s) == t.len &&
3039                         strncmp(s, t.txt, t.len) == 0);
3040         }
3041
3042         int main(int argc, char *argv[])
3043         {
3044                 int fd = open(argv[1], O_RDONLY);
3045                 int len = lseek(fd, 0, 2);
3046                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
3047                 struct section *table = code_extract(file, file+len, NULL);
3048                 struct section *s;
3049                 struct token_config config = {
3050                         .ignored = (1 << TK_line_comment)
3051                                  | (1 << TK_in)
3052                                  | (1 << TK_out),
3053                         .number_chars = ".,_+-",
3054                         .word_start = "",
3055                         .word_cont = "",
3056                 };
3057                 for (s = table; s; s = s->next)
3058                         if (text_is(s->section, "example: input"))
3059                                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
3060                 while (table) {
3061                         struct section *t = table->next;
3062                         code_free(table->code);
3063                         free(table);
3064                         table = t;
3065                 }
3066                 exit(0);
3067         }
3068
3069 # calc: grammar
3070
3071         $LEFT + -
3072         $LEFT * / //
3073
3074         Session -> Session Line
3075                 | Line
3076
3077         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
3078                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
3079                                         gmp_printf("  or as a decimal: %Fg\n", fl);
3080                                         mpf_clear(fl);
3081                                         }
3082                                      }$
3083                 | Expression = Expression NEWLINE ${
3084                         if (mpq_equal($1.val, $3.val))
3085                                 gmp_printf("Both equal %Qd\n", $1.val);
3086                         else {
3087                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
3088                                         $1.val, $3.val);
3089                                 exit(1);
3090                         }
3091                 }$
3092                 | NEWLINE ${ printf("Blank line\n"); }$
3093                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
3094
3095         $number
3096         Expression -> Expression + Expression ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
3097                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
3098                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
3099                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
3100                 | Expression // Expression ${ {
3101                         mpz_t z0, z1, z2;
3102                         mpq_init($0.val);
3103                         mpz_init(z0); mpz_init(z1); mpz_init(z2);
3104                         mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
3105                         mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
3106                         mpz_tdiv_q(z0, z1, z2);
3107                         mpq_set_z($0.val, z0);
3108                         mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
3109                 } }$
3110                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
3111                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
3112
3113 # example: input
3114
3115         355/113
3116         3.1415926535 - 355/113
3117         2 + 4 * 5
3118         1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
3119         10 * 9 / 2
3120         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
3121
3122         355//113
3123
3124         error