ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 There are several distinct sections.
   8
   9 1. `grammar_read` will read a grammar from a literate-code file and
  10    store the productions, symbols, and code fragments.
  11 2. `grammar_analyse` will take that grammar and build LR parsing
  12    states and look-ahead tables.
  13 3. `grammar_report` will present the details of the analysis
  14    in a readable format and will report any conflicts.
  15 4. `parser_generate` will write out C code files with various
  16    tables and with the code fragments arranged in useful places.
  17 5. `parser_run` is a library function intended to be linked together
  18    with the generated parser tables to complete the implementation of
  19    a parser.
  20 6. `calc.cgm` is a test grammar for a simple calculator.
  21
  22 ###### File: parsergen.c
  23         #include <unistd.h>
  24         #include <stdlib.h>
  25         #include <stdio.h>
  26         ## includes
  27         ## forward declarations
  28         ## declarations
  29         ## functions
  30         ## grammar_read
  31         ## grammar_analyse
  32         ## grammar_report
  33         ## parser_generate
  34         ## main
  35 ###### File: parser.h
  36         ## exported types
  37         ## exported functions
  38 ###### File: libparser.c
  39         #include <unistd.h>
  40         #include <stdlib.h>
  41         #include <stdio.h>
  42         ## parser includes
  43         ## parser
  44 ###### File: calc.cgm
  45         ## demo grammar
  46 ###### File: parsergen.mk
  47         CFLAGS += -Wall -g
  48         all :: parsergen calc
  49         parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
  50                 ./md2c parsergen.mdc
  51
  52 ## Reading the grammar
  53
  54 The grammar must be stored in a markdown literate code file as parsed
  55 by "[mdcode][]".  It must have three top level (i.e. unreferenced)
  56 sections: `header`, `code`, and `grammar`.  The first two will be
  57 literally copied into the generated `.c` and `.h`. files.  The last
  58 contains the grammar.  This is tokenised with "[scanner][]".
  59
  60 [mdcode]: mdcode.html
  61 [scanner]: scanner.html
  62
  63 ###### includes
  64         #include "mdcode.h"
  65         #include "scanner.h"
  66
  67 ###### parser includes
  68         #include "mdcode.h"
  69         #include "scanner.h"
  70
  71 The grammar contains production sets, precedence/associativity
  72 declarations, and data type declarations.  These are all parsed with
  73 _ad hoc_ parsing as we don't have a parser generator yet.
  74
  75 The precedence and associativity can be set for each production, but
  76 can be inherited from symbols.  The data type is potentially defined
  77 for each non-terminal and describes what C structure is used to store
  78 information about each symbol.
  79
  80 ###### declarations
  81         enum assoc {Left, Right, Non};
  82         char *assoc_names[] = {"Left","Right","Non"};
  83
  84         struct symbol {
  85                 struct text struct_name;
  86                 enum assoc assoc;
  87                 unsigned short precedence;
  88                 ## symbol fields
  89         };
  90         struct production {
  91                 unsigned short precedence;
  92                 enum assoc assoc;
  93                 ## production fields
  94         };
  95         struct grammar {
  96                 ## grammar fields
  97         };
  98
  99 The strings reported by `mdcode` and `scanner` are `struct text` which have
 100 length rather than being null terminated.  To help with printing an
 101 comparing we define `text_is` and `prtxt`, which should possibly go in
 102 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 103 which might contain control characters.
 104
 105 ###### functions
 106         static int text_is(struct text t, char *s)
 107         {
 108                 return (strlen(s) == t.len &&
 109                         strncmp(s, t.txt, t.len) == 0);
 110         }
 111         static void prtxt(struct text t)
 112         {
 113                 printf("%.*s", t.len, t.txt);
 114         }
 115
 116 ### Symbols
 117
 118 Productions are comprised primarily of symbols - terminal and
 119 non-terminal.  We do not make a syntactic distinction between the two,
 120 though non-terminals must be identifiers.  Non-terminal symbols are
 121 those which appear in the head of a production, terminal symbols are
 122 those which don't.  There are also "virtual" symbols used for precedence
 123 marking discussed later, and sometimes we won't know what type a symbol
 124 is yet.
 125
 126 ###### forward declarations
 127         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 128         char *symtypes = "UVTN";
 129 ###### symbol fields
 130         enum symtype type;
 131
 132 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 133 table of known symbols and the resulting parser will report them as
 134 `TK_reserved + N`.  A small set of identifiers are reserved for the
 135 different token types that `scanner` can report.
 136
 137 ###### declarations
 138         static char *reserved_words[] = {
 139                 [TK_error]        = "ERROR",
 140                 [TK_number]       = "NUMBER",
 141                 [TK_ident]        = "IDENTIFIER",
 142                 [TK_mark]         = "MARK",
 143                 [TK_string]       = "STRING",
 144                 [TK_multi_string] = "MULTI_STRING",
 145                 [TK_indent]       = "INDENT",
 146                 [TK_undent]       = "UNDENT",
 147                 [TK_newline]      = "NEWLINE",
 148                 [TK_eof]          = "$eof",
 149         };
 150 ###### symbol fields
 151         short num;
 152
 153 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 154 recognised.  The former is automatically expected at the end of the text
 155 being parsed. The latter are always ignored by the parser.
 156
 157 All of these symbols are stored in a simple symbol table.  We use the
 158 `struct text` from `mdcode` to store the name and link them together
 159 into a sorted list using an insertion sort.
 160
 161 We don't have separate `find` and `insert` functions as any symbol we
 162 find needs to be remembered.  We simply expect `find` to always return a
 163 symbol, but its type might be `Unknown`.
 164
 165 ###### includes
 166         #include <string.h>
 167
 168 ###### symbol fields
 169         struct text name;
 170         struct symbol *next;
 171
 172 ###### grammar fields
 173         struct symbol *syms;
 174         int num_syms;
 175
 176 ###### functions
 177         static int text_cmp(struct text a, struct text b)
 178         {
 179                 int len = a.len;
 180                 if (a.len > b.len)
 181                         len = b.len;
 182                 int cmp = strncmp(a.txt, b.txt, len);
 183                 if (cmp)
 184                         return cmp;
 185                 else
 186                         return a.len - b.len;
 187         }
 188
 189         static struct symbol *sym_find(struct grammar *g, struct text s)
 190         {
 191                 struct symbol **l = &g->syms;
 192                 struct symbol *n;
 193                 int cmp = 1;
 194
 195                 while (*l &&
 196                         (cmp = text_cmp((*l)->name, s)) < 0)
 197                                 l = & (*l)->next;
 198                 if (cmp == 0)
 199                         return *l;
 200                 n = calloc(1, sizeof(*n));
 201                 n->name = s;
 202                 n->type = Unknown;
 203                 n->next = *l;
 204                 n->num = -1;
 205                 *l = n;
 206                 return n;
 207         }
 208
 209         static void symbols_init(struct grammar *g)
 210         {
 211                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 212                 int i;
 213                 for (i = 0; i < entries; i++) {
 214                         struct text t;
 215                         struct symbol *s;
 216                         t.txt = reserved_words[i];
 217                         if (!t.txt)
 218                                 continue;
 219                         t.len = strlen(t.txt);
 220                         s = sym_find(g, t);
 221                         s->type = Terminal;
 222                         s->num = i;
 223                 }
 224         }
 225
 226 ### Data types and precedence.
 227
 228 Data type specification and precedence specification are both
 229 introduced by a dollar sign at the start of the line.  If the next
 230 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 231 precedence, otherwise it specifies a data type.
 232
 233 The data type name is simply stored and applied to the head of all
 234 subsequent productions.  It must be the name of a structure, so `$expr`
 235 maps to `struct expr`.
 236
 237 The precedence line must contain a list of symbols - typically
 238 terminal symbols, but not necessarily.  It can only contain symbols
 239 that have not been seen yet, so precedence declaration must precede
 240 the productions that they affect.
 241
 242 A precedence line may also contain a "virtual" symbol which is an
 243 identifier preceded by `$$`.  Virtual terminals carry precedence
 244 information but are not included in the grammar.  A production can
 245 declare that it inherits the precedence of a given virtual symbol.
 246
 247 This use for `$$` precludes it from being used as a symbol in the
 248 described language.  Two other symbols: `${` and `}$` are also
 249 unavailable.
 250
 251 Each new precedence line introduces a new precedence level and
 252 declares how it associates.  This level is stored in each symbol
 253 listed and may be inherited by any production which uses the symbol.  A
 254 production inherits from the last symbol which has a precedence.
 255
 256 ###### grammar fields
 257         struct text current_type;
 258         int prec_levels;
 259
 260 ###### declarations
 261         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 262         static const char *known[] = { "$$", "${", "}$" };
 263
 264 ###### functions
 265         static char *dollar_line(struct token_state *ts, struct grammar *g)
 266         {
 267                 struct token t = token_next(ts);
 268                 char *err;
 269                 enum assoc assoc;
 270                 int found;
 271
 272                 if (t.num != TK_ident) {
 273                         err = "type or assoc expected after '$'";
 274                         goto abort;
 275                 }
 276                 if (text_is(t.txt, "LEFT"))
 277                         assoc = Left;
 278                 else if (text_is(t.txt, "RIGHT"))
 279                         assoc = Right;
 280                 else if (text_is(t.txt, "NON"))
 281                         assoc = Non;
 282                 else {
 283                         g->current_type = t.txt;
 284                         t = token_next(ts);
 285                         if (t.num != TK_newline) {
 286                                 err = "Extra tokens after type name";
 287                                 goto abort;
 288                         }
 289                         return NULL;
 290                 }
 291
 292                 // This is a precedence line, need some symbols.
 293                 found = 0;
 294                 g->prec_levels += 1;
 295                 t = token_next(ts);
 296                 while (t.num != TK_newline) {
 297                         enum symtype type = Terminal;
 298                         struct symbol *s;
 299                         if (t.num == TK_virtual) {
 300                                 type = Virtual;
 301                                 t = token_next(ts);
 302                                 if (t.num != TK_ident) {
 303                                         err = "$$ must be followed by a word";
 304                                         goto abort;
 305                                 }
 306                         } else if (t.num != TK_ident &&
 307                                    t.num != TK_mark) {
 308                                 err = "Illegal token in precedence line";
 309                                 goto abort;
 310                         }
 311                         s = sym_find(g, t.txt);
 312                         if (s->type != Unknown) {
 313                                 err = "Symbols in precedence line must not already be known.";
 314                                 goto abort;
 315                         }
 316                         s->type = type;
 317                         s->precedence = g->prec_levels;
 318                         s->assoc = assoc;
 319                         found += 1;
 320                 }
 321                 if (found == 0)
 322                         err = "No symbols given on precedence line";
 323                         goto abort;
 324                 return NULL;
 325         abort:
 326                 while (t.num != TK_newline && t.num != TK_eof)
 327                         t = token_next(ts);
 328                 return err;
 329         }
 330
 331 ### Productions
 332
 333 A production either starts with an identifier which is the head
 334 non-terminal, or a vertical bar (`|`) in which case this production
 335 uses the same head as the previous one.  The identifier must be
 336 followed by a `->` mark.  All productions for a given non-terminal must
 337 be grouped together with the `nonterminal ->` given only once.
 338
 339 After this a (possibly empty) sequence of identifiers and marks form
 340 the body of the production.  A virtual symbol may be given after the
 341 body by preceding it with `$$`.  If a virtual symbol is given, the
 342 precedence of the production is that for the virtual symbol.  If none
 343 is given, the precedence is inherited from the last symbol in the
 344 production which has a precedence specified.
 345
 346 After the optional precedence may come  the `${` mark.  This indicates
 347 the start of a code fragment.  If present, this must be on the same
 348 line as the start of the production.
 349
 350 All of the text from the `${` through to the matching `}$` is
 351 collected and forms the code-fragment for the production.  It must all
 352 be in one `code_node` of the literate code.  The `}$` must be
 353 at the end of a line.
 354
 355 Text in the code fragment will undergo substitutions where `$N` for
 356 some numeric `N` will be replaced with a variable holding the parse
 357 information for the particular symbol in the production.  `$0` is the
 358 head of the production, `$1` is the first symbol of the body, etc.
 359 The type of `$N` for a terminal symbol is `struct token`.  For
 360 non-terminal, it is whatever has been declared for that symbol.
 361
 362 While building productions we will need to add to an array which needs to
 363 grow dynamically.
 364
 365 ###### functions
 366         static void array_add(void *varray, int *cnt, void *new)
 367         {
 368                 void ***array = varray;
 369                 int current = 0;
 370                 const int step = 8;
 371                 current = ((*cnt-1) | (step-1))+1;
 372                 if (*cnt == current) {
 373                         /* must grow */
 374                         current += step;
 375                         *array = realloc(*array, current * sizeof(void*));
 376                 }
 377                 (*array)[*cnt] = new;
 378                 (*cnt) += 1;
 379         }
 380
 381 Collecting the code fragment simply involves reading tokens until we
 382 hit the end token `}$`, and noting the character position of the start and
 383 the end.
 384
 385 ###### functions
 386         static struct text collect_code(struct token_state *state,
 387                                         struct token start)
 388         {
 389                 struct text code;
 390                 struct token t;
 391                 code.txt = start.txt.txt + start.txt.len;
 392                 do
 393                         t = token_next(state);
 394                 while (t.node == start.node &&
 395                        t.num != TK_close && t.num != TK_error &&
 396                        t.num != TK_eof);
 397                 if (t.num == TK_close && t.node == start.node)
 398                         code.len = t.txt.txt - code.txt;
 399                 else
 400                         code.txt = NULL;
 401                 return code;
 402         }
 403
 404 Now we have all the bit we need to parse a full production.
 405
 406 ###### includes
 407         #include <memory.h>
 408
 409 ###### grammar fields
 410         struct production **productions;
 411         int production_count;
 412
 413 ###### production fields
 414         struct symbol  *head;
 415         struct symbol **body;
 416         int             body_size;
 417         struct text     code;
 418
 419 ###### symbol fields
 420         int first_production;
 421
 422 ###### functions
 423         static char *parse_production(struct grammar *g,
 424                                       struct symbol *head,
 425                                       struct token_state *state)
 426         {
 427                 /* Head has already been parsed. */
 428                 struct token tk;
 429                 char *err;
 430                 struct production p, *pp;
 431
 432                 memset(&p, 0, sizeof(p));
 433                 p.head = head;
 434                 tk = token_next(state);
 435                 while (tk.num == TK_ident || tk.num == TK_mark) {
 436                         struct symbol *bs = sym_find(g, tk.txt);
 437                         if (bs->type == Unknown)
 438                                 bs->type = Terminal;
 439                         if (bs->type == Virtual) {
 440                                 err = "Virtual symbol not permitted in production";
 441                                 goto abort;
 442                         }
 443                         if (bs->precedence) {
 444                                 p.precedence = bs->precedence;
 445                                 p.assoc = bs->assoc;
 446                         }
 447                         array_add(&p.body, &p.body_size, bs);
 448                         tk = token_next(state);
 449                 }
 450                 if (tk.num == TK_virtual) {
 451                         struct symbol *vs;
 452                         tk = token_next(state);
 453                         if (tk.num != TK_ident) {
 454                                 err = "word required after $$";
 455                                 goto abort;
 456                         }
 457                         vs = sym_find(g, tk.txt);
 458                         if (vs->type != Virtual) {
 459                                 err = "symbol after $$ must be virtual";
 460                                 goto abort;
 461                         }
 462                         p.precedence = vs->precedence;
 463                         p.assoc = vs->assoc;
 464                         tk = token_next(state);
 465                 }
 466                 if (tk.num == TK_open) {
 467                         p.code = collect_code(state, tk);
 468                         if (p.code.txt == NULL) {
 469                                 err = "code fragment not closed properly";
 470                                 goto abort;
 471                         }
 472                         tk = token_next(state);
 473                 }
 474                 if (tk.num != TK_newline && tk.num != TK_eof) {
 475                         err = "stray tokens at end of line";
 476                         goto abort;
 477                 }
 478                 pp = malloc(sizeof(*pp));
 479                 *pp = p;
 480                 array_add(&g->productions, &g->production_count, pp);
 481                 return NULL;
 482         abort:
 483                 while (tk.num != TK_newline && tk.num != TK_eof)
 484                         tk = token_next(state);
 485                 return err;
 486         }
 487
 488 With the ability to parse production and dollar-lines, we have nearly all
 489 that we need to parse a grammar from a `code_node`.
 490
 491 The head of the first production will effectively be the `start` symbol of
 492 the grammar.  However it wont _actually_ be so.  Processing the grammar is
 493 greatly simplified if the real start symbol only has a single production,
 494 and expect `$eof` as the final terminal.  So when we find the first explicit
 495 production we insert an extra production as production zero which looks like
 496
 497 ###### Example: production 0
 498         $start -> START $eof
 499
 500 where `START` is the first non-terminal give.
 501
 502 ###### create production zero
 503         struct production *p = calloc(1,sizeof(*p));
 504         struct text start = {"$start",6};
 505         struct text eof = {"$eof",4};
 506         p->head = sym_find(g, start);
 507         p->head->type = Nonterminal;
 508         array_add(&p->body, &p->body_size, head);
 509         array_add(&p->body, &p->body_size, sym_find(g, eof));
 510         g->start  = p->head->num;
 511         p->head->first_production = g->production_count;
 512         array_add(&g->productions, &g->production_count, p);
 513
 514 Now we are ready to read in the grammar.
 515
 516 ###### grammar fields
 517         int start;      // the 'start' symbol.
 518
 519 ###### grammar_read
 520         static struct grammar *grammar_read(struct code_node *code)
 521         {
 522                 struct token_config conf = {
 523                         .word_start = "",
 524                         .word_cont = "",
 525                         .words_marks = known,
 526                         .known_count = sizeof(known)/sizeof(known[0]),
 527                         .number_chars = "",
 528                         .ignored = (1 << TK_line_comment)
 529                                  | (1 << TK_block_comment)
 530                                  | (0 << TK_number)
 531                                  | (1 << TK_string)
 532                                  | (1 << TK_multi_string)
 533                                  | (1 << TK_indent)
 534                                  | (1 << TK_undent),
 535                 };
 536
 537                 struct token_state *state = token_open(code, &conf);
 538                 struct token tk = token_next(state);
 539                 struct symbol *head = NULL;
 540                 struct grammar *g;
 541                 char *err = NULL;
 542
 543                 g = calloc(1, sizeof(*g));
 544                 symbols_init(g);
 545
 546                 for (tk = token_next(state); tk.num != TK_eof;
 547                      tk = token_next(state)) {
 548                         if (tk.num == TK_newline)
 549                                 continue;
 550                         if (tk.num == TK_ident) {
 551                                 // new non-terminal
 552                                 head = sym_find(g, tk.txt);
 553                                 if (head->type == Nonterminal)
 554                                         err = "This non-terminal has already be used.";
 555                                 else if (head->type == Virtual)
 556                                         err = "Virtual symbol not permitted in head of production";
 557                                 else {
 558                                         head->type = Nonterminal;
 559                                         head->struct_name = g->current_type;
 560                                         if (g->start == 0) {
 561                                                 ## create production zero
 562                                         }
 563                                         head->first_production = g->production_count;
 564                                         tk = token_next(state);
 565                                         if (tk.num == TK_mark &&
 566                                             text_is(tk.txt, "->"))
 567                                                 err = parse_production(g, head, state);
 568                                         else
 569                                                 err = "'->' missing in production";
 570                                 }
 571                         } else if (tk.num == TK_mark
 572                                    && text_is(tk.txt, "|")) {
 573                                 // another production for same non-term
 574                                 if (head)
 575                                         err = parse_production(g, head, state);
 576                                 else
 577                                         err = "First production must have a head";
 578                         } else if (tk.num == TK_mark
 579                                    &&  text_is(tk.txt, "$")) {
 580                                 err = dollar_line(state, g);
 581                         } else {
 582                                 err = "Unrecognised token at start of line.";
 583                         }
 584                         if (err)
 585                                 goto abort;
 586                 }
 587                 token_close(state);
 588                 return g;
 589         abort:
 590                 fprintf(stderr, "Error at line %d: %s\n",
 591                         tk.line, err);
 592                 token_close(state);
 593                 free(g);
 594                 return NULL;
 595         }
 596
 597 ## Analysing the grammar
 598
 599 The central task in analysing the grammar is to determine a set of
 600 states to drive the parsing process.  This will require finding
 601 various sets of symbols and of "items".  Some of these sets will need
 602 to attach information to each element in the set, so they are more
 603 like maps.
 604
 605 Each "item" may have a 'look-ahead' or `LA` set associated with
 606 it.  Many of these will be the same as each other.  To avoid memory
 607 wastage, and to simplify some comparisons of sets, these sets will be
 608 stored in a list of unique sets, each assigned a number.
 609
 610 Once we have the data structures in hand to manage these sets and
 611 lists, we can start setting the 'nullable' flag, build the 'FIRST'
 612 sets, and then create the item sets which define the various states.
 613
 614 ### Symbol sets.
 615
 616 Though we don't only store symbols in these sets, they are the main
 617 things we store, so they are called symbol sets or "symsets".
 618
 619 A symset has a size, an array of shorts, and an optional array of data
 620 values, which are also shorts.  If the array of data is not present,
 621 we store an impossible pointer, as `NULL` is used to indicate that no
 622 memory has been allocated yet;
 623
 624 ###### declarations
 625         struct symset {
 626                 short cnt;
 627                 unsigned short *syms, *data;
 628         };
 629         #define NO_DATA ((unsigned short *)1)
 630         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 631         const struct symset INIT_DATASET = { 0, NULL, NULL };
 632
 633 The arrays of shorts are allocated in blocks of 8 and are kept sorted
 634 by using an insertion sort.  We don't explicitly record the amount of
 635 allocated space as it can be derived directly from the current `cnt` using
 636 `((cnt - 1) | 7) + 1`.
 637
 638 ###### functions
 639         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 640         {
 641                 int i;
 642                 int current = ((s->cnt-1) | 7) + 1;
 643                 if (current == s->cnt) {
 644                         current += 8;
 645                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 646                         if (s->data != NO_DATA)
 647                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 648                 }
 649                 i = s->cnt;
 650                 while (i > 0 && s->syms[i-1] > key) {
 651                         s->syms[i] = s->syms[i-1];
 652                         if (s->data != NO_DATA)
 653                                 s->data[i] = s->data[i-1];
 654                         i--;
 655                 }
 656                 s->syms[i] = key;
 657                 if (s->data != NO_DATA)
 658                         s->data[i] = val;
 659                 s->cnt += 1;
 660         }
 661
 662 Finding a symbol (or item) in a `symset` uses a simple binary search.
 663 We return the index where the value was found (so data can be accessed),
 664 or `-1` to indicate failure.
 665
 666         static int symset_find(struct symset *ss, unsigned short key)
 667         {
 668                 int lo = 0;
 669                 int hi = ss->cnt;
 670
 671                 if (hi == 0)
 672                         return -1;
 673                 while (lo + 1 < hi) {
 674                         int mid = (lo + hi) / 2;
 675                         if (ss->syms[mid] <= key)
 676                                 lo = mid;
 677                         else
 678                                 hi = mid;
 679                 }
 680                 if (ss->syms[lo] == key)
 681                         return lo;
 682                 return -1;
 683         }
 684
 685 We will often want to form the union of two symsets.  When we do, we
 686 will often want to know if anything changed (as they might mean there
 687 is more work to do).  So `symset_union` reports whether anything was
 688 added to the first set.  We use a slow quadratic approach as these
 689 sets don't really get very big.  If profiles shows this to be a problem is
 690 can be optimised later.
 691
 692         static int symset_union(struct symset *a, struct symset *b)
 693         {
 694                 int i;
 695                 int added = 0;
 696                 for (i = 0; i < b->cnt; i++)
 697                         if (symset_find(a, b->syms[i]) < 0) {
 698                                 unsigned short data = 0;
 699                                 if (b->data != NO_DATA)
 700                                         data = b->data[i];
 701                                 symset_add(a, b->syms[i], data);
 702                                 added++;
 703                         }
 704                 return added;
 705         }
 706
 707 And of course we must be able to free a symset.
 708
 709         static void symset_free(struct symset ss)
 710         {
 711                 free(ss.syms);
 712                 if (ss.data != NO_DATA)
 713                         free(ss.data);
 714         }
 715
 716 ### Symset Storage
 717
 718 Some symsets are simply stored somewhere appropriate in the `grammar`
 719 data structure, others needs a bit of indirection.  This applies
 720 particularly to "LA" sets.  These will be paired with "items" in an
 721 "itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
 722 stored in the `data` field, so we need a mapping from a small (short)
 723 number to an LA `symset`.
 724
 725 As mentioned earlier this involves creating a list of unique symsets.
 726
 727 For now, we just use a linear list sorted by insertion.  A skiplist
 728 would be more efficient and may be added later.
 729
 730 ###### declarations
 731
 732         struct setlist {
 733                 struct symset ss;
 734                 int num;
 735                 struct setlist *next;
 736         };
 737
 738 ###### grammar fields
 739         struct setlist *sets;
 740         int nextset;
 741
 742 ###### functions
 743
 744         static int ss_cmp(struct symset a, struct symset b)
 745         {
 746                 int i;
 747                 int diff = a.cnt - b.cnt;
 748                 if (diff)
 749                         return diff;
 750                 for (i = 0; i < a.cnt; i++) {
 751                         diff = (int)a.syms[i] - (int)b.syms[i];
 752                         if (diff)
 753                                 return diff;
 754                 }
 755                 return 0;
 756         }
 757
 758         static int save_set(struct grammar *g, struct symset ss)
 759         {
 760                 struct setlist **sl = &g->sets;
 761                 int cmp = 1;
 762                 struct setlist *s;
 763
 764                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 765                         sl = & (*sl)->next;
 766                 if (cmp == 0) {
 767                         symset_free(ss);
 768                         return (*sl)->num;
 769                 }
 770
 771                 s = malloc(sizeof(*s));
 772                 s->ss = ss;
 773                 s->num = g->nextset;
 774                 g->nextset += 1;
 775                 s->next = *sl;
 776                 *sl = s;
 777                 return s->num;
 778         }
 779
 780 Finding a set by number is currently performed by a simple linear search.
 781 If this turns out to hurt performance, we can store the sets in a dynamic
 782 array like the productions.
 783
 784         static struct symset set_find(struct grammar *g, int num)
 785         {
 786                 struct setlist *sl = g->sets;
 787                 while (sl && sl->num != num)
 788                         sl = sl->next;
 789                 return sl->ss;
 790         }
 791
 792
 793 ### Setting `nullable`
 794
 795 We set `nullable` on the head symbol for any production for which all
 796 the body symbols (if any) are nullable.  As this is a recursive
 797 definition, any change in the `nullable` setting means that we need to
 798 re-evaluate where it needs to be set.
 799
 800 We simply loop around performing the same calculations until no more
 801 changes happen.
 802
 803 ###### symbol fields
 804         int nullable;
 805
 806 ###### functions
 807         static void set_nullable(struct grammar *g)
 808         {
 809                 int check_again = 1;
 810                 while (check_again) {
 811                         int p;
 812                         check_again = 0;
 813                         for (p = 0; p < g->production_count; p++) {
 814                                 struct production *pr = g->productions[p];
 815                                 int s;
 816
 817                                 if (pr->head->nullable)
 818                                         continue;
 819                                 for (s = 0; s < pr->body_size; s++)
 820                                         if (! pr->body[s]->nullable)
 821                                                 break;
 822                                 if (s == pr->body_size) {
 823                                         pr->head->nullable = 1;
 824                                         check_again = 1;
 825                                 }
 826                         }
 827                 }
 828         }
 829
 830 ### Building the `first` sets
 831
 832 When calculating what can follow a particular non-terminal, we will need to
 833 know what the "first" terminal in an subsequent non-terminal might be.  So
 834 we calculate the `first` set for every non-terminal and store them in an
 835 array.  We don't bother recording the "first" set for terminals as they are
 836 trivial.
 837
 838 As the "first" for one symbol might depend on the "first" of another,
 839 we repeat the calculations until no changes happen, just like with
 840 `set_nullable`.  This makes use of the fact that `symset_union`
 841 reports if any change happens.
 842
 843 The core of this which finds the "first" of part of a production body
 844 will be reused for computing the "follow" sets, so we split it out
 845 into a separate function.
 846
 847 ###### grammar fields
 848         struct symset *first;
 849
 850 ###### functions
 851
 852         static int add_first(struct production *pr, int start,
 853                              struct symset *target, struct grammar *g,
 854                              int *to_end)
 855         {
 856                 int s;
 857                 int changed = 0;
 858                 for (s = start; s < pr->body_size; s++) {
 859                         struct symbol *bs = pr->body[s];
 860                         if (bs->type == Terminal) {
 861                                 if (symset_find(target, bs->num) < 0) {
 862                                         symset_add(target, bs->num, 0);
 863                                         changed = 1;
 864                                 }
 865                                 break;
 866                         } else if (symset_union(target, &g->first[bs->num]))
 867                                 changed = 1;
 868                         if (!bs->nullable)
 869                                 break;
 870                 }
 871                 if (to_end)
 872                         *to_end = (s == pr->body_size);
 873                 return changed;
 874         }
 875
 876         static void build_first(struct grammar *g)
 877         {
 878                 int check_again = 1;
 879                 int s;
 880                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 881                 for (s = 0; s < g->num_syms; s++)
 882                         g->first[s] = INIT_SYMSET;
 883
 884                 while (check_again) {
 885                         int p;
 886                         check_again = 0;
 887                         for (p = 0; p < g->production_count; p++) {
 888                                 struct production *pr = g->productions[p];
 889                                 struct symset *head = &g->first[pr->head->num];
 890
 891                                 if (add_first(pr, 0, head, g, NULL))
 892                                         check_again = 1;
 893                         }
 894                 }
 895         }
 896
 897 ### Building the `follow` sets.
 898
 899 There are two different situations when we will want to generate "follow"
 900 sets.  If we are doing an SLR analysis, we want to generate a single
 901 "follow" set for each non-terminal in the grammar.  That is what is
 902 happening here.  If we are doing an LALR or LR analysis we will want
 903 to generate a separate "LA" set for each item.  We do that later
 904 in state generation.
 905
 906 There are two parts to generating a "follow" set.  Firstly we look at
 907 every place that any non-terminal appears in the body of any
 908 production, and we find the set of possible "first" symbols after
 909 there.  This is done using `add_first` above and only needs to be done
 910 once as the "first" sets are now stable and will not change.
 911
 912 ###### follow code
 913
 914         for (p = 0; p < g->production_count; p++) {
 915                 struct production *pr = g->productions[p];
 916                 int b;
 917
 918                 for (b = 0; b < pr->body_size - 1; b++) {
 919                         struct symbol *bs = pr->body[b];
 920                         if (bs->type == Terminal)
 921                                 continue;
 922                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
 923                 }
 924         }
 925
 926 The second part is to add the "follow" set of the head of a production
 927 to the "follow" sets of the final symbol in the production, and any
 928 other symbol which is followed only by `nullable` symbols.  As this
 929 depends on "follow" itself we need to repeatedly perform the process
 930 until no further changes happen.
 931
 932 ###### follow code
 933
 934         for (again = 0, p = 0;
 935              p < g->production_count;
 936              p < g->production_count-1
 937                 ? p++ : again ? (again = 0, p = 0)
 938                               : p++) {
 939                 struct production *pr = g->productions[p];
 940                 int b;
 941
 942                 for (b = pr->body_size - 1; b >= 0; b--) {
 943                         struct symbol *bs = pr->body[b];
 944                         if (bs->type == Terminal)
 945                                 break;
 946                         if (symset_union(&g->follow[bs->num],
 947                                          &g->follow[pr->head->num]))
 948                                 again = 1;
 949                         if (!bs->nullable)
 950                                 break;
 951                 }
 952         }
 953
 954 We now just need to create and initialise the `follow` list to get a
 955 complete function.
 956
 957 ###### grammar fields
 958         struct symset *follow;
 959
 960 ###### functions
 961         static void build_follow(struct grammar *g)
 962         {
 963                 int s, again, p;
 964                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
 965                 for (s = 0; s < g->num_syms; s++)
 966                         g->follow[s] = INIT_SYMSET;
 967                 ## follow code
 968         }
 969
 970 ### Building itemsets and states
 971
 972 There are three different levels of detail that can be chosen for
 973 building the itemsets and states for the LR grammar.  They are:
 974
 975 1. LR(0) or SLR(1), where no look-ahead is considered.
 976 2. LALR(1) where we build look-ahead sets with each item and merge
 977    the LA sets when we find two paths to the same "kernel" set of items.
 978 3. LR(1) where different look-ahead for any item in the code means
 979    a different state must be created.
 980
 981 ###### forward declarations
 982         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
 983
 984 We need to be able to look through existing states to see if a newly
 985 generated state already exists.  For now we use a simple sorted linked
 986 list.
 987
 988 An item is a pair of numbers: the production number and the position of
 989 "DOT", which is an index into the body of the production.
 990 As the numbers are not enormous we can combine them into a single "short"
 991 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
 992 production number (so 4000 productions with maximum of 15 symbols in the
 993 body).
 994
 995 Comparing the itemsets will be a little different to comparing symsets
 996 as we want to do the lookup after generating the "kernel" of an
 997 itemset, so we need to ignore the offset=zero items which are added during
 998 completion.
 999
1000 To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
1001 the list in the symset, and then only compare items before the first "0".
1002
1003 ###### declarations
1004         static inline unsigned short item_num(int production, int index)
1005         {
1006                 return production | ((31-index) << 11);
1007         }
1008         static inline int item_prod(unsigned short item)
1009         {
1010                 return item & 0x7ff;
1011         }
1012         static inline int item_index(unsigned short item)
1013         {
1014                 return (31-(item >> 11)) & 0x1f;
1015         }
1016
1017 For LR(1) analysis we need to compare not just the itemset in a state
1018 but also the LA sets.  As we assign each unique LA set a number, we
1019 can just compare the symset and the data values together.
1020
1021 ###### functions
1022         static int itemset_cmp(struct symset a, struct symset b,
1023                                enum grammar_type type)
1024         {
1025                 int i;
1026                 int av, bv;
1027
1028                 for (i = 0;
1029                      i < a.cnt && i < b.cnt &&
1030                      item_index(a.syms[i]) > 0 &&
1031                      item_index(b.syms[i]) > 0;
1032                      i++) {
1033                         int diff = a.syms[i] - b.syms[i];
1034                         if (diff)
1035                                 return diff;
1036                         if (type == LR1) {
1037                                 diff = a.data[i] - b.data[i];
1038                                 if (diff)
1039                                         return diff;
1040                         }
1041                 }
1042                 if (i == a.cnt || item_index(a.syms[i]) == 0)
1043                         av = -1;
1044                 else
1045                         av = a.syms[i];
1046                 if (i == b.cnt || item_index(b.syms[i]) == 0)
1047                         bv = -1;
1048                 else
1049                         bv = b.syms[i];
1050                 if (av - bv)
1051                         return av - bv;
1052                 if (type < LR1 || av == -1)
1053                         return 0;
1054                 return
1055                         a.data[i] - b.data[i];
1056         }
1057
1058 And now we can build the list of itemsets.  The lookup routine returns
1059 both a success flag and a pointer to where in the list an insert
1060 should happen, so we don't need to search a second time.
1061
1062 ###### declarations
1063         struct itemset {
1064                 struct itemset *next;
1065                 short state;
1066                 struct symset items;
1067                 struct symset go_to;
1068                 char completed;
1069         };
1070
1071 ###### grammar fields
1072         struct itemset *items;
1073         int states;
1074
1075 ###### functions
1076         static int itemset_find(struct grammar *g, struct itemset ***where,
1077                                 struct symset kernel, enum grammar_type type)
1078         {
1079                 struct itemset **ip;
1080
1081                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1082                         struct itemset *i = *ip;
1083                         int diff;
1084                         diff = itemset_cmp(i->items, kernel, type);
1085                         if (diff < 0)
1086                                 continue;
1087                         if (diff > 0)
1088                                 break;
1089                         /* found */
1090                         *where = ip;
1091                         return 1;
1092                 }
1093                 *where = ip;
1094                 return 0;
1095         }
1096
1097 Adding an itemset may require merging the LA sets if LALR analysis is
1098 happening. If any new LA set add symbol that weren't in the old LA set, we
1099 clear the `completed` flag so that the dependants of this itemset will be
1100 recalculated and their LA sets updated.
1101
1102 `add_itemset` must consume the symsets it is passed, either by adding
1103 them to a data structure, of freeing them.
1104
1105         static int add_itemset(struct grammar *g, struct symset ss,
1106                                enum grammar_type type)
1107         {
1108                 struct itemset **where, *is;
1109                 int i;
1110                 int found = itemset_find(g, &where, ss, type);
1111                 if (!found) {
1112                         is = calloc(1, sizeof(*is));
1113                         is->state = g->states;
1114                         g->states += 1;
1115                         is->items = ss;
1116                         is->next = *where;
1117                         is->go_to = INIT_DATASET;
1118                         *where = is;
1119                         return is->state;
1120                 }
1121                 is = *where;
1122                 if (type != LALR) {
1123                         symset_free(ss);
1124                         return is->state;
1125                 }
1126                 for (i = 0; i < ss.cnt; i++) {
1127                         struct symset temp = INIT_SYMSET, s;
1128                         if (ss.data[i] == is->items.data[i])
1129                                 continue;
1130                         s = set_find(g, is->items.data[i]);
1131                         symset_union(&temp, &s);
1132                         s = set_find(g, ss.data[i]);
1133                         if (symset_union(&temp, &s)) {
1134                                 is->items.data[i] = save_set(g, temp);
1135                                 is->completed = 0;
1136                         } else
1137                                 symset_free(temp);
1138                 }
1139                 symset_free(ss);
1140                 return is->state;
1141         }
1142
1143 #### The build
1144
1145 To build all the itemsets, we first insert the initial itemset made from the
1146 start symbol, complete each itemset, and then generate new itemsets from old
1147 until no new ones can be made.
1148
1149 Completing an itemset means finding all the items where "DOT" is followed by
1150 a nonterminal and adding "DOT=0" items for every production from that
1151 non-terminal - providing each item hasn't already been added.
1152
1153 If LA sets are needed, the LA set for each new item is found using
1154 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
1155 be supplemented by the LA set for the item which produce the new item.
1156
1157 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1158 is used in the next stage.
1159
1160 NOTE: precedence handling should happen here - I haven't written this yet
1161 though.
1162
1163 ###### complete itemset
1164         for (i = 0; i < is->items.cnt; i++) {
1165                 int p = item_prod(is->items.syms[i]);
1166                 int bs = item_index(is->items.syms[i]);
1167                 struct production *pr = g->productions[p];
1168                 int p2;
1169                 struct symbol *s;
1170                 struct symset LA = INIT_SYMSET;
1171                 unsigned short sn = 0;
1172
1173                 if (bs == pr->body_size)
1174                         continue;
1175                 s = pr->body[bs];
1176                 if (symset_find(&done, s->num) < 0)
1177                         symset_add(&done, s->num, 0);
1178                 if (s->type != Nonterminal)
1179                         continue;
1180                 again = 1;
1181                 if (type >= LALR) {
1182                         // Need the LA set.
1183                         int to_end;
1184                         add_first(pr, bs+1, &LA, g, &to_end);
1185                         if (to_end) {
1186                                 struct symset ss = set_find(g, is->items.data[i]);
1187                                 symset_union(&LA, &ss);
1188                         }
1189                         sn = save_set(g, LA);
1190                         LA = set_find(g, sn);
1191                 }
1192                 /* Add productions for this symbol */
1193                 for (p2 = s->first_production;
1194                      p2 < g->production_count &&
1195                       g->productions[p2]->head == s;
1196                      p2++) {
1197                         int itm = item_num(p2, 0);
1198                         int pos = symset_find(&is->items, itm);
1199                         if (pos < 0) {
1200                                 symset_add(&is->items, itm, sn);
1201                                 /* Will have re-ordered, so start
1202                                  * from beginning again */
1203                                 i = -1;
1204                         } else if (type >= LALR) {
1205                                 struct symset ss = set_find(g, is->items.data[pos]);
1206                                 struct symset tmp = INIT_SYMSET;
1207
1208                                 symset_union(&tmp, &ss);
1209                                 if (symset_union(&tmp, &LA)) {
1210                                         is->items.data[pos] = save_set(g, tmp);
1211                                         i = -1;
1212                                 }else
1213                                         symset_free(tmp);
1214                         }
1215                 }
1216         }
1217
1218 For each symbol we found (and placed in `done`) we collect all the items for
1219 which this symbol is next, and create a new itemset with "DOT" advanced over
1220 the symbol.  This is then added to the collection of itemsets (or merged
1221 with a pre-existing itemset).
1222
1223 ###### derive itemsets
1224         // Now we have a completed itemset, so we need to
1225         // create all the 'next' itemset and create them
1226         // if they don't exist.
1227         for (i = 0; i < done.cnt; i++) {
1228                 int j;
1229                 unsigned short state;
1230                 struct symset newitemset = INIT_SYMSET;
1231                 if (type >= LALR)
1232                         newitemset = INIT_DATASET;
1233
1234                 for (j = 0; j < is->items.cnt; j++) {
1235                         int itm = is->items.syms[j];
1236                         int p = item_prod(itm);
1237                         int bp = item_index(itm);
1238                         struct production *pr = g->productions[p];
1239                         unsigned short la = 0;
1240                         int pos;
1241
1242                         if (bp == pr->body_size)
1243                                 continue;
1244                         if (pr->body[bp]->num != done.syms[i])
1245                                 continue;
1246                         if (type >= LALR)
1247                                 la = is->items.data[j];
1248                         pos = symset_find(&newitemset, pr->head->num);
1249                         if (pos < 0)
1250                                 symset_add(&newitemset, item_num(p, bp+1), la);
1251                         else if (type >= LALR) {
1252                                 // Need to merge la set.
1253                                 int la2 = newitemset.data[pos];
1254                                 if (la != la2) {
1255                                         struct symset ss = set_find(g, la2);
1256                                         struct symset LA = INIT_SYMSET;
1257                                         symset_union(&LA, &ss);
1258                                         ss = set_find(g, la);
1259                                         if (symset_union(&LA, &ss))
1260                                                 newitemset.data[pos] = save_set(g, LA);
1261                                         else
1262                                                 symset_free(LA);
1263                                 }
1264                         }
1265                 }
1266                 state = add_itemset(g, newitemset, type);
1267                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1268                         symset_add(&is->go_to, done.syms[i], state);
1269         }
1270
1271 All that is left is to crate the initial itemset from production zero, and
1272 with `TK_eof` as the LA set.
1273
1274 ###### functions
1275         static void build_itemsets(struct grammar *g, enum grammar_type type)
1276         {
1277                 struct symset first = INIT_SYMSET;
1278                 struct itemset *is;
1279                 int again;
1280                 unsigned short la = 0;
1281                 if (type >= LALR) {
1282                         // LA set just has eof
1283                         struct symset eof = INIT_SYMSET;
1284                         symset_add(&eof, TK_eof, 0);
1285                         la = save_set(g, eof);
1286                         first = INIT_DATASET;
1287                 }
1288                 // production 0, offset 0  (with no data)
1289                 symset_add(&first, item_num(0, 0), la);
1290                 add_itemset(g, first, type);
1291                 for (again = 0, is = g->items;
1292                      is;
1293                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
1294                         int i;
1295                         struct symset done = INIT_SYMSET;
1296                         if (is->completed)
1297                                 continue;
1298                         is->completed = 1;
1299                         ## complete itemset
1300                         ## derive itemsets
1301                         symset_free(done);
1302                 }
1303         }
1304
1305 ### Completing the analysis.
1306
1307 The exact process of analysis depends on which level was requested.  For
1308 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1309 `LR1` we need first, but not follow.  For `SLR` we need both.
1310
1311 We don't build the "action" tables that you might expect as the parser
1312 doesn't use them.  They will be calculated to some extent if needed for
1313 a report.
1314
1315 Once we have built everything we allocate arrays for the two lists:
1316 symbols and itemsets.  This allows more efficient access during reporting.
1317 The symbols are grouped as terminals and non-terminals and we record the
1318 changeover point in `first_nonterm`.
1319
1320 ###### grammar fields
1321         struct symbol **symtab;
1322         struct itemset **statetab;
1323         int first_nonterm;
1324
1325 ###### grammar_analyse
1326
1327         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1328         {
1329                 struct symbol *s;
1330                 struct itemset *is;
1331                 int snum = TK_reserved;
1332                 for (s = g->syms; s; s = s->next)
1333                         if (s->num < 0 && s->type == Terminal) {
1334                                 s->num = snum;
1335                                 snum++;
1336                         }
1337                 g->first_nonterm = snum;
1338                 for (s = g->syms; s; s = s->next)
1339                         if (s->num < 0) {
1340                                 s->num = snum;
1341                                 snum++;
1342                         }
1343                 g->num_syms = snum;
1344                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1345                 for (s = g->syms; s; s = s->next)
1346                         g->symtab[s->num] = s;
1347
1348                 if (type >= SLR) {
1349                         set_nullable(g);
1350                         build_first(g);
1351                 }
1352                 if (type == SLR)
1353                         build_follow(g);
1354
1355                 build_itemsets(g, type);
1356
1357                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1358                 for (is = g->items; is ; is = is->next)
1359                         g->statetab[is->state] = is;
1360         }
1361
1362 ## Reporting on the Grammar
1363
1364 The purpose of the report is to give the grammar developer insight into
1365 how the grammar parser will work.  It is basically a structured dump of
1366 all the tables that have been generated, plus an description of any conflicts.
1367
1368 ###### grammar_report
1369         static int grammar_report(struct grammar *g, enum grammar_type type)
1370         {
1371                 report_symbols(g);
1372                 if (g->follow)
1373                         report_follow(g);
1374                 report_itemsets(g);
1375                 return report_conflicts(g, type);
1376         }
1377
1378 Firstly we have the complete list of symbols, together with the "FIRST"
1379 set if that was generated.
1380
1381 ###### functions
1382
1383         static void report_symbols(struct grammar *g)
1384         {
1385                 int n;
1386                 if (g->first)
1387                         printf("SYMBOLS + FIRST:\n");
1388                 else
1389                         printf("SYMBOLS:\n");
1390
1391                 for (n = 0; n < g->num_syms; n++) {
1392                         struct symbol *s = g->symtab[n];
1393                         if (!s)
1394                                 continue;
1395
1396                         printf(" %c%3d%c: ",
1397                                s->nullable ? '*':' ',
1398                                s->num, symtypes[s->type]);
1399                         prtxt(s->name);
1400                         if (s->precedence)
1401                                 printf(" (%d%s)", s->precedence,
1402                                        assoc_names[s->assoc]);
1403
1404                         if (g->first && s->type == Nonterminal) {
1405                                 int j;
1406                                 char c = ':';
1407                                 for (j = 0; j < g->first[n].cnt; j++) {
1408                                         printf("%c ", c);
1409                                         c = ',';
1410                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1411                                 }
1412                         }
1413                         printf("\n");
1414                 }
1415         }
1416
1417 Then we have to follow sets if they were computed.
1418
1419         static void report_follow(struct grammar *g)
1420         {
1421                 int n;
1422                 printf("FOLLOW:\n");
1423                 for (n = 0; n < g->num_syms; n++)
1424                         if (g->follow[n].cnt) {
1425                                 int j;
1426                                 char c = ':';
1427                                 printf("  ");
1428                                 prtxt(g->symtab[n]->name);
1429                                 for (j = 0; j < g->follow[n].cnt; j++) {
1430                                         printf("%c ", c);
1431                                         c = ',';
1432                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1433                                 }
1434                                 printf("\n");
1435                         }
1436         }
1437
1438 And finally the item sets.  These include the GO TO tables and, for
1439 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1440 it up a bit.  First the items, with production number and associativity.
1441
1442         static void report_item(struct grammar *g, int itm)
1443         {
1444                 int p = item_prod(itm);
1445                 int dot = item_index(itm);
1446                 struct production *pr = g->productions[p];
1447                 int i;
1448
1449                 printf("    ");
1450                 prtxt(pr->head->name);
1451                 printf(" ->");
1452                 for (i = 0; i < pr->body_size; i++) {
1453                         printf(" %s", (dot == i ? ". ": ""));
1454                         prtxt(pr->body[i]->name);
1455                 }
1456                 if (dot == pr->body_size)
1457                         printf(" .");
1458                 printf(" [%d]", p);
1459                 if (pr->precedence)
1460                         printf(" (%d%s)", pr->precedence,
1461                                assoc_names[pr->assoc]);
1462                 printf("\n");
1463         }
1464
1465 The LA sets which are (possibly) reported with each item:
1466
1467         static void report_la(struct grammar *g, int lanum)
1468         {
1469                 struct symset la = set_find(g, lanum);
1470                 int i;
1471                 char c = ':';
1472
1473                 printf("        LOOK AHEAD(%d)", lanum);
1474                 for (i = 0; i < la.cnt; i++) {
1475                         printf("%c ", c);
1476                         c = ',';
1477                         prtxt(g->symtab[la.syms[i]]->name);
1478                 }
1479                 printf("\n");
1480         }
1481
1482 Then the go to sets:
1483
1484
1485         static void report_goto(struct grammar *g, struct symset gt)
1486         {
1487                 int i;
1488                 printf("    GOTO:\n");
1489
1490                 for (i = 0; i < gt.cnt; i++) {
1491                         printf("      ");
1492                         prtxt(g->symtab[gt.syms[i]]->name);
1493                         printf(" -> %d\n", gt.data[i]);
1494                 }
1495         }
1496
1497 Now we can report all the item sets complete with items, LA sets, and GO TO.
1498
1499         static void report_itemsets(struct grammar *g)
1500         {
1501                 int s;
1502                 printf("ITEM SETS(%d)\n", g->states);
1503                 for (s = 0; s < g->states; s++) {
1504                         int j;
1505                         struct itemset *is = g->statetab[s];
1506                         printf("  Itemset %d:\n", s);
1507                         for (j = 0; j < is->items.cnt; j++) {
1508                                 report_item(g, is->items.syms[j]);
1509                                 if (is->items.data != NO_DATA)
1510                                         report_la(g, is->items.data[j]);
1511                         }
1512                         report_goto(g, is->go_to);
1513                 }
1514         }
1515
1516 ### Reporting conflicts
1517
1518 Conflict detection varies a lot among different analysis levels.  However
1519 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1520 LR1 are also similar as they have FOLLOW or LA sets.
1521
1522 ###### functions
1523
1524         ## conflict functions
1525
1526         static int report_conflicts(struct grammar *g, enum grammar_type type)
1527         {
1528                 int cnt = 0;
1529                 printf("Conflicts:\n");
1530                 if (type < SLR)
1531                         cnt = conflicts_lr0(g, type);
1532                 else
1533                         cnt = conflicts_slr(g, type);
1534                 if (cnt == 0)
1535                         printf(" - no conflicts\n");
1536                 return cnt;
1537         }
1538
1539 LR0 conflicts are any state which have both a reducible item and
1540 a shiftable item.
1541
1542 LR05 conflicts only occurs if two possibly reductions exist,
1543 as shifts always over-ride reductions.
1544
1545 ###### conflict functions
1546         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1547         {
1548                 int i;
1549                 int cnt = 0;
1550                 for (i = 0; i < g->states; i++) {
1551                         struct itemset *is = g->statetab[i];
1552                         int last_reduce = -1;
1553                         int prev_reduce = -1;
1554                         int last_shift = -1;
1555                         int j;
1556                         if (!is)
1557                                 continue;
1558                         for (j = 0; j < is->items.cnt; j++) {
1559                                 int itm = is->items.syms[j];
1560                                 int p = item_prod(itm);
1561                                 int bp = item_index(itm);
1562                                 struct production *pr = g->productions[p];
1563
1564                                 if (bp == pr->body_size) {
1565                                         prev_reduce = last_reduce;
1566                                         last_reduce = j;
1567                                         continue;
1568                                 }
1569                                 if (pr->body[bp]->type == Terminal)
1570                                         last_shift = j;
1571                         }
1572                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1573                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1574                                 report_item(g, is->items.syms[last_shift]);
1575                                 report_item(g, is->items.syms[last_reduce]);
1576                                 cnt++;
1577                         }
1578                         if (prev_reduce >= 0) {
1579                                 printf("  State %d has 2 (or more) reducible items\n", i);
1580                                 report_item(g, is->items.syms[prev_reduce]);
1581                                 report_item(g, is->items.syms[last_reduce]);
1582                                 cnt++;
1583                         }
1584                 }
1585                 return cnt;
1586         }
1587
1588 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1589 look ahead, or if a symbol in a look-ahead can be shifted.  The differ only
1590 in the source of the look ahead set.
1591
1592 We build a dataset mapping terminal to item for possible SHIFTs and then
1593 another for possible REDUCE operations. We report when we get conflicts
1594 between the two.
1595
1596         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1597         {
1598                 int i;
1599                 int cnt = 0;
1600
1601                 for (i = 0; i < g->states; i++) {
1602                         struct itemset *is = g->statetab[i];
1603                         struct symset shifts = INIT_DATASET;
1604                         struct symset reduce = INIT_DATASET;
1605                         int j;
1606                         if (!is)
1607                                 continue;
1608                         /* First collect the shifts */
1609                         for (j = 0; j < is->items.cnt; j++) {
1610                                 unsigned short itm = is->items.syms[j];
1611                                 int p = item_prod(itm);
1612                                 int bp = item_index(itm);
1613                                 struct production *pr = g->productions[p];
1614
1615                                 if (bp < pr->body_size &&
1616                                     pr->body[bp]->type == Terminal) {
1617                                         /* shiftable */
1618                                         int sym = pr->body[bp]->num;
1619                                         if (symset_find(&shifts, sym) < 0)
1620                                                 symset_add(&shifts, sym, itm);
1621                                 }
1622                         }
1623                         /* Now look for reduction and conflicts */
1624                         for (j = 0; j < is->items.cnt; j++) {
1625                                 unsigned short itm = is->items.syms[j];
1626                                 int p = item_prod(itm);
1627                                 int bp = item_index(itm);
1628                                 struct production *pr = g->productions[p];
1629
1630                                 if (bp < pr->body_size)
1631                                         continue;
1632                                 /* reducible */
1633                                 struct symset la;
1634                                 if (type == SLR)
1635                                         la = g->follow[pr->head->num];
1636                                 else
1637                                         la = set_find(g, is->items.data[j]);
1638                                 int k;
1639                                 for (k = 0; k < la.cnt; k++) {
1640                                         int pos = symset_find(&shifts, la.syms[k]);
1641                                         if (pos >= 0) {
1642                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1643                                                 prtxt(g->symtab[la.syms[k]]->name);
1644                                                 printf(":\n");
1645                                                 report_item(g, shifts.data[pos]);
1646                                                 report_item(g, itm);
1647                                                 cnt++;
1648                                         }
1649                                         pos = symset_find(&reduce, la.syms[k]);
1650                                         if (pos < 0) {
1651                                                 symset_add(&reduce, la.syms[k], itm);
1652                                                 continue;
1653                                         }
1654                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1655                                         prtxt(g->symtab[la.syms[k]]->name);
1656                                         printf(":\n");
1657                                         report_item(g, itm);
1658                                         report_item(g, reduce.data[pos]);
1659                                         cnt++;
1660                                 }
1661                         }
1662                         symset_free(shifts);
1663                         symset_free(reduce);
1664                 }
1665                 return cnt;
1666         }
1667
1668
1669 ## Generating the parser
1670
1671 The export part of the parser is the `parse_XX` function, where the name
1672 `XX` is based on the name of the parser files.
1673
1674 This takes a `code_node`, a partially initialized `token_config`, and an
1675 optional `FILE` to send tracing to.  The `token_config` gets the list of
1676 known words added and then is used with the `code_node` to initialize the
1677 scanner.
1678
1679 `parse_XX` then call the library function `parser_run` to actually complete
1680 the parse,  This needs the `states` table and function to call the various
1681 pieces of code provided in the grammar file, so they are generated first.
1682
1683 ###### parser_generate
1684
1685         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1686         {
1687                 gen_known(f, g);
1688                 gen_goto(f, g);
1689                 gen_states(f, g);
1690                 gen_reduce(f, g, file);
1691                 gen_free(f, g);
1692
1693                 fprintf(f, "#line 0 \"gen_parser\"\n");
1694                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1695                         name);
1696                 fprintf(f, "{\n");
1697                 fprintf(f, "\tstruct token_state *tokens;\n");
1698                 fprintf(f, "\tconfig->words_marks = known;\n");
1699                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1700                 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1701                 fprintf(f, "\ttokens = token_open(code, config);\n");
1702                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
1703                 fprintf(f, "\ttoken_close(tokens);\n");
1704                 fprintf(f, "\treturn rv;\n");
1705                 fprintf(f, "}\n\n");
1706         }
1707
1708 ### Table words table
1709
1710 The know words is simply an array of terminal symbols.
1711
1712 ###### functions
1713
1714         static void gen_known(FILE *f, struct grammar *g)
1715         {
1716                 int i;
1717                 fprintf(f, "#line 0 \"gen_known\"\n");
1718                 fprintf(f, "static const char *known[] = {\n");
1719                 for (i = TK_reserved;
1720                      i < g->num_syms && g->symtab[i]->type == Terminal;
1721                      i++)
1722                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1723                                 g->symtab[i]->name.txt);
1724                 fprintf(f, "};\n\n");
1725         }
1726
1727 ### States and the goto tables.
1728
1729 For each state we record the goto table and the reducible production if
1730 there is one.
1731 Some of the details of the reducible production are stored in the
1732 `do_reduce` function to come later.  Here we store the production number,
1733 the body size (useful for stack management) and the resulting symbol (useful
1734 for knowing how to free data later).
1735
1736 The go to table is stored in a simple array of `sym` and corresponding
1737 `state`.
1738
1739 ###### exported types
1740
1741         struct lookup {
1742                 short sym;
1743                 short state;
1744         };
1745         struct state {
1746                 short go_to_cnt;
1747                 const struct lookup * go_to;
1748                 short reduce_prod;
1749                 short reduce_size;
1750                 short reduce_sym;
1751         };
1752
1753
1754 ###### functions
1755
1756         static void gen_goto(FILE *f, struct grammar *g)
1757         {
1758                 int i;
1759                 fprintf(f, "#line 0 \"gen_goto\"\n");
1760                 for (i = 0; i < g->states; i++) {
1761                         int j;
1762                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1763                                 i);
1764                         struct symset gt = g->statetab[i]->go_to;
1765                         for (j = 0; j < gt.cnt; j++)
1766                                 fprintf(f, "\t{ %d, %d },\n",
1767                                         gt.syms[j], gt.data[j]);
1768                         fprintf(f, "};\n");
1769                 }
1770         }
1771
1772 ###### functions
1773
1774         static void gen_states(FILE *f, struct grammar *g)
1775         {
1776                 int i;
1777                 fprintf(f, "#line 0 \"gen_states\"\n");
1778                 fprintf(f, "static const struct state states[] = {\n");
1779                 for (i = 0; i < g->states; i++) {
1780                         struct itemset *is = g->statetab[i];
1781                         int j, prod = -1;
1782                         for (j = 0; j < is->items.cnt; j++) {
1783                                 int itm = is->items.syms[j];
1784                                 int p = item_prod(itm);
1785                                 int bp = item_index(itm);
1786                                 struct production *pr = g->productions[p];
1787
1788                                 if (bp < pr->body_size)
1789                                         continue;
1790                                 /* This is what we reduce */
1791                                 prod = p;
1792                                 break;
1793                         }
1794
1795                         fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
1796                                 i, is->go_to.cnt, i, prod,
1797                                 prod < 0 ? -1 : g->productions[prod]->body_size,
1798                                 prod < 0 ? -1 : g->productions[prod]->head->num);
1799                 }
1800                 fprintf(f, "};\n\n");
1801         }
1802
1803 ### The `do_reduce` function and the code
1804
1805 When the parser engine decides to reduce a production, it calls `do_reduce`.
1806 This has two functions.
1807
1808 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1809 production being reduced.  Secondly it runs the code that was included with
1810 the production if any.
1811
1812 This code needs to be able to store data somewhere.  Rather than requiring
1813 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1814 `do_reduce` return the size to be saved.
1815
1816 The code fragment requires translation when written out.  Any `$N` needs to
1817 be converted to a reference either to that buffer (if `$0`) or to the
1818 structure returned by a previous reduction.  These pointer need to be cast
1819 to the appropriate type for each access.  All this is handling in
1820 `gen_code`.
1821
1822
1823 ###### functions
1824
1825         static void gen_code(struct production *p, FILE *f, struct grammar *g)
1826         {
1827                 char *c;
1828                 fprintf(f, "\t\t\t");
1829                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1830                         int n;
1831                         if (*c != '$') {
1832                                 fputc(*c, f);
1833                                 if (*c == '\n')
1834                                         fputs("\t\t\t", f);
1835                                 continue;
1836                         }
1837                         c++;
1838                         if (*c < '0' || *c > '9') {
1839                                 fputc(*c, f);
1840                                 continue;
1841                         }
1842                         n = *c - '0';
1843                         while (c[1] >= '0' && c[1] <= '9') {
1844                                 c += 1;
1845                                 n = n * 10 + *c - '0';
1846                         }
1847                         if (n == 0)
1848                                 fprintf(f, "(*(struct %.*s*)ret)",
1849                                         p->head->struct_name.len,
1850                                         p->head->struct_name.txt);
1851                         else if (n > p->body_size)
1852                                 fprintf(f, "$%d", n);
1853                         else if (p->body[n-1]->type == Terminal)
1854                                 fprintf(f, "(*(struct token *)body[%d])",
1855                                         n-1);
1856                         else if (p->body[n-1]->struct_name.txt == NULL)
1857                                 fprintf(f, "$%d", n);
1858                         else
1859                                 fprintf(f, "(*(struct %.*s*)body[%d])",
1860                                         p->body[n-1]->struct_name.len,
1861                                         p->body[n-1]->struct_name.txt, n-1);
1862                 }
1863                 fputs("\n", f);
1864         }
1865
1866 ###### functions
1867
1868         static void gen_reduce(FILE *f, struct grammar *g, char *file)
1869         {
1870                 int i, j;
1871                 fprintf(f, "#line 0 \"gen_reduce\"\n");
1872                 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
1873                 fprintf(f, "                      void *ret, FILE *trace)\n");
1874                 fprintf(f, "{\n");
1875                 fprintf(f, "\tint ret_size = 0;\n");
1876
1877                 fprintf(f, "\tswitch(prod) {\n");
1878                 for (i = 0; i < g->production_count; i++) {
1879                         struct production *p = g->productions[i];
1880                         fprintf(f, "\tcase %d:\n", i);
1881
1882                         if (p->code.txt)
1883                                 gen_code(p, f, g);
1884
1885                         fprintf(f, "\t\tif (trace) {\n");
1886                         fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
1887                                 p->head->name.len, p->head->name.txt);
1888                         for (j = 0; j < p->body_size; j++) {
1889                                 if (p->body[j]->type == Terminal) {
1890                                         fputs("\t\t\tfputs(\" \", trace);\n", f);
1891                                         fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
1892                                 } else {
1893                                         fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
1894                                                 p->body[j]->name.len,
1895                                                 p->body[j]->name.txt);
1896                                 }
1897                         }
1898                         fprintf(f, "\t\t}\n");
1899
1900                         if (p->head->struct_name.txt)
1901                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1902                                         p->head->struct_name.len,
1903                                         p->head->struct_name.txt);
1904
1905                         fprintf(f, "\t\tbreak;\n");
1906                 }
1907                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1908         }
1909
1910 ### `do_free`
1911
1912 As each non-terminal can potentially cause a different type of data
1913 structure to be allocated and filled in, we need to be able to free it when
1914 done.
1915
1916 It is particularly important to have fine control over freeing during error
1917 recovery where individual stack frames might need to be freed.
1918
1919 For this, the grammar author required to defined a `free_XX` function for
1920 each structure that is used by a non-terminal.  `do_free` all call whichever
1921 is appropriate given a symbol number, and will call `free` (as is
1922 appropriate for tokens` on any terminal symbol.
1923
1924 ###### functions
1925
1926         static void gen_free(FILE *f, struct grammar *g)
1927         {
1928                 int i;
1929
1930                 fprintf(f, "#line 0 \"gen_free\"\n");
1931                 fprintf(f, "static void do_free(short sym, void *asn)\n");
1932                 fprintf(f, "{\n");
1933                 fprintf(f, "\tif (!asn) return;\n");
1934                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
1935                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
1936                 fprintf(f, "\tswitch(sym) {\n");
1937
1938                 for (i = 0; i < g->num_syms; i++) {
1939                         struct symbol *s = g->symtab[i];
1940                         if (!s ||
1941                             s->type != Nonterminal ||
1942                             s->struct_name.txt == NULL)
1943                                 continue;
1944
1945                         fprintf(f, "\tcase %d:\n", s->num);
1946                         fprintf(f, "\t\tfree_%.*s(asn);\n",
1947                                 s->struct_name.len,
1948                                 s->struct_name.txt);
1949                         fprintf(f, "\t\tbreak;\n");
1950                 }
1951                 fprintf(f, "\t}\n}\n\n");
1952         }
1953
1954 ## The main routine.
1955
1956 There are three key parts to the "main" function of parsergen: processing
1957 the arguments, loading the grammar file, and dealing with the grammar.
1958
1959 ### Argument processing.
1960
1961 Command line options allow the selection of analysis mode, name of output
1962 file, and whether or not a report should be generated.  By default we create
1963 a report only if no code output was requested.
1964
1965 The `parse_XX` function name uses the basename of the output file which
1966 should not have a suffix (`.c`).  `.c` is added to the given name for the
1967 code, and `.h` is added for the header (if header text is specifed in the
1968 grammar file).
1969
1970 ###### includes
1971         #include <getopt.h>
1972
1973 ###### declarations
1974         static const struct option long_options[] = {
1975                 { "LR0",        0, NULL, '0' },
1976                 { "LR05",       0, NULL, '5' },
1977                 { "SLR",        0, NULL, 'S' },
1978                 { "LALR",       0, NULL, 'L' },
1979                 { "LR1",        0, NULL, '1' },
1980                 { "report",     0, NULL, 'R' },
1981                 { "output",     1, NULL, 'o' },
1982                 { NULL,         0, NULL, 0   }
1983         };
1984         const char *options = "05SL1Ro:";
1985
1986 ###### process arguments
1987         int opt;
1988         char *outfile = NULL;
1989         char *infile;
1990         char *name;
1991         int report = 1;
1992         enum grammar_type type = LR05;
1993         while ((opt = getopt_long(argc, argv, options,
1994                                   long_options, NULL)) != -1) {
1995                 switch(opt) {
1996                 case '0':
1997                         type = LR0; break;
1998                 case '5':
1999                         type = LR05; break;
2000                 case 'S':
2001                         type = SLR; break;
2002                 case 'L':
2003                         type = LALR; break;
2004                 case '1':
2005                         type = LR1; break;
2006                 case 'R':
2007                         report = 2; break;
2008                 case 'o':
2009                         outfile = optarg; break;
2010                 default:
2011                         fprintf(stderr, "Usage: parsergen ...\n");
2012                         exit(1);
2013                 }
2014         }
2015         if (optind < argc)
2016                 infile = argv[optind++];
2017         else {
2018                 fprintf(stderr, "No input file given\n");
2019                 exit(1);
2020         }
2021         if (outfile && report == 1)
2022                 report = 0;
2023         name = outfile;
2024         if (name && strchr(name, '/'))
2025                 name = strrchr(name, '/')+1;
2026
2027         if (optind < argc) {
2028                 fprintf(stderr, "Excess command line arguments\n");
2029                 exit(1);
2030         }
2031
2032 ### Loading the grammar file
2033
2034 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2035 map it.
2036
2037 One we have extracted the code (with `mdcode`) we expect to file three
2038 sections: header, code, and grammar.  Anything else is an error.
2039
2040 "header" and "code" are optional, though it is hard to build a working
2041 parser with neither. "grammar" must be provided.
2042
2043 ###### includes
2044         #include <fcntl.h>
2045         #include <sys/mman.h>
2046         #include <errno.h>
2047
2048 ###### functions
2049         static int errs;
2050         static void pr_err(char *msg)
2051         {
2052                 errs++;
2053                 fprintf(stderr, "%s\n", msg);
2054         }
2055
2056 ###### load file
2057         struct section *table;
2058         int fd;
2059         int len;
2060         char *file;
2061         fd = open(infile, O_RDONLY);
2062         if (fd < 0) {
2063                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2064                         infile, strerror(errno));
2065                 exit(1);
2066         }
2067         len = lseek(fd, 0, 2);
2068         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2069         table = code_extract(file, file+len, pr_err);
2070
2071         struct code_node *hdr = NULL;
2072         struct code_node *code = NULL;
2073         struct code_node *gram = NULL;
2074         for (s = table; s; s = s->next) {
2075                 if (text_is(s->section, "header"))
2076                         hdr = s->code;
2077                 else if (text_is(s->section, "code"))
2078                         code = s->code;
2079                 else if (text_is(s->section, "grammar"))
2080                         gram = s->code;
2081                 else {
2082                         fprintf(stderr, "Unknown content section: %.*s\n",
2083                                 s->section.len, s->section.txt);
2084                         rv |= 2;
2085                 }
2086         }
2087
2088 ### Processing the input
2089
2090 As we need to append an extention to a filename and then open it for
2091 writing, and we need to do this twice, it helps to have a separate function.
2092
2093 ###### functions
2094
2095         static FILE *open_ext(char *base, char *ext)
2096         {
2097                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2098                 FILE *f;
2099                 strcat(strcpy(fn, base), ext);
2100                 f = fopen(fn, "w");
2101                 free(fn);
2102                 return f;
2103         }
2104
2105 If we can read the grammar, then we analyse and optionally report on it.  If
2106 the report finds conflicts we will exit with an error status.
2107
2108 ###### process input
2109         struct grammar *g = NULL;
2110         if (gram == NULL) {
2111                 fprintf(stderr, "No grammar section provided\n");
2112                 rv |= 2;
2113         } else {
2114                 g = grammar_read(gram);
2115                 if (!g) {
2116                         fprintf(stderr, "Failure to parse grammar\n");
2117                         rv |= 2;
2118                 }
2119         }
2120         if (g) {
2121                 grammar_analyse(g, type);
2122                 if (report)
2123                         if (grammar_report(g, type))
2124                                 rv |= 1;
2125         }
2126
2127 If a headers section is defined, we write it out and include a declaration
2128 for the `parse_XX` function so it can be used from separate code.
2129
2130         if (rv == 0 && hdr && outfile) {
2131                 FILE *f = open_ext(outfile, ".h");
2132                 if (f) {
2133                         code_node_print(f, hdr, infile);
2134                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2135                                 name);
2136                         fclose(f);
2137                 } else {
2138                         fprintf(stderr, "Cannot create %s.h\n",
2139                                 outfile);
2140                         rv |= 4;
2141                 }
2142         }
2143
2144 And if all goes well and an output file was provided, we create the `.c`
2145 file with the code section (if any) and the parser tables and function.
2146
2147         if (rv == 0 && outfile) {
2148                 FILE *f = open_ext(outfile, ".c");
2149                 if (f) {
2150                         if (code)
2151                                 code_node_print(f, code, infile);
2152                         gen_parser(f, g, infile, name);
2153                         fclose(f);
2154                 } else {
2155                         fprintf(stderr, "Cannot create %s.c\n",
2156                                 outfile);
2157                         rv |= 4;
2158                 }
2159         }
2160
2161 And that about wraps it up.  We need to set the locale so that UTF-8 is
2162 recognised properly, and link with `libicuuc` is `libmdcode` requires that.
2163
2164 ###### File: parsergen.mk
2165         parsergen : parsergen.o libscanner.o libmdcode.o
2166                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2167
2168 ###### includes
2169         #include <locale.h>
2170
2171 ###### main
2172
2173         int main(int argc, char *argv[])
2174         {
2175                 struct section *s;
2176                 int rv = 0;
2177
2178                 setlocale(LC_ALL,"");
2179
2180                 ## process arguments
2181                 ## load file
2182                 ## process input
2183
2184                 return rv;
2185         }
2186
2187 ## The SHIFT/REDUCE parser
2188
2189 Having analysed the grammar and generated all the table, we only need the
2190 shift/reduce engine to bring it all together.
2191
2192 ### Goto table lookup
2193
2194 The parser generator has nicely provided us with goto tables sorted by
2195 symbol number.  We need a binary search function to find a symbol in the
2196 table.
2197
2198 ###### parser
2199
2200         static int search(const struct state *l, int sym)
2201         {
2202                 int lo = 0;
2203                 int hi = l->go_to_cnt;
2204
2205                 if (hi == 0)
2206                         return -1;
2207                 while (lo + 1 < hi) {
2208                         int mid = (lo + hi) / 2;
2209                         if (l->go_to[mid].sym <= sym)
2210                                 lo = mid;
2211                         else
2212                                 hi = mid;
2213                 }
2214                 if (l->go_to[lo].sym == sym)
2215                         return l->go_to[lo].state;
2216                 else
2217                         return -1;
2218         }
2219
2220 ### The state stack.
2221
2222 The core data structure for the parser is the stack.  This track all the
2223 symbols that have been recognised or partially recognised.
2224
2225 The stack usually won't grow very large - maybe a few tens of entries.  So
2226 we dynamically resize and array as required but never bother to shrink it
2227 down again.
2228
2229 We keep the stack as two separate allocations.  One, `asn_stack` stores the
2230 "abstract syntax nodes" which are created by each reduction.  When we call
2231 `do_reduce` we need to pass an array of the `asn`s of the body of the
2232 production, and by keeping a separate `asn` stack, we can just pass a
2233 pointer into this stack.
2234
2235 The other allocation store all other stack fields of which there are two.
2236 The `state` is the most important one and guides the parsing process.  The
2237 `sym` is nearly unnecessary.  However when we want to free entries from the
2238 `asn_stack`, it helps to know what type they are so we can call the right
2239 freeing function.  The symbol leads us to the right free function through
2240 `do_free`.
2241
2242 ###### parser
2243
2244         struct parser {
2245                 int state;
2246                 struct frame {
2247                         short state;
2248                         short sym;
2249                 } *stack;
2250                 void **asn_stack;
2251                 int stack_size;
2252                 int tos;
2253         };
2254
2255 #### Shift and pop
2256
2257 The operations are needed on the stack - shift (which is like push) and pop.
2258
2259 Shift applies no only to terminals but also to non-terminals.  When we
2260 reduce a production we will pop off entries corresponding to the body
2261 symbols, then push on an item for the head of the production.  This last is
2262 exactly the same process as shifting in a terminal so we use the same
2263 function for both.
2264
2265 To simplify other code we arrange for `shift` to fail if there is no `goto`
2266 state for the symbol.  This is useful in basic parsing due to our design
2267 that we shift when we can, and reduce when we cannot.  So the `shift`
2268 function reports if it could.
2269
2270 So `shift` finds the next state.  If that succeed it extends the allocations
2271 if needed and pushed all the information onto the stacks.
2272
2273 ###### parser
2274
2275         static int shift(struct parser *p,
2276                          int sym, void *asn,
2277                          const struct state states[])
2278         {
2279                 // Push an entry onto the stack
2280                 int newstate = search(&states[p->state], sym);
2281                 if (newstate < 0)
2282                         return 0;
2283                 if (p->tos >= p->stack_size) {
2284                         p->stack_size += 10;
2285                         p->stack = realloc(p->stack, p->stack_size
2286                                            * sizeof(p->stack[0]));
2287                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2288                                            * sizeof(p->asn_stack[0]));
2289                 }
2290                 p->stack[p->tos].state = p->state;
2291                 p->stack[p->tos].sym = sym;
2292                 p->asn_stack[p->tos] = asn;
2293                 p->tos++;
2294                 p->state = newstate;
2295                 return 1;
2296         }
2297
2298 `pop` simply moves the top of stack (`tos`) back down the required amount
2299 and frees and `asn` entries that need to be freed.  It is called _after_ we
2300 reduce a production, just before we `shift` the nonterminal in.
2301
2302 ###### parser
2303
2304         static void pop(struct parser *p, int num,
2305                         void(*do_free)(short sym, void *asn))
2306         {
2307                 int i;
2308                 p->tos -= num;
2309                 for (i = 0; i < num; i++)
2310                         do_free(p->stack[p->tos+i].sym,
2311                                 p->asn_stack[p->tos+i]);
2312
2313                 p->state = p->stack[p->tos].state;
2314         }
2315
2316 ### Memory allocation
2317
2318 The `scanner` returns tokens in a local variable - we want them in allocated
2319 memory so they can live in the `asn_stack`.  Similarly the `asn` produce by
2320 a reduce is in a large buffer.  Both of these require some allocation and
2321 copying, hence `memdup` and `tokcopy`.
2322
2323 ###### parser includes
2324         #include <memory.h>
2325
2326 ###### parser
2327
2328         void *memdup(void *m, int len)
2329         {
2330                 void *ret;
2331                 if (len == 0)
2332                         return NULL;
2333                 ret = malloc(len);
2334                 memcpy(ret, m, len);
2335                 return ret;
2336         }
2337
2338         static struct token *tok_copy(struct token tk)
2339         {
2340                 struct token *new = malloc(sizeof(*new));
2341                 *new = tk;
2342                 return new;
2343         }
2344
2345 ### The heart of the parser.
2346
2347 Now we have the parser.  If we can shift, we do.  If not and we can reduce
2348 we do.  If the production we reduced was production zero, then we have
2349 accepted the input and can finish.
2350
2351 If we can neither shift nor reduce we have an error to handle.  We pop
2352 single entries off the stack until we can shift the `TK_error` symbol, the
2353 drop input tokens until we find one we can shift into the new error state.
2354
2355 We return whatever `asn` was returned by reducing production zero.
2356
2357 ###### parser includes
2358         #include "parser.h"
2359 ###### parser
2360         void *parser_run(struct token_state *tokens,
2361                          const struct state states[],
2362                          int (*do_reduce)(int, int, void**, void*, FILE*),
2363                          void (*do_free)(short, void*),
2364                          FILE *trace)
2365         {
2366                 struct parser p = { 0 };
2367                 struct token *tk;
2368                 int accepted = 0;
2369                 void *ret;
2370
2371                 tk = tok_copy(token_next(tokens));
2372                 while (!accepted) {
2373                         if (shift(&p, tk->num, tk, states)) {
2374                                 if (trace) {
2375                                         fputs("Shift ", trace);
2376                                         text_dump(trace, tk->txt, 20);
2377                                         fputs("\n", trace);
2378                                 }
2379                                 tk = tok_copy(token_next(tokens));
2380                                 continue;
2381                         }
2382                         if (states[p.state].reduce_prod >= 0) {
2383                                 void **body;
2384                                 int prod = states[p.state].reduce_prod;
2385                                 int size = states[p.state].reduce_size;
2386                                 int sym = states[p.state].reduce_sym;
2387                                 int bufsize;
2388                                 static char buf[16*1024];
2389
2390                                 body = p.asn_stack +
2391                                         (p.tos - states[p.state].reduce_size);
2392
2393                                 bufsize = do_reduce(prod, p.tos, body,
2394                                                     buf, trace);
2395                                 if (trace)
2396                                         fputs("\n", trace);
2397
2398                                 pop(&p, size, do_free);
2399                                 shift(&p, sym, memdup(buf, bufsize), states);
2400                                 if (prod == 0)
2401                                         accepted = 1;
2402                                 continue;
2403                         }
2404                         /* Error. we walk up the stack until we
2405                          * find a state which will accept TK_error.
2406                          * We then shift in TK_error and see what state
2407                          * that takes us too.
2408                          * Then we discard input tokens until
2409                          * we find one that is acceptable.
2410                          */
2411                         while (shift(&p, TK_error, tk, states) == 0
2412                                && p.tos > 0)
2413                                 // discard this state
2414                                 pop(&p, 1, do_free);
2415                         tk = tok_copy(*tk);
2416                         while (search(&states[p.state], tk->num) < 0 &&
2417                                tk->num != TK_eof) {
2418                                 free(tk);
2419                                 tk = tok_copy(token_next(tokens));
2420                         }
2421                         if (p.tos == 0 && tk->num == TK_eof)
2422                                 break;
2423                 }
2424                 free(tk);
2425                 if (accepted)
2426                         ret = p.asn_stack[0];
2427                 else
2428                         pop(&p, p.tos, do_free);
2429                 free(p.asn_stack);
2430                 free(p.stack);
2431                 return ret;
2432         }
2433
2434 ###### exported functions
2435         void *parser_run(struct token_state *tokens,
2436                          const struct state states[],
2437                          int (*do_reduce)(int, int, void**, void*, FILE*),
2438                          void (*do_free)(short, void*),
2439                          FILE *trace);
2440
2441
2442 # A Worked Example
2443
2444 The obvious example for a parser is a calculator.
2445
2446 As `scanner` provides number parsing function using `libgmp` is it not much
2447 work to perform arbitrary rational number calculations.
2448
2449 This calculator takes one expression, or an equality test per line.  The
2450 results are printed and in any equality test fails, the program exits with
2451 an error.
2452
2453 Embedding mdcode inside mdcode is rather horrible.  I'd like to find a
2454 better approach, but as the grammar file must have 3 components I need
2455 something like this.
2456
2457 ###### File: parsergen.mk
2458         calc.c : parsergen calc.cgm
2459                 ./parsergen -o calc calc.cgm
2460         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2461                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2462
2463 ###### demo grammar
2464
2465         # header
2466         ~~~~~
2467
2468         #include "number.h"
2469         // what do we use for a demo-grammar?  A calculator of course.
2470         struct number {
2471                 mpq_t val;
2472                 char tail[2];
2473                 int err;
2474         };
2475
2476         ~~~~~
2477         # code
2478         ~~~~~
2479
2480         #include <stdlib.h>
2481         #include <unistd.h>
2482         #include <fcntl.h>
2483         #include <sys/mman.h>
2484         #include <stdio.h>
2485         #include <malloc.h>
2486         #include <gmp.h>
2487         #include "mdcode.h"
2488         #include "scanner.h"
2489         #include "number.h"
2490         #include "parser.h"
2491
2492         #include "calc.h"
2493
2494         static void free_number(struct number *n)
2495         {
2496                 mpq_clear(n->val);
2497                 free(n);
2498         }
2499
2500         int main(int argc, char *argv[])
2501         {
2502                 int fd = open(argv[1], O_RDONLY);
2503                 int len = lseek(fd, 0, 2);
2504                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2505                 struct section *s = code_extract(file, file+len, NULL);
2506                 struct token_config config = {
2507                         .ignored = (1 << TK_line_comment)
2508                                  | (1 << TK_block_comment)
2509                                  | (1 << TK_indent)
2510                                  | (1 << TK_undent),
2511                         .number_chars = ".,_+-",
2512                         .word_start = "",
2513                         .word_cont = "",
2514                 };
2515                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2516                 exit(0);
2517         }
2518
2519         ~~~~~
2520         # grammar
2521         ~~~~~
2522
2523         Session -> Session Line
2524                 | Line
2525
2526         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2527                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2528                                         gmp_printf("  or as a decimal: %Fg\n", fl);
2529                                         mpf_clear(fl);
2530                                         }
2531                                      }$
2532                 | Expression = Expression NEWLINE ${
2533                         if (mpq_equal($1.val, $3.val))
2534                                 gmp_printf("Both equal %Qd\n", $1.val);
2535                         else {
2536                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
2537                                         $1.val, $3.val);
2538                                 exit(1);
2539                         }
2540                 }$
2541                 |  NEWLINE ${ printf("Blank line\n"); }$
2542                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2543
2544         $number
2545         Expression -> Expression +  Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2546                 | Expression -  Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2547                 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2548
2549         Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2550                 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2551                 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2552
2553         Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2554                 | ( Expression  ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$