1 # LR(1) Parser Generator #
3 This parser generator takes a grammar description combined with code
4 fragments, analyses it, and can report details about the analysis and
5 write out C code files which can be compiled to make a parser.
7 There are several distinct sections.
9 1. `grammar_read` will read a grammar from a literate-code file and
10 store the productions, symbols, and code fragments.
11 2. `grammar_analyse` will take that grammar and build LR parsing
12 states and look-ahead tables.
13 3. `grammar_report` will present the details of the analysis
14 in a readable format and will report any conflicts.
15 4. `parser_generate` will write out C code files with various
16 tables and with the code fragments arranged in useful places.
17 5. `parser_run` is a library function intended to be linked together
18 with the generated parser tables to complete the implementation of
20 6. `calc.cgm` is a test grammar for a simple calculator.
22 ###### File: parsergen.c
27 ## forward declarations
38 ###### File: libparser.c
47 ###### File: parsergen.mk
50 parsergen.c parsergen.mk calc.cgm libparser.c parser.h : parsergen.mdc
53 ## Reading the grammar
55 The grammar must be stored in a markdown literate code file as parsed
56 by "[mdcode][]". It must have three top level (i.e. unreferenced)
57 sections: `header`, `code`, and `grammar`. The first two will be
58 literally copied into the generated `.c` and `.h`. files. The last
59 contains the grammar. This is tokenised with "[scanner][]".
62 [scanner]: scanner.html
68 ###### parser includes
72 The grammar contains production sets, precedence/associativity
73 declarations, and data type declarations. These are all parsed with
74 _ad hoc_ parsing as we don't have a parser generator yet.
76 The precedence and associativity can be set for each production, but
77 can be inherited from symbols. The data type is potentially defined
78 for each non-terminal and describes what C structure is used to store
79 information about each symbol.
82 enum assoc {Left, Right, Non};
83 char *assoc_names[] = {"Left","Right","Non"};
86 struct text struct_name;
88 unsigned short precedence;
92 unsigned short precedence;
100 The strings reported by `mdcode` and `scanner` are `struct text` which have
101 length rather than being null terminated. To help with printing and
102 comparing we define `text_is` and `prtxt`, which should possibly go in
103 `mdcode`. `scanner` does provide `text_dump` which is useful for strings
104 which might contain control characters.
107 static int text_is(struct text t, char *s)
109 return (strlen(s) == t.len &&
110 strncmp(s, t.txt, t.len) == 0);
112 static void prtxt(struct text t)
114 printf("%.*s", t.len, t.txt);
119 Productions are comprised primarily of symbols - terminal and
120 non-terminal. We do not make a syntactic distinction between the two,
121 though non-terminals must be identifiers. Non-terminal symbols are
122 those which appear in the head of a production, terminal symbols are
123 those which don't. There are also "virtual" symbols used for precedence
124 marking discussed later, and sometimes we won't know what type a symbol
127 ###### forward declarations
128 enum symtype { Unknown, Virtual, Terminal, Nonterminal };
129 char *symtypes = "UVTN";
133 Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
134 table of known symbols and the resulting parser will report them as
135 `TK_reserved + N`. A small set of identifiers are reserved for the
136 different token types that `scanner` can report.
139 static char *reserved_words[] = {
140 [TK_error] = "ERROR",
141 [TK_number] = "NUMBER",
142 [TK_ident] = "IDENTIFIER",
144 [TK_string] = "STRING",
145 [TK_multi_string] = "MULTI_STRING",
146 [TK_indent] = "INDENT",
147 [TK_undent] = "UNDENT",
148 [TK_newline] = "NEWLINE",
154 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
155 recognised. The former is automatically expected at the end of the text
156 being parsed. The latter are always ignored by the parser.
158 All of these symbols are stored in a simple symbol table. We use the
159 `struct text` from `mdcode` to store the name and link them together
160 into a sorted list using an insertion sort.
162 We don't have separate `find` and `insert` functions as any symbol we
163 find needs to be remembered. We simply expect `find` to always return a
164 symbol, but its type might be `Unknown`.
173 ###### grammar fields
178 static int text_cmp(struct text a, struct text b)
183 int cmp = strncmp(a.txt, b.txt, len);
187 return a.len - b.len;
190 static struct symbol *sym_find(struct grammar *g, struct text s)
192 struct symbol **l = &g->syms;
197 (cmp = text_cmp((*l)->name, s)) < 0)
201 n = calloc(1, sizeof(*n));
210 static void symbols_init(struct grammar *g)
212 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
214 for (i = 0; i < entries; i++) {
217 t.txt = reserved_words[i];
220 t.len = strlen(t.txt);
227 ### Data types and precedence.
229 Data type specification and precedence specification are both
230 introduced by a dollar sign at the start of the line. If the next
231 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
232 precedence, otherwise it specifies a data type.
234 The data type name is simply stored and applied to the head of all
235 subsequent productions. It must be the name of a structure, so `$expr`
236 maps to `struct expr`.
238 Any productions given before the first data type will have no data type
239 and can carry no information. In order to allow other non-terminals to
240 have no type, the data type `$void` can be given. This does *not* mean
241 that `struct void` will be used, but rather that no type will be
242 associated with future non-terminals.
244 The precedence line must contain a list of symbols - typically
245 terminal symbols, but not necessarily. It can only contain symbols
246 that have not been seen yet, so precedence declaration must precede
247 the productions that they affect.
249 A precedence line may also contain a "virtual" symbol which is an
250 identifier preceded by `$$`. Virtual terminals carry precedence
251 information but are not included in the grammar. A production can
252 declare that it inherits the precedence of a given virtual symbol.
254 This use for `$$` precludes it from being used as a symbol in the
255 described language. Two other symbols: `${` and `}$` are also
258 Each new precedence line introduces a new precedence level and
259 declares how it associates. This level is stored in each symbol
260 listed and may be inherited by any production which uses the symbol. A
261 production inherits from the last symbol which has a precedence.
263 ###### grammar fields
264 struct text current_type;
268 enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
269 static const char *known[] = { "$$", "${", "}$" };
272 static char *dollar_line(struct token_state *ts, struct grammar *g)
274 struct token t = token_next(ts);
279 if (t.num != TK_ident) {
280 err = "type or assoc expected after '$'";
283 if (text_is(t.txt, "LEFT"))
285 else if (text_is(t.txt, "RIGHT"))
287 else if (text_is(t.txt, "NON"))
290 g->current_type = t.txt;
291 if (text_is(t.txt, "void"))
292 g->current_type.txt = NULL;
294 if (t.num != TK_newline) {
295 err = "Extra tokens after type name";
301 // This is a precedence line, need some symbols.
305 while (t.num != TK_newline) {
306 enum symtype type = Terminal;
308 if (t.num == TK_virtual) {
311 if (t.num != TK_ident) {
312 err = "$$ must be followed by a word";
315 } else if (t.num != TK_ident &&
317 err = "Illegal token in precedence line";
320 s = sym_find(g, t.txt);
321 if (s->type != Unknown) {
322 err = "Symbols in precedence line must not already be known.";
326 s->precedence = g->prec_levels;
331 err = "No symbols given on precedence line";
335 while (t.num != TK_newline && t.num != TK_eof)
342 A production either starts with an identifier which is the head
343 non-terminal, or a vertical bar (`|`) in which case this production
344 uses the same head as the previous one. The identifier must be
345 followed by a `->` mark. All productions for a given non-terminal must
346 be grouped together with the `nonterminal ->` given only once.
348 After this a (possibly empty) sequence of identifiers and marks form
349 the body of the production. A virtual symbol may be given after the
350 body by preceding it with `$$`. If a virtual symbol is given, the
351 precedence of the production is that for the virtual symbol. If none
352 is given, the precedence is inherited from the last symbol in the
353 production which has a precedence specified.
355 After the optional precedence may come the `${` mark. This indicates
356 the start of a code fragment. If present, this must be on the same
357 line as the start of the production.
359 All of the text from the `${` through to the matching `}$` is
360 collected and forms the code-fragment for the production. It must all
361 be in one `code_node` of the literate code. The `}$` must be
362 at the end of a line.
364 Text in the code fragment will undergo substitutions where `$N` for
365 some numeric `N` will be replaced with a variable holding the parse
366 information for the particular symbol in the production. `$0` is the
367 head of the production, `$1` is the first symbol of the body, etc.
368 The type of `$N` for a terminal symbol is `struct token`. For
369 non-terminal, it is whatever has been declared for that symbol.
371 While building productions we will need to add to an array which needs to
375 static void array_add(void *varray, int *cnt, void *new)
377 void ***array = varray;
380 current = ((*cnt-1) | (step-1))+1;
381 if (*cnt == current) {
384 *array = realloc(*array, current * sizeof(void*));
386 (*array)[*cnt] = new;
390 Collecting the code fragment simply involves reading tokens until we
391 hit the end token `}$`, and noting the character position of the start and
395 static struct text collect_code(struct token_state *state,
400 code.txt = start.txt.txt + start.txt.len;
402 t = token_next(state);
403 while (t.node == start.node &&
404 t.num != TK_close && t.num != TK_error &&
406 if (t.num == TK_close && t.node == start.node)
407 code.len = t.txt.txt - code.txt;
413 Now we have all the bits we need to parse a full production.
418 ###### grammar fields
419 struct production **productions;
420 int production_count;
422 ###### production fields
424 struct symbol **body;
429 int first_production;
432 static char *parse_production(struct grammar *g,
434 struct token_state *state)
436 /* Head has already been parsed. */
439 struct production p, *pp;
441 memset(&p, 0, sizeof(p));
443 tk = token_next(state);
444 while (tk.num == TK_ident || tk.num == TK_mark) {
445 struct symbol *bs = sym_find(g, tk.txt);
446 if (bs->type == Unknown)
448 if (bs->type == Virtual) {
449 err = "Virtual symbol not permitted in production";
452 if (bs->precedence) {
453 p.precedence = bs->precedence;
456 array_add(&p.body, &p.body_size, bs);
457 tk = token_next(state);
459 if (tk.num == TK_virtual) {
461 tk = token_next(state);
462 if (tk.num != TK_ident) {
463 err = "word required after $$";
466 vs = sym_find(g, tk.txt);
467 if (vs->type != Virtual) {
468 err = "symbol after $$ must be virtual";
471 p.precedence = vs->precedence;
473 tk = token_next(state);
475 if (tk.num == TK_open) {
476 p.code = collect_code(state, tk);
477 if (p.code.txt == NULL) {
478 err = "code fragment not closed properly";
481 tk = token_next(state);
483 if (tk.num != TK_newline && tk.num != TK_eof) {
484 err = "stray tokens at end of line";
487 pp = malloc(sizeof(*pp));
489 array_add(&g->productions, &g->production_count, pp);
492 while (tk.num != TK_newline && tk.num != TK_eof)
493 tk = token_next(state);
497 With the ability to parse production and dollar-lines, we have nearly all
498 that we need to parse a grammar from a `code_node`.
500 The head of the first production will effectively be the `start` symbol of
501 the grammar. However it won't _actually_ be so. Processing the grammar is
502 greatly simplified if the real start symbol only has a single production,
503 and expects `$eof` as the final terminal. So when we find the first explicit
504 production we insert an extra production as production zero which looks like
506 ###### Example: production 0
509 where `START` is the first non-terminal given.
511 ###### create production zero
512 struct production *p = calloc(1,sizeof(*p));
513 struct text start = {"$start",6};
514 struct text eof = {"$eof",4};
515 p->head = sym_find(g, start);
516 p->head->type = Nonterminal;
517 array_add(&p->body, &p->body_size, head);
518 array_add(&p->body, &p->body_size, sym_find(g, eof));
519 g->start = p->head->num;
520 p->head->first_production = g->production_count;
521 array_add(&g->productions, &g->production_count, p);
523 Now we are ready to read in the grammar.
525 ###### grammar fields
526 int start; // the 'start' symbol.
529 static struct grammar *grammar_read(struct code_node *code)
531 struct token_config conf = {
534 .words_marks = known,
535 .known_count = sizeof(known)/sizeof(known[0]),
537 .ignored = (1 << TK_line_comment)
538 | (1 << TK_block_comment)
541 | (1 << TK_multi_string)
546 struct token_state *state = token_open(code, &conf);
547 struct token tk = token_next(state);
548 struct symbol *head = NULL;
552 g = calloc(1, sizeof(*g));
555 for (tk = token_next(state); tk.num != TK_eof;
556 tk = token_next(state)) {
557 if (tk.num == TK_newline)
559 if (tk.num == TK_ident) {
561 head = sym_find(g, tk.txt);
562 if (head->type == Nonterminal)
563 err = "This non-terminal has already be used.";
564 else if (head->type == Virtual)
565 err = "Virtual symbol not permitted in head of production";
567 head->type = Nonterminal;
568 head->struct_name = g->current_type;
570 ## create production zero
572 head->first_production = g->production_count;
573 tk = token_next(state);
574 if (tk.num == TK_mark &&
575 text_is(tk.txt, "->"))
576 err = parse_production(g, head, state);
578 err = "'->' missing in production";
580 } else if (tk.num == TK_mark
581 && text_is(tk.txt, "|")) {
582 // another production for same non-term
584 err = parse_production(g, head, state);
586 err = "First production must have a head";
587 } else if (tk.num == TK_mark
588 && text_is(tk.txt, "$")) {
589 err = dollar_line(state, g);
591 err = "Unrecognised token at start of line.";
599 fprintf(stderr, "Error at line %d: %s\n",
606 ## Analysing the grammar
608 The central task in analysing the grammar is to determine a set of
609 states to drive the parsing process. This will require finding
610 various sets of symbols and of "items". Some of these sets will need
611 to attach information to each element in the set, so they are more
614 Each "item" may have a 'look-ahead' or `LA` set associated with
615 it. Many of these will be the same as each other. To avoid memory
616 wastage, and to simplify some comparisons of sets, these sets will be
617 stored in a list of unique sets, each assigned a number.
619 Once we have the data structures in hand to manage these sets and
620 lists, we can start setting the 'nullable' flag, build the 'FIRST'
621 sets, and then create the item sets which define the various states.
625 Though we don't only store symbols in these sets, they are the main
626 things we store, so they are called symbol sets or "symsets".
628 A symset has a size, an array of shorts, and an optional array of data
629 values, which are also shorts. If the array of data is not present,
630 we store an impossible pointer, as `NULL` is used to indicate that no
631 memory has been allocated yet.
636 unsigned short *syms, *data;
638 #define NO_DATA ((unsigned short *)1)
639 const struct symset INIT_SYMSET = { 0, NULL, NO_DATA };
640 const struct symset INIT_DATASET = { 0, NULL, NULL };
642 The arrays of shorts are allocated in blocks of 8 and are kept sorted
643 by using an insertion sort. We don't explicitly record the amount of
644 allocated space as it can be derived directly from the current `cnt` using
645 `((cnt - 1) | 7) + 1`.
648 static void symset_add(struct symset *s, unsigned short key, unsigned short val)
651 int current = ((s->cnt-1) | 7) + 1;
652 if (current == s->cnt) {
654 s->syms = realloc(s->syms, sizeof(*s->syms) * current);
655 if (s->data != NO_DATA)
656 s->data = realloc(s->data, sizeof(*s->data) * current);
659 while (i > 0 && s->syms[i-1] > key) {
660 s->syms[i] = s->syms[i-1];
661 if (s->data != NO_DATA)
662 s->data[i] = s->data[i-1];
666 if (s->data != NO_DATA)
671 Finding a symbol (or item) in a `symset` uses a simple binary search.
672 We return the index where the value was found (so data can be accessed),
673 or `-1` to indicate failure.
675 static int symset_find(struct symset *ss, unsigned short key)
682 while (lo + 1 < hi) {
683 int mid = (lo + hi) / 2;
684 if (ss->syms[mid] <= key)
689 if (ss->syms[lo] == key)
694 We will often want to form the union of two symsets. When we do, we
695 will often want to know if anything changed (as that might mean there
696 is more work to do). So `symset_union` reports whether anything was
697 added to the first set. We use a slow quadratic approach as these
698 sets don't really get very big. If profiling shows this to be a problem it
699 can be optimised later.
701 static int symset_union(struct symset *a, struct symset *b)
705 for (i = 0; i < b->cnt; i++)
706 if (symset_find(a, b->syms[i]) < 0) {
707 unsigned short data = 0;
708 if (b->data != NO_DATA)
710 symset_add(a, b->syms[i], data);
716 And of course we must be able to free a symset.
718 static void symset_free(struct symset ss)
721 if (ss.data != NO_DATA)
727 Some symsets are simply stored somewhere appropriate in the `grammar`
728 data structure, others need a bit of indirection. This applies
729 particularly to "LA" sets. These will be paired with "items" in an
730 "itemset". As itemsets will be stored in a symset, the "LA" set needs to be
731 stored in the `data` field, so we need a mapping from a small (short)
732 number to an LA `symset`.
734 As mentioned earlier this involves creating a list of unique symsets.
736 For now, we just use a linear list sorted by insertion. A skiplist
737 would be more efficient and may be added later.
744 struct setlist *next;
747 ###### grammar fields
748 struct setlist *sets;
753 static int ss_cmp(struct symset a, struct symset b)
756 int diff = a.cnt - b.cnt;
759 for (i = 0; i < a.cnt; i++) {
760 diff = (int)a.syms[i] - (int)b.syms[i];
767 static int save_set(struct grammar *g, struct symset ss)
769 struct setlist **sl = &g->sets;
773 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
780 s = malloc(sizeof(*s));
789 Finding a set by number is currently performed by a simple linear search.
790 If this turns out to hurt performance, we can store the sets in a dynamic
791 array like the productions.
793 static struct symset set_find(struct grammar *g, int num)
795 struct setlist *sl = g->sets;
796 while (sl && sl->num != num)
802 ### Setting `nullable`
804 We set `nullable` on the head symbol for any production for which all
805 the body symbols (if any) are nullable. As this is a recursive
806 definition, any change in the `nullable` setting means that we need to
807 re-evaluate where it needs to be set.
809 We simply loop around performing the same calculations until no more
816 static void set_nullable(struct grammar *g)
819 while (check_again) {
822 for (p = 0; p < g->production_count; p++) {
823 struct production *pr = g->productions[p];
826 if (pr->head->nullable)
828 for (s = 0; s < pr->body_size; s++)
829 if (! pr->body[s]->nullable)
831 if (s == pr->body_size) {
832 pr->head->nullable = 1;
839 ### Building the `first` sets
841 When calculating what can follow a particular non-terminal, we will need to
842 know what the "first" terminal in a subsequent non-terminal might be. So
843 we calculate the `first` set for every non-terminal and store them in an
844 array. We don't bother recording the "first" set for terminals as they are
847 As the "first" for one symbol might depend on the "first" of another,
848 we repeat the calculations until no changes happen, just like with
849 `set_nullable`. This makes use of the fact that `symset_union`
850 reports if any change happens.
852 The core of this which finds the "first" of part of a production body
853 will be reused for computing the "follow" sets, so we split it out
854 into a separate function.
856 ###### grammar fields
857 struct symset *first;
861 static int add_first(struct production *pr, int start,
862 struct symset *target, struct grammar *g,
867 for (s = start; s < pr->body_size; s++) {
868 struct symbol *bs = pr->body[s];
869 if (bs->type == Terminal) {
870 if (symset_find(target, bs->num) < 0) {
871 symset_add(target, bs->num, 0);
875 } else if (symset_union(target, &g->first[bs->num]))
881 *to_end = (s == pr->body_size);
885 static void build_first(struct grammar *g)
889 g->first = calloc(g->num_syms, sizeof(g->first[0]));
890 for (s = 0; s < g->num_syms; s++)
891 g->first[s] = INIT_SYMSET;
893 while (check_again) {
896 for (p = 0; p < g->production_count; p++) {
897 struct production *pr = g->productions[p];
898 struct symset *head = &g->first[pr->head->num];
900 if (add_first(pr, 0, head, g, NULL))
906 ### Building the `follow` sets.
908 There are two different situations when we will want to generate "follow"
909 sets. If we are doing an SLR analysis, we want to generate a single
910 "follow" set for each non-terminal in the grammar. That is what is
911 happening here. If we are doing an LALR or LR analysis we will want
912 to generate a separate "LA" set for each item. We do that later
915 There are two parts to generating a "follow" set. Firstly we look at
916 every place that any non-terminal appears in the body of any
917 production, and we find the set of possible "first" symbols after
918 there. This is done using `add_first` above and only needs to be done
919 once as the "first" sets are now stable and will not change.
923 for (p = 0; p < g->production_count; p++) {
924 struct production *pr = g->productions[p];
927 for (b = 0; b < pr->body_size - 1; b++) {
928 struct symbol *bs = pr->body[b];
929 if (bs->type == Terminal)
931 add_first(pr, b+1, &g->follow[bs->num], g, NULL);
935 The second part is to add the "follow" set of the head of a production
936 to the "follow" sets of the final symbol in the production, and any
937 other symbol which is followed only by `nullable` symbols. As this
938 depends on "follow" itself we need to repeatedly perform the process
939 until no further changes happen.
943 for (again = 0, p = 0;
944 p < g->production_count;
945 p < g->production_count-1
946 ? p++ : again ? (again = 0, p = 0)
948 struct production *pr = g->productions[p];
951 for (b = pr->body_size - 1; b >= 0; b--) {
952 struct symbol *bs = pr->body[b];
953 if (bs->type == Terminal)
955 if (symset_union(&g->follow[bs->num],
956 &g->follow[pr->head->num]))
963 We now just need to create and initialise the `follow` list to get a
966 ###### grammar fields
967 struct symset *follow;
970 static void build_follow(struct grammar *g)
973 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
974 for (s = 0; s < g->num_syms; s++)
975 g->follow[s] = INIT_SYMSET;
979 ### Building itemsets and states
981 There are three different levels of detail that can be chosen for
982 building the itemsets and states for the LR grammar. They are:
984 1. LR(0) or SLR(1), where no look-ahead is considered.
985 2. LALR(1) where we build look-ahead sets with each item and merge
986 the LA sets when we find two paths to the same "kernel" set of items.
987 3. LR(1) where different look-ahead for any item in the code means
988 a different state must be created.
990 ###### forward declarations
991 enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
993 We need to be able to look through existing states to see if a newly
994 generated state already exists. For now we use a simple sorted linked
997 An item is a pair of numbers: the production number and the position of
998 "DOT", which is an index into the body of the production.
999 As the numbers are not enormous we can combine them into a single "short"
1000 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1001 production number (so 4000 productions with maximum of 15 symbols in the
1004 Comparing the itemsets will be a little different to comparing symsets
1005 as we want to do the lookup after generating the "kernel" of an
1006 itemset, so we need to ignore the offset=zero items which are added during
1009 To facilitate this, we modify the "DOT" number so that "0" sorts to the end of
1010 the list in the symset, and then only compare items before the first "0".
1013 static inline unsigned short item_num(int production, int index)
1015 return production | ((31-index) << 11);
1017 static inline int item_prod(unsigned short item)
1019 return item & 0x7ff;
1021 static inline int item_index(unsigned short item)
1023 return (31-(item >> 11)) & 0x1f;
1026 For LR(1) analysis we need to compare not just the itemset in a state
1027 but also the LA sets. As we assign each unique LA set a number, we
1028 can just compare the symset and the data values together.
1031 static int itemset_cmp(struct symset a, struct symset b,
1032 enum grammar_type type)
1038 i < a.cnt && i < b.cnt &&
1039 item_index(a.syms[i]) > 0 &&
1040 item_index(b.syms[i]) > 0;
1042 int diff = a.syms[i] - b.syms[i];
1046 diff = a.data[i] - b.data[i];
1051 if (i == a.cnt || item_index(a.syms[i]) == 0)
1055 if (i == b.cnt || item_index(b.syms[i]) == 0)
1061 if (type < LR1 || av == -1)
1064 a.data[i] - b.data[i];
1067 And now we can build the list of itemsets. The lookup routine returns
1068 both a success flag and a pointer to where in the list an insert
1069 should happen, so we don't need to search a second time.
1073 struct itemset *next;
1075 struct symset items;
1076 struct symset go_to;
1080 ###### grammar fields
1081 struct itemset *items;
1085 static int itemset_find(struct grammar *g, struct itemset ***where,
1086 struct symset kernel, enum grammar_type type)
1088 struct itemset **ip;
1090 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1091 struct itemset *i = *ip;
1093 diff = itemset_cmp(i->items, kernel, type);
1106 Adding an itemset may require merging the LA sets if LALR analysis is
1107 happening. If any new LA set adds symbols that weren't in the old LA set, we
1108 clear the `completed` flag so that the dependants of this itemset will be
1109 recalculated and their LA sets updated.
1111 `add_itemset` must consume the symsets it is passed, either by adding
1112 them to a data structure, or freeing them.
1114 static int add_itemset(struct grammar *g, struct symset ss,
1115 enum grammar_type type)
1117 struct itemset **where, *is;
1119 int found = itemset_find(g, &where, ss, type);
1121 is = calloc(1, sizeof(*is));
1122 is->state = g->states;
1126 is->go_to = INIT_DATASET;
1135 for (i = 0; i < ss.cnt; i++) {
1136 struct symset temp = INIT_SYMSET, s;
1137 if (ss.data[i] == is->items.data[i])
1139 s = set_find(g, is->items.data[i]);
1140 symset_union(&temp, &s);
1141 s = set_find(g, ss.data[i]);
1142 if (symset_union(&temp, &s)) {
1143 is->items.data[i] = save_set(g, temp);
1154 To build all the itemsets, we first insert the initial itemset made from the
1155 start symbol, complete each itemset, and then generate new itemsets from old
1156 until no new ones can be made.
1158 Completing an itemset means finding all the items where "DOT" is followed by
1159 a nonterminal and adding "DOT=0" items for every production from that
1160 non-terminal - providing each item hasn't already been added.
1162 If LA sets are needed, the LA set for each new item is found using
1163 `add_first` which was developed earlier for `FIRST` and `FOLLOW`. This may
1164 be supplemented by the LA set for the item which produces the new item.
1166 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1167 is used in the next stage.
1169 NOTE: precedence handling should happen here - I haven't written this yet
1172 ###### complete itemset
1173 for (i = 0; i < is->items.cnt; i++) {
1174 int p = item_prod(is->items.syms[i]);
1175 int bs = item_index(is->items.syms[i]);
1176 struct production *pr = g->productions[p];
1179 struct symset LA = INIT_SYMSET;
1180 unsigned short sn = 0;
1182 if (bs == pr->body_size)
1185 if (symset_find(&done, s->num) < 0)
1186 symset_add(&done, s->num, 0);
1187 if (s->type != Nonterminal)
1193 add_first(pr, bs+1, &LA, g, &to_end);
1195 struct symset ss = set_find(g, is->items.data[i]);
1196 symset_union(&LA, &ss);
1198 sn = save_set(g, LA);
1199 LA = set_find(g, sn);
1201 /* Add productions for this symbol */
1202 for (p2 = s->first_production;
1203 p2 < g->production_count &&
1204 g->productions[p2]->head == s;
1206 int itm = item_num(p2, 0);
1207 int pos = symset_find(&is->items, itm);
1209 symset_add(&is->items, itm, sn);
1210 /* Will have re-ordered, so start
1211 * from beginning again */
1213 } else if (type >= LALR) {
1214 struct symset ss = set_find(g, is->items.data[pos]);
1215 struct symset tmp = INIT_SYMSET;
1217 symset_union(&tmp, &ss);
1218 if (symset_union(&tmp, &LA)) {
1219 is->items.data[pos] = save_set(g, tmp);
1227 For each symbol we found (and placed in `done`) we collect all the items for
1228 which this symbol is next, and create a new itemset with "DOT" advanced over
1229 the symbol. This is then added to the collection of itemsets (or merged
1230 with a pre-existing itemset).
1232 ###### derive itemsets
1233 // Now we have a completed itemset, so we need to
1234 // create all the 'next' itemset and create them
1235 // if they don't exist.
1236 for (i = 0; i < done.cnt; i++) {
1238 unsigned short state;
1239 struct symset newitemset = INIT_SYMSET;
1241 newitemset = INIT_DATASET;
1243 for (j = 0; j < is->items.cnt; j++) {
1244 int itm = is->items.syms[j];
1245 int p = item_prod(itm);
1246 int bp = item_index(itm);
1247 struct production *pr = g->productions[p];
1248 unsigned short la = 0;
1251 if (bp == pr->body_size)
1253 if (pr->body[bp]->num != done.syms[i])
1256 la = is->items.data[j];
1257 pos = symset_find(&newitemset, pr->head->num);
1259 symset_add(&newitemset, item_num(p, bp+1), la);
1260 else if (type >= LALR) {
1261 // Need to merge la set.
1262 int la2 = newitemset.data[pos];
1264 struct symset ss = set_find(g, la2);
1265 struct symset LA = INIT_SYMSET;
1266 symset_union(&LA, &ss);
1267 ss = set_find(g, la);
1268 if (symset_union(&LA, &ss))
1269 newitemset.data[pos] = save_set(g, LA);
1275 state = add_itemset(g, newitemset, type);
1276 if (symset_find(&is->go_to, done.syms[i]) < 0)
1277 symset_add(&is->go_to, done.syms[i], state);
1280 All that is left is to create the initial itemset from production zero, and
1281 with `TK_eof` as the LA set.
1284 static void build_itemsets(struct grammar *g, enum grammar_type type)
1286 struct symset first = INIT_SYMSET;
1289 unsigned short la = 0;
1291 // LA set just has eof
1292 struct symset eof = INIT_SYMSET;
1293 symset_add(&eof, TK_eof, 0);
1294 la = save_set(g, eof);
1295 first = INIT_DATASET;
1297 // production 0, offset 0 (with no data)
1298 symset_add(&first, item_num(0, 0), la);
1299 add_itemset(g, first, type);
1300 for (again = 0, is = g->items;
1302 is = is->next ?: again ? (again = 0, g->items) : NULL) {
1304 struct symset done = INIT_SYMSET;
1315 ### Completing the analysis.
1317 The exact process of analysis depends on which level was requested. For
1318 `LR0` and `LR05` we don't need first and follow sets at all. For `LALR` and
1319 `LR1` we need first, but not follow. For `SLR` we need both.
1321 We don't build the "action" tables that you might expect as the parser
1322 doesn't use them. They will be calculated to some extent if needed for
1325 Once we have built everything we allocate arrays for the two lists:
1326 symbols and itemsets. This allows more efficient access during reporting.
1327 The symbols are grouped as terminals and non-terminals and we record the
1328 changeover point in `first_nonterm`.
1330 ###### grammar fields
1331 struct symbol **symtab;
1332 struct itemset **statetab;
1335 ###### grammar_analyse
1337 static void grammar_analyse(struct grammar *g, enum grammar_type type)
1341 int snum = TK_reserved;
1342 for (s = g->syms; s; s = s->next)
1343 if (s->num < 0 && s->type == Terminal) {
1347 g->first_nonterm = snum;
1348 for (s = g->syms; s; s = s->next)
1354 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1355 for (s = g->syms; s; s = s->next)
1356 g->symtab[s->num] = s;
1365 build_itemsets(g, type);
1367 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1368 for (is = g->items; is ; is = is->next)
1369 g->statetab[is->state] = is;
1372 ## Reporting on the Grammar
1374 The purpose of the report is to give the grammar developer insight into
1375 how the grammar parser will work. It is basically a structured dump of
1376 all the tables that have been generated, plus a description of any conflicts.
1378 ###### grammar_report
1379 static int grammar_report(struct grammar *g, enum grammar_type type)
1385 return report_conflicts(g, type);
1388 Firstly we have the complete list of symbols, together with the "FIRST"
1389 set if that was generated.
1393 static void report_symbols(struct grammar *g)
1397 printf("SYMBOLS + FIRST:\n");
1399 printf("SYMBOLS:\n");
1401 for (n = 0; n < g->num_syms; n++) {
1402 struct symbol *s = g->symtab[n];
1406 printf(" %c%3d%c: ",
1407 s->nullable ? '*':' ',
1408 s->num, symtypes[s->type]);
1411 printf(" (%d%s)", s->precedence,
1412 assoc_names[s->assoc]);
1414 if (g->first && s->type == Nonterminal) {
1417 for (j = 0; j < g->first[n].cnt; j++) {
1420 prtxt(g->symtab[g->first[n].syms[j]]->name);
1427 Then we have the follow sets if they were computed.
1429 static void report_follow(struct grammar *g)
1432 printf("FOLLOW:\n");
1433 for (n = 0; n < g->num_syms; n++)
1434 if (g->follow[n].cnt) {
1438 prtxt(g->symtab[n]->name);
1439 for (j = 0; j < g->follow[n].cnt; j++) {
1442 prtxt(g->symtab[g->follow[n].syms[j]]->name);
1448 And finally the item sets. These include the GO TO tables and, for
1449 LALR and LR1, the LA set for each item. Lots of stuff, so we break
1450 it up a bit. First the items, with production number and associativity.
1452 static void report_item(struct grammar *g, int itm)
1454 int p = item_prod(itm);
1455 int dot = item_index(itm);
1456 struct production *pr = g->productions[p];
1460 prtxt(pr->head->name);
1462 for (i = 0; i < pr->body_size; i++) {
1463 printf(" %s", (dot == i ? ". ": ""));
1464 prtxt(pr->body[i]->name);
1466 if (dot == pr->body_size)
1470 printf(" (%d%s)", pr->precedence,
1471 assoc_names[pr->assoc]);
1475 The LA sets which are (possibly) reported with each item:
1477 static void report_la(struct grammar *g, int lanum)
1479 struct symset la = set_find(g, lanum);
1483 printf(" LOOK AHEAD(%d)", lanum);
1484 for (i = 0; i < la.cnt; i++) {
1487 prtxt(g->symtab[la.syms[i]]->name);
1492 Then the go to sets:
1495 static void report_goto(struct grammar *g, struct symset gt)
1500 for (i = 0; i < gt.cnt; i++) {
1502 prtxt(g->symtab[gt.syms[i]]->name);
1503 printf(" -> %d\n", gt.data[i]);
1507 Now we can report all the item sets complete with items, LA sets, and GO TO.
1509 static void report_itemsets(struct grammar *g)
1512 printf("ITEM SETS(%d)\n", g->states);
1513 for (s = 0; s < g->states; s++) {
1515 struct itemset *is = g->statetab[s];
1516 printf(" Itemset %d:\n", s);
1517 for (j = 0; j < is->items.cnt; j++) {
1518 report_item(g, is->items.syms[j]);
1519 if (is->items.data != NO_DATA)
1520 report_la(g, is->items.data[j]);
1522 report_goto(g, is->go_to);
1526 ### Reporting conflicts
1528 Conflict detection varies a lot among different analysis levels. However
1529 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1530 LR1 are also similar as they have FOLLOW or LA sets.
1534 ## conflict functions
1536 static int report_conflicts(struct grammar *g, enum grammar_type type)
1539 printf("Conflicts:\n");
1541 cnt = conflicts_lr0(g, type);
1543 cnt = conflicts_slr(g, type);
1545 printf(" - no conflicts\n");
1549 LR0 conflicts are any states which have both a reducible item and
1552 LR05 conflicts only occur if two possible reductions exist,
1553 as shifts always over-ride reductions.
1555 ###### conflict functions
1556 static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1560 for (i = 0; i < g->states; i++) {
1561 struct itemset *is = g->statetab[i];
1562 int last_reduce = -1;
1563 int prev_reduce = -1;
1564 int last_shift = -1;
1568 for (j = 0; j < is->items.cnt; j++) {
1569 int itm = is->items.syms[j];
1570 int p = item_prod(itm);
1571 int bp = item_index(itm);
1572 struct production *pr = g->productions[p];
1574 if (bp == pr->body_size) {
1575 prev_reduce = last_reduce;
1579 if (pr->body[bp]->type == Terminal)
1582 if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1583 printf(" State %d has both SHIFT and REDUCE:\n", i);
1584 report_item(g, is->items.syms[last_shift]);
1585 report_item(g, is->items.syms[last_reduce]);
1588 if (prev_reduce >= 0) {
1589 printf(" State %d has 2 (or more) reducible items\n", i);
1590 report_item(g, is->items.syms[prev_reduce]);
1591 report_item(g, is->items.syms[last_reduce]);
1598 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1599 look ahead, or if a symbol in a look-ahead can be shifted. They differ only
1600 in the source of the look ahead set.
1602 We build a dataset mapping terminal to item for possible SHIFTs and then
1603 another for possible REDUCE operations. We report when we get conflicts
1606 static int conflicts_slr(struct grammar *g, enum grammar_type type)
1611 for (i = 0; i < g->states; i++) {
1612 struct itemset *is = g->statetab[i];
1613 struct symset shifts = INIT_DATASET;
1614 struct symset reduce = INIT_DATASET;
1618 /* First collect the shifts */
1619 for (j = 0; j < is->items.cnt; j++) {
1620 unsigned short itm = is->items.syms[j];
1621 int p = item_prod(itm);
1622 int bp = item_index(itm);
1623 struct production *pr = g->productions[p];
1625 if (bp < pr->body_size &&
1626 pr->body[bp]->type == Terminal) {
1628 int sym = pr->body[bp]->num;
1629 if (symset_find(&shifts, sym) < 0)
1630 symset_add(&shifts, sym, itm);
1633 /* Now look for reduction and conflicts */
1634 for (j = 0; j < is->items.cnt; j++) {
1635 unsigned short itm = is->items.syms[j];
1636 int p = item_prod(itm);
1637 int bp = item_index(itm);
1638 struct production *pr = g->productions[p];
1640 if (bp < pr->body_size)
1645 la = g->follow[pr->head->num];
1647 la = set_find(g, is->items.data[j]);
1649 for (k = 0; k < la.cnt; k++) {
1650 int pos = symset_find(&shifts, la.syms[k]);
1652 printf(" State %d has SHIFT/REDUCE conflict on ", i);
1653 prtxt(g->symtab[la.syms[k]]->name);
1655 report_item(g, shifts.data[pos]);
1656 report_item(g, itm);
1659 pos = symset_find(&reduce, la.syms[k]);
1661 symset_add(&reduce, la.syms[k], itm);
1664 printf(" State %d has REDUCE/REDUCE conflict on ", i);
1665 prtxt(g->symtab[la.syms[k]]->name);
1667 report_item(g, itm);
1668 report_item(g, reduce.data[pos]);
1672 symset_free(shifts);
1673 symset_free(reduce);
1679 ## Generating the parser
1681 The exported part of the parser is the `parse_XX` function, where the name
1682 `XX` is based on the name of the parser files.
1684 This takes a `code_node`, a partially initialized `token_config`, and an
1685 optional `FILE` to send tracing to. The `token_config` gets the list of
1686 known words added and then is used with the `code_node` to initialize the
1689 `parse_XX` then calls the library function `parser_run` to actually complete
1690 the parse. This needs the `states` table and function to call the various
1691 pieces of code provided in the grammar file, so they are generated first.
1693 ###### parser_generate
1695 static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1701 gen_reduce(f, g, file);
1704 fprintf(f, "#line 0 \"gen_parser\"\n");
1705 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1708 fprintf(f, "\tstruct token_state *tokens;\n");
1709 fprintf(f, "\tconfig->words_marks = known;\n");
1710 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1711 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1712 fprintf(f, "\ttokens = token_open(code, config);\n");
1713 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1714 fprintf(f, "\ttoken_close(tokens);\n");
1715 fprintf(f, "\treturn rv;\n");
1716 fprintf(f, "}\n\n");
1719 ### Table words table
1721 The known words table is simply an array of terminal symbols.
1722 The table of nonterminals used for tracing is a similar array.
1726 static void gen_known(FILE *f, struct grammar *g)
1729 fprintf(f, "#line 0 \"gen_known\"\n");
1730 fprintf(f, "static const char *known[] = {\n");
1731 for (i = TK_reserved;
1732 i < g->num_syms && g->symtab[i]->type == Terminal;
1734 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1735 g->symtab[i]->name.txt);
1736 fprintf(f, "};\n\n");
1739 static void gen_non_term(FILE *f, struct grammar *g)
1742 fprintf(f, "#line 0 \"gen_non_term\"\n");
1743 fprintf(f, "static const char *non_term[] = {\n");
1744 for (i = TK_reserved;
1747 if (g->symtab[i]->type == Nonterminal)
1748 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1749 g->symtab[i]->name.txt);
1750 fprintf(f, "};\n\n");
1753 ### States and the goto tables.
1755 For each state we record the goto table and the reducible production if
1757 Some of the details of the reducible production are stored in the
1758 `do_reduce` function to come later. Here we store the production number,
1759 the body size (useful for stack management) and the resulting symbol (useful
1760 for knowing how to free data later).
1762 The go to table is stored in a simple array of `sym` and corresponding
1765 ###### exported types
1773 const struct lookup * go_to;
1782 static void gen_goto(FILE *f, struct grammar *g)
1785 fprintf(f, "#line 0 \"gen_goto\"\n");
1786 for (i = 0; i < g->states; i++) {
1788 fprintf(f, "static const struct lookup goto_%d[] = {\n",
1790 struct symset gt = g->statetab[i]->go_to;
1791 for (j = 0; j < gt.cnt; j++)
1792 fprintf(f, "\t{ %d, %d },\n",
1793 gt.syms[j], gt.data[j]);
1800 static void gen_states(FILE *f, struct grammar *g)
1803 fprintf(f, "#line 0 \"gen_states\"\n");
1804 fprintf(f, "static const struct state states[] = {\n");
1805 for (i = 0; i < g->states; i++) {
1806 struct itemset *is = g->statetab[i];
1808 for (j = 0; j < is->items.cnt; j++) {
1809 int itm = is->items.syms[j];
1810 int p = item_prod(itm);
1811 int bp = item_index(itm);
1812 struct production *pr = g->productions[p];
1814 if (bp < pr->body_size)
1816 /* This is what we reduce */
1821 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d },\n",
1822 i, is->go_to.cnt, i, prod,
1823 prod < 0 ? -1 : g->productions[prod]->body_size,
1824 prod < 0 ? -1 : g->productions[prod]->head->num);
1826 fprintf(f, "};\n\n");
1829 ### The `do_reduce` function and the code
1831 When the parser engine decides to reduce a production, it calls `do_reduce`.
1832 This has two functions.
1834 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1835 production being reduced. Secondly it runs the code that was included with
1836 the production if any.
1838 This code needs to be able to store data somewhere. Rather than requiring
1839 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1840 `do_reduce` return the size to be saved.
1842 The code fragment requires translation when written out. Any `$N` needs to
1843 be converted to a reference either to that buffer (if `$0`) or to the
1844 structure returned by a previous reduction. These pointers need to be cast
1845 to the appropriate type for each access. All this is handled in
1851 static void gen_code(struct production *p, FILE *f, struct grammar *g)
1854 fprintf(f, "\t\t\t");
1855 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1864 if (*c < '0' || *c > '9') {
1869 while (c[1] >= '0' && c[1] <= '9') {
1871 n = n * 10 + *c - '0';
1874 fprintf(f, "(*(struct %.*s*)ret)",
1875 p->head->struct_name.len,
1876 p->head->struct_name.txt);
1877 else if (n > p->body_size)
1878 fprintf(f, "$%d", n);
1879 else if (p->body[n-1]->type == Terminal)
1880 fprintf(f, "(*(struct token *)body[%d])",
1882 else if (p->body[n-1]->struct_name.txt == NULL)
1883 fprintf(f, "$%d", n);
1885 fprintf(f, "(*(struct %.*s*)body[%d])",
1886 p->body[n-1]->struct_name.len,
1887 p->body[n-1]->struct_name.txt, n-1);
1894 static void gen_reduce(FILE *f, struct grammar *g, char *file)
1897 fprintf(f, "#line 0 \"gen_reduce\"\n");
1898 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
1899 fprintf(f, " void *ret)\n");
1901 fprintf(f, "\tint ret_size = 0;\n");
1903 fprintf(f, "\tswitch(prod) {\n");
1904 for (i = 0; i < g->production_count; i++) {
1905 struct production *p = g->productions[i];
1906 fprintf(f, "\tcase %d:\n", i);
1911 if (p->head->struct_name.txt)
1912 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
1913 p->head->struct_name.len,
1914 p->head->struct_name.txt);
1916 fprintf(f, "\t\tbreak;\n");
1918 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
1923 As each non-terminal can potentially cause a different type of data
1924 structure to be allocated and filled in, we need to be able to free it when
1927 It is particularly important to have fine control over freeing during error
1928 recovery where individual stack frames might need to be freed.
1930 For this, the grammar author is required to define a `free_XX` function for
1931 each structure that is used by a non-terminal. `do_free` will call whichever
1932 is appropriate given a symbol number, and will call `free` (as is
1933 appropriate for tokens) on any terminal symbol.
1937 static void gen_free(FILE *f, struct grammar *g)
1941 fprintf(f, "#line 0 \"gen_free\"\n");
1942 fprintf(f, "static void do_free(short sym, void *asn)\n");
1944 fprintf(f, "\tif (!asn) return;\n");
1945 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
1946 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
1947 fprintf(f, "\tswitch(sym) {\n");
1949 for (i = 0; i < g->num_syms; i++) {
1950 struct symbol *s = g->symtab[i];
1952 s->type != Nonterminal ||
1953 s->struct_name.txt == NULL)
1956 fprintf(f, "\tcase %d:\n", s->num);
1957 fprintf(f, "\t\tfree_%.*s(asn);\n",
1959 s->struct_name.txt);
1960 fprintf(f, "\t\tbreak;\n");
1962 fprintf(f, "\t}\n}\n\n");
1965 ## The main routine.
1967 There are three key parts to the "main" function of parsergen: processing
1968 the arguments, loading the grammar file, and dealing with the grammar.
1970 ### Argument processing.
1972 Command line options allow the selection of analysis mode, name of output
1973 file, and whether or not a report should be generated. By default we create
1974 a report only if no code output was requested.
1976 The `parse_XX` function name uses the basename of the output file which
1977 should not have a suffix (`.c`). `.c` is added to the given name for the
1978 code, and `.h` is added for the header (if header text is specified in the
1985 static const struct option long_options[] = {
1986 { "LR0", 0, NULL, '0' },
1987 { "LR05", 0, NULL, '5' },
1988 { "SLR", 0, NULL, 'S' },
1989 { "LALR", 0, NULL, 'L' },
1990 { "LR1", 0, NULL, '1' },
1991 { "report", 0, NULL, 'R' },
1992 { "output", 1, NULL, 'o' },
1993 { NULL, 0, NULL, 0 }
1995 const char *options = "05SL1Ro:";
1997 ###### process arguments
1999 char *outfile = NULL;
2003 enum grammar_type type = LR05;
2004 while ((opt = getopt_long(argc, argv, options,
2005 long_options, NULL)) != -1) {
2020 outfile = optarg; break;
2022 fprintf(stderr, "Usage: parsergen ...\n");
2027 infile = argv[optind++];
2029 fprintf(stderr, "No input file given\n");
2032 if (outfile && report == 1)
2035 if (name && strchr(name, '/'))
2036 name = strrchr(name, '/')+1;
2038 if (optind < argc) {
2039 fprintf(stderr, "Excess command line arguments\n");
2043 ### Loading the grammar file
2045 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2048 Once we have extracted the code (with `mdcode`) we expect to find three
2049 sections: header, code, and grammar. Anything else is an error.
2051 "header" and "code" are optional, though it is hard to build a working
2052 parser with neither. "grammar" must be provided.
2056 #include <sys/mman.h>
2061 static void pr_err(char *msg)
2064 fprintf(stderr, "%s\n", msg);
2068 struct section *table;
2072 fd = open(infile, O_RDONLY);
2074 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2075 infile, strerror(errno));
2078 len = lseek(fd, 0, 2);
2079 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2080 table = code_extract(file, file+len, pr_err);
2082 struct code_node *hdr = NULL;
2083 struct code_node *code = NULL;
2084 struct code_node *gram = NULL;
2085 for (s = table; s; s = s->next) {
2086 if (text_is(s->section, "header"))
2088 else if (text_is(s->section, "code"))
2090 else if (text_is(s->section, "grammar"))
2093 fprintf(stderr, "Unknown content section: %.*s\n",
2094 s->section.len, s->section.txt);
2099 ### Processing the input
2101 As we need to append an extension to a filename and then open it for
2102 writing, and we need to do this twice, it helps to have a separate function.
2106 static FILE *open_ext(char *base, char *ext)
2108 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2110 strcat(strcpy(fn, base), ext);
2116 If we can read the grammar, then we analyse and optionally report on it. If
2117 the report finds conflicts we will exit with an error status.
2119 ###### process input
2120 struct grammar *g = NULL;
2122 fprintf(stderr, "No grammar section provided\n");
2125 g = grammar_read(gram);
2127 fprintf(stderr, "Failure to parse grammar\n");
2132 grammar_analyse(g, type);
2134 if (grammar_report(g, type))
2138 If a headers section is defined, we write it out and include a declaration
2139 for the `parse_XX` function so it can be used from separate code.
2141 if (rv == 0 && hdr && outfile) {
2142 FILE *f = open_ext(outfile, ".h");
2144 code_node_print(f, hdr, infile);
2145 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2149 fprintf(stderr, "Cannot create %s.h\n",
2155 And if all goes well and an output file was provided, we create the `.c`
2156 file with the code section (if any) and the parser tables and function.
2158 if (rv == 0 && outfile) {
2159 FILE *f = open_ext(outfile, ".c");
2162 code_node_print(f, code, infile);
2163 gen_parser(f, g, infile, name);
2166 fprintf(stderr, "Cannot create %s.c\n",
2172 And that about wraps it up. We need to set the locale so that UTF-8 is
2173 recognised properly, and link with `libicuuc` as `libmdcode` requires that.
2175 ###### File: parsergen.mk
2176 parsergen : parsergen.o libscanner.o libmdcode.o
2177 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2184 int main(int argc, char *argv[])
2189 setlocale(LC_ALL,"");
2191 ## process arguments
2198 ## The SHIFT/REDUCE parser
2200 Having analysed the grammar and generated all the tables, we only need the
2201 shift/reduce engine to bring it all together.
2203 ### Goto table lookup
2205 The parser generator has nicely provided us with goto tables sorted by
2206 symbol number. We need a binary search function to find a symbol in the
2209 ###### parser functions
2211 static int search(const struct state *l, int sym)
2214 int hi = l->go_to_cnt;
2218 while (lo + 1 < hi) {
2219 int mid = (lo + hi) / 2;
2220 if (l->go_to[mid].sym <= sym)
2225 if (l->go_to[lo].sym == sym)
2226 return l->go_to[lo].state;
2231 ### The state stack.
2233 The core data structure for the parser is the stack. This track all the
2234 symbols that have been recognised or partially recognised.
2236 The stack usually won't grow very large - maybe a few tens of entries. So
2237 we dynamically resize the array as required but never bother to shrink it
2240 We keep the stack as two separate allocations. One, `asn_stack` stores the
2241 "abstract syntax nodes" which are created by each reduction. When we call
2242 `do_reduce` we need to pass an array of the `asn`s of the body of the
2243 production, and by keeping a separate `asn` stack, we can just pass a
2244 pointer into this stack.
2246 The other allocation stores all other stack fields of which there are two.
2247 The `state` is the most important one and guides the parsing process. The
2248 `sym` is nearly unnecessary. However when we want to free entries from the
2249 `asn_stack`, it helps to know what type they are so we can call the right
2250 freeing function. The symbol leads us to the right free function through
2253 ###### parser functions
2268 The operations are needed on the stack - shift (which is like push) and pop.
2270 Shift applies not only to terminals but also to non-terminals. When we
2271 reduce a production we will pop off entries corresponding to the body
2272 symbols, then push on an item for the head of the production. This last is
2273 exactly the same process as shifting in a terminal so we use the same
2276 To simplify other code we arrange for `shift` to fail if there is no `goto`
2277 state for the symbol. This is useful in basic parsing due to our design
2278 that we shift when we can, and reduce when we cannot. So the `shift`
2279 function reports if it could.
2281 So `shift` finds the next state. If that succeeds it extends the allocations
2282 if needed and pushes all the information onto the stacks.
2284 ###### parser functions
2286 static int shift(struct parser *p,
2288 const struct state states[])
2290 // Push an entry onto the stack
2291 int newstate = search(&states[p->state], sym);
2294 if (p->tos >= p->stack_size) {
2295 p->stack_size += 10;
2296 p->stack = realloc(p->stack, p->stack_size
2297 * sizeof(p->stack[0]));
2298 p->asn_stack = realloc(p->asn_stack, p->stack_size
2299 * sizeof(p->asn_stack[0]));
2301 p->stack[p->tos].state = p->state;
2302 p->stack[p->tos].sym = sym;
2303 p->asn_stack[p->tos] = asn;
2305 p->state = newstate;
2309 `pop` simply moves the top of stack (`tos`) back down the required amount
2310 and frees any `asn` entries that need to be freed. It is called _after_ we
2311 reduce a production, just before we `shift` the nonterminal in.
2313 ###### parser functions
2315 static void pop(struct parser *p, int num,
2316 void(*do_free)(short sym, void *asn))
2320 for (i = 0; i < num; i++)
2321 do_free(p->stack[p->tos+i].sym,
2322 p->asn_stack[p->tos+i]);
2324 p->state = p->stack[p->tos].state;
2327 ### Memory allocation
2329 The `scanner` returns tokens in a local variable - we want them in allocated
2330 memory so they can live in the `asn_stack`. Similarly the `asn` produced by
2331 a reduce is in a large buffer. Both of these require some allocation and
2332 copying, hence `memdup` and `tokcopy`.
2334 ###### parser includes
2337 ###### parser functions
2339 void *memdup(void *m, int len)
2345 memcpy(ret, m, len);
2349 static struct token *tok_copy(struct token tk)
2351 struct token *new = malloc(sizeof(*new));
2356 ### The heart of the parser.
2358 Now we have the parser. If we can shift, we do. If not and we can reduce
2359 we do. If the production we reduced was production zero, then we have
2360 accepted the input and can finish.
2362 If we can neither shift nor reduce we have an error to handle. We pop
2363 single entries off the stack until we can shift the `TK_error` symbol, then
2364 drop input tokens until we find one we can shift into the new error state.
2366 We return whatever `asn` was returned by reducing production zero.
2368 ###### parser includes
2371 void *parser_run(struct token_state *tokens,
2372 const struct state states[],
2373 int (*do_reduce)(int, int, void**, void*),
2374 void (*do_free)(short, void*),
2375 FILE *trace, const char *non_term[], int knowns)
2377 struct parser p = { 0 };
2382 tk = tok_copy(token_next(tokens));
2385 parser_trace(trace, &p, tk, states, non_term, knowns);
2387 if (shift(&p, tk->num, tk, states)) {
2388 tk = tok_copy(token_next(tokens));
2391 if (states[p.state].reduce_prod >= 0) {
2393 int prod = states[p.state].reduce_prod;
2394 int size = states[p.state].reduce_size;
2395 int sym = states[p.state].reduce_sym;
2397 static char buf[16*1024];
2399 body = p.asn_stack +
2400 (p.tos - states[p.state].reduce_size);
2402 bufsize = do_reduce(prod, p.tos, body,
2405 pop(&p, size, do_free);
2406 shift(&p, sym, memdup(buf, bufsize), states);
2411 /* Error. we walk up the stack until we
2412 * find a state which will accept TK_error.
2413 * We then shift in TK_error and see what state
2414 * that takes us too.
2415 * Then we discard input tokens until
2416 * we find one that is acceptable.
2418 while (shift(&p, TK_error, tk, states) == 0
2420 // discard this state
2421 pop(&p, 1, do_free);
2423 while (search(&states[p.state], tk->num) < 0 &&
2424 tk->num != TK_eof) {
2426 tk = tok_copy(token_next(tokens));
2428 if (p.tos == 0 && tk->num == TK_eof)
2433 ret = p.asn_stack[0];
2435 pop(&p, p.tos, do_free);
2441 ###### exported functions
2442 void *parser_run(struct token_state *tokens,
2443 const struct state states[],
2444 int (*do_reduce)(int, int, void**, void*),
2445 void (*do_free)(short, void*),
2446 FILE *trace, const char *non_term[], int knowns);
2450 Being able to visualize the parser in action can be invaluable when
2451 debugging the parser code, or trying to understand how the parse of a
2452 particular grammar progresses. The stack contains all the important
2453 state, so just printing out the stack every time around the parse loop
2454 can make it possible to see exactly what is happening.
2456 This doesn't explicitly show each SHIFT and REDUCE action. However they
2457 are easily deduced from the change between consecutive lines, and the
2458 details of each state can be found by cross referencing the states list
2459 in the stack with the "report" that parsergen can generate.
2461 For terminal symbols, we just dump the token. For non-terminals we
2462 print the name of the symbol. The look ahead token is reported at the
2463 end inside square brackets.
2465 ###### parser functions
2467 void parser_trace(FILE *trace, struct parser *p,
2468 struct token *tk, const struct state states[],
2469 const char *non_term[], int knowns)
2472 for (i = 0; i < p->tos; i++) {
2473 int sym = p->stack[i].sym;
2474 fprintf(trace, "(%d) ", p->stack[i].state);
2475 if (sym < TK_reserved + knowns) {
2476 struct token *t = p->asn_stack[i];
2477 text_dump(trace, t->txt, 20);
2479 fputs(non_term[sym - TK_reserved - knowns],
2483 fprintf(trace, "(%d) [", p->state);
2484 text_dump(trace, tk->txt, 20);
2485 fputs("]\n", trace);
2490 The obvious example for a parser is a calculator.
2492 As `scanner` provides a number parsing function using `libgmp`, it is not much
2493 work to perform arbitrary rational number calculations.
2495 This calculator takes one expression, or an equality test per line. The
2496 results are printed and if any equality test fails, the program exits with
2499 Embedding mdcode inside mdcode is rather horrible. I'd like to find a
2500 better approach, but as the grammar file must have 3 components I need
2501 something like this.
2503 ###### File: parsergen.mk
2504 calc.c : parsergen calc.cgm
2505 ./parsergen -o calc calc.cgm
2506 calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2507 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2515 // what do we use for a demo-grammar? A calculator of course.
2529 #include <sys/mman.h>
2534 #include "scanner.h"
2540 static void free_number(struct number *n)
2546 int main(int argc, char *argv[])
2548 int fd = open(argv[1], O_RDONLY);
2549 int len = lseek(fd, 0, 2);
2550 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2551 struct section *s = code_extract(file, file+len, NULL);
2552 struct token_config config = {
2553 .ignored = (1 << TK_line_comment)
2554 | (1 << TK_block_comment)
2557 .number_chars = ".,_+-",
2561 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2569 Session -> Session Line
2572 Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2573 { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2574 gmp_printf(" or as a decimal: %Fg\n", fl);
2578 | Expression = Expression NEWLINE ${
2579 if (mpq_equal($1.val, $3.val))
2580 gmp_printf("Both equal %Qd\n", $1.val);
2582 gmp_printf("NOT EQUAL: %Qd\n != : %Qd\n",
2587 | NEWLINE ${ printf("Blank line\n"); }$
2588 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2591 Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2592 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2593 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2595 Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2596 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2597 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2599 Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2600 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$