ocean-lang.org Git - ocean/blob - csrc/parsergen.mdc

   1 # LR(1) Parser Generator #
   2
   3 This parser generator takes a grammar description combined with code
   4 fragments, analyses it, and can report details about the analysis and
   5 write out C code files which can be compiled to make a parser.
   6
   7 There are several distinct sections.
   8
   9 1. `grammar_read` will read a grammar from a literate-code file and
  10    store the productions, symbols, and code fragments.
  11 2. `grammar_analyse` will take that grammar and build LR parsing
  12    states and look-ahead tables.
  13 3. `grammar_report` will present the details of the analysis
  14    in a readable format and will report any conflicts.
  15 4. `parser_generate` will write out C code files with various
  16    tables and with the code fragments arranged in useful places.
  17 5. `parser_run` is a library function intended to be linked together
  18    with the generated parser tables to complete the implementation of
  19    a parser.
  20 6. Finally `calc` is a test grammar for a simple calculator.  The
  21    `parsergen` program built from the C code in this file can extract
  22    that grammar directly from this file and process it.
  23
  24
  25 ###### File: parsergen.c
  26         #include <unistd.h>
  27         #include <stdlib.h>
  28         #include <stdio.h>
  29         ## includes
  30         ## forward declarations
  31         ## declarations
  32         ## functions
  33         ## grammar_read
  34         ## grammar_analyse
  35         ## grammar_report
  36         ## parser_generate
  37         ## main
  38 ###### File: parser.h
  39         ## exported types
  40         ## exported functions
  41 ###### File: libparser.c
  42         #include <unistd.h>
  43         #include <stdlib.h>
  44         #include <stdio.h>
  45         ## parser includes
  46         ## parser functions
  47         ## parser_run
  48 ###### File: parsergen.mk
  49         CFLAGS += -Wall -g
  50         all :: parsergen calc
  51         parsergen.c parsergen.mk libparser.c parser.h : parsergen.mdc
  52                 ./md2c parsergen.mdc
  53
  54 ## Reading the grammar
  55
  56 The grammar must be stored in a markdown literate code file as parsed
  57 by "[mdcode][]".  It must have three top level (i.e. unreferenced)
  58 sections: `header`, `code`, and `grammar`.  The first two will be
  59 literally copied into the generated `.c` and `.h`. files.  The last
  60 contains the grammar.  This is tokenised with "[scanner][]".
  61
  62 If the `--tag` option is given, then any top level header that doesn't
  63 start with the tag is ignored, and the tag is striped from the rest.  So
  64 `--tag Foo`
  65 means that the three needed sections must be `Foo: header`, `Foo: code`,
  66 and `Foo: grammar`.
  67
  68 [mdcode]: mdcode.html
  69 [scanner]: scanner.html
  70
  71 ###### includes
  72         #include "mdcode.h"
  73         #include "scanner.h"
  74
  75 ###### parser includes
  76         #include "mdcode.h"
  77         #include "scanner.h"
  78
  79 The grammar contains production sets, precedence/associativity
  80 declarations, and data type declarations.  These are all parsed with
  81 _ad hoc_ parsing as we don't have a parser generator yet.
  82
  83 The precedence and associativity can be set for each production, but
  84 can be inherited from symbols.  The data type (either structure or a
  85 reference to a structure) is potentially defined for each non-terminal
  86 and describes what C structure is used to store information about each
  87 symbol.
  88
  89 ###### declarations
  90         enum assoc {Left, Right, Non};
  91         char *assoc_names[] = {"Left","Right","Non"};
  92
  93         struct symbol {
  94                 struct text struct_name;
  95                 int isref;
  96                 enum assoc assoc;
  97                 unsigned short precedence;
  98                 ## symbol fields
  99         };
 100         struct production {
 101                 unsigned short precedence;
 102                 enum assoc assoc;
 103                 ## production fields
 104         };
 105         struct grammar {
 106                 ## grammar fields
 107         };
 108
 109 The strings reported by `mdcode` and `scanner` are `struct text` which have
 110 length rather than being null terminated.  To help with printing and
 111 comparing we define `text_is` and `prtxt`, which should possibly go in
 112 `mdcode`.  `scanner` does provide `text_dump` which is useful for strings
 113 which might contain control characters.
 114
 115 `strip_tag` is a bit like `strncmp`, but adds a test for a colon,
 116 because that is what we need to detect tags.
 117
 118 ###### functions
 119         static int text_is(struct text t, char *s)
 120         {
 121                 return (strlen(s) == t.len &&
 122                         strncmp(s, t.txt, t.len) == 0);
 123         }
 124         static void prtxt(struct text t)
 125         {
 126                 printf("%.*s", t.len, t.txt);
 127         }
 128
 129         static int strip_tag(struct text *t, char *tag)
 130         {
 131                 int skip = strlen(tag) + 1;
 132                 if (skip >= t->len ||
 133                     strncmp(t->txt, tag, skip-1) != 0 ||
 134                     t->txt[skip-1] != ':')
 135                         return 0;
 136                 while (skip < t->len && t->txt[skip] == ' ')
 137                         skip++;
 138                 t->len -= skip;
 139                 t->txt += skip;
 140                 return 1;
 141         }
 142
 143 ### Symbols
 144
 145 Productions are comprised primarily of symbols - terminal and
 146 non-terminal.  We do not make a syntactic distinction between the two,
 147 though non-terminals must be identifiers.  Non-terminal symbols are
 148 those which appear in the head of a production, terminal symbols are
 149 those which don't.  There are also "virtual" symbols used for precedence
 150 marking discussed later, and sometimes we won't know what type a symbol
 151 is yet.
 152
 153 ###### forward declarations
 154         enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 155         char *symtypes = "UVTN";
 156 ###### symbol fields
 157         enum symtype type;
 158
 159 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 160 table of known symbols and the resulting parser will report them as
 161 `TK_reserved + N`.  A small set of identifiers are reserved for the
 162 different token types that `scanner` can report.
 163
 164 ###### declarations
 165         static char *reserved_words[] = {
 166                 [TK_error]        = "ERROR",
 167                 [TK_number]       = "NUMBER",
 168                 [TK_ident]        = "IDENTIFIER",
 169                 [TK_mark]         = "MARK",
 170                 [TK_string]       = "STRING",
 171                 [TK_multi_string] = "MULTI_STRING",
 172                 [TK_in]           = "IN",
 173                 [TK_out]          = "OUT",
 174                 [TK_newline]      = "NEWLINE",
 175                 [TK_eof]          = "$eof",
 176         };
 177 ###### symbol fields
 178         short num;
 179
 180 Note that `TK_eof` and the two `TK_*_comment` tokens cannot be
 181 recognised.  The former is automatically expected at the end of the text
 182 being parsed. The latter are always ignored by the parser.
 183
 184 All of these symbols are stored in a simple symbol table.  We use the
 185 `struct text` from `mdcode` to store the name and link them together
 186 into a sorted list using an insertion sort.
 187
 188 We don't have separate `find` and `insert` functions as any symbol we
 189 find needs to be remembered.  We simply expect `find` to always return a
 190 symbol, but its type might be `Unknown`.
 191
 192 ###### includes
 193         #include <string.h>
 194
 195 ###### symbol fields
 196         struct text name;
 197         struct symbol *next;
 198
 199 ###### grammar fields
 200         struct symbol *syms;
 201         int num_syms;
 202
 203 ###### functions
 204         static int text_cmp(struct text a, struct text b)
 205         {
 206                 int len = a.len;
 207                 if (a.len > b.len)
 208                         len = b.len;
 209                 int cmp = strncmp(a.txt, b.txt, len);
 210                 if (cmp)
 211                         return cmp;
 212                 else
 213                         return a.len - b.len;
 214         }
 215
 216         static struct symbol *sym_find(struct grammar *g, struct text s)
 217         {
 218                 struct symbol **l = &g->syms;
 219                 struct symbol *n;
 220                 int cmp = 1;
 221
 222                 while (*l &&
 223                         (cmp = text_cmp((*l)->name, s)) < 0)
 224                                 l = & (*l)->next;
 225                 if (cmp == 0)
 226                         return *l;
 227                 n = calloc(1, sizeof(*n));
 228                 n->name = s;
 229                 n->type = Unknown;
 230                 n->next = *l;
 231                 n->num = -1;
 232                 *l = n;
 233                 return n;
 234         }
 235
 236         static void symbols_init(struct grammar *g)
 237         {
 238                 int entries = sizeof(reserved_words)/sizeof(reserved_words[0]);
 239                 int i;
 240                 for (i = 0; i < entries; i++) {
 241                         struct text t;
 242                         struct symbol *s;
 243                         t.txt = reserved_words[i];
 244                         if (!t.txt)
 245                                 continue;
 246                         t.len = strlen(t.txt);
 247                         s = sym_find(g, t);
 248                         s->type = Terminal;
 249                         s->num = i;
 250                 }
 251         }
 252
 253 ### Data types and precedence.
 254
 255 Data type specification and precedence specification are both
 256 introduced by a dollar sign at the start of the line.  If the next
 257 word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
 258 precedence, otherwise it specifies a data type.
 259
 260 The data type name is simply stored and applied to the head of all
 261 subsequent productions.  It must be the name of a structure optionally
 262 preceded by an asterisk which means a reference or "pointer".  So
 263 `$expression` maps to `struct expression` and `$*statement` maps to
 264 `struct statement *`.
 265
 266 Any productions given before the first data type declaration will have
 267 no data type associated with them and can carry no information.  In
 268 order to allow other non-terminals to have no type, the data type
 269 `$void` can be given.  This does *not* mean that `struct void` will be
 270 used, but rather than no type will be associated with future
 271 non-terminals.
 272
 273 The precedence line must contain a list of symbols - typically
 274 terminal symbols, but not necessarily.  It can only contain symbols
 275 that have not been seen yet, so precedence declaration must precede
 276 the productions that they affect.
 277
 278 A precedence line may also contain a "virtual" symbol which is an
 279 identifier preceded by `$$`.  Virtual terminals carry precedence
 280 information but are not included in the grammar.  A production can
 281 declare that it inherits the precedence of a given virtual symbol.
 282
 283 This use for `$$` precludes it from being used as a symbol in the
 284 described language.  Two other symbols: `${` and `}$` are also
 285 unavailable.
 286
 287 Each new precedence line introduces a new precedence level and
 288 declares how it associates.  This level is stored in each symbol
 289 listed and may be inherited by any production which uses the symbol.  A
 290 production inherits from the last symbol which has a precedence.
 291
 292 ###### grammar fields
 293         struct text current_type;
 294         int type_isref;
 295         int prec_levels;
 296
 297 ###### declarations
 298         enum symbols { TK_virtual = TK_reserved, TK_open, TK_close };
 299         static const char *known[] = { "$$", "${", "}$" };
 300
 301 ###### functions
 302         static char *dollar_line(struct token_state *ts, struct grammar *g, int isref)
 303         {
 304                 struct token t = token_next(ts);
 305                 char *err;
 306                 enum assoc assoc;
 307                 int found;
 308
 309                 if (t.num != TK_ident) {
 310                         err = "type or assoc expected after '$'";
 311                         goto abort;
 312                 }
 313                 if (text_is(t.txt, "LEFT"))
 314                         assoc = Left;
 315                 else if (text_is(t.txt, "RIGHT"))
 316                         assoc = Right;
 317                 else if (text_is(t.txt, "NON"))
 318                         assoc = Non;
 319                 else {
 320                         g->current_type = t.txt;
 321                         g->type_isref = isref;
 322                         if (text_is(t.txt, "void"))
 323                                 g->current_type.txt = NULL;
 324                         t = token_next(ts);
 325                         if (t.num != TK_newline) {
 326                                 err = "Extra tokens after type name";
 327                                 goto abort;
 328                         }
 329                         return NULL;
 330                 }
 331
 332                 if (isref) {
 333                         err = "$* cannot be followed by a precedence";
 334                         goto abort;
 335                 }
 336
 337                 // This is a precedence line, need some symbols.
 338                 found = 0;
 339                 g->prec_levels += 1;
 340                 t = token_next(ts);
 341                 while (t.num != TK_newline) {
 342                         enum symtype type = Terminal;
 343                         struct symbol *s;
 344                         if (t.num == TK_virtual) {
 345                                 type = Virtual;
 346                                 t = token_next(ts);
 347                                 if (t.num != TK_ident) {
 348                                         err = "$$ must be followed by a word";
 349                                         goto abort;
 350                                 }
 351                         } else if (t.num != TK_ident &&
 352                                    t.num != TK_mark) {
 353                                 err = "Illegal token in precedence line";
 354                                 goto abort;
 355                         }
 356                         s = sym_find(g, t.txt);
 357                         if (s->type != Unknown) {
 358                                 err = "Symbols in precedence line must not already be known.";
 359                                 goto abort;
 360                         }
 361                         s->type = type;
 362                         s->precedence = g->prec_levels;
 363                         s->assoc = assoc;
 364                         found += 1;
 365                 }
 366                 if (found == 0)
 367                         err = "No symbols given on precedence line";
 368                         goto abort;
 369                 return NULL;
 370         abort:
 371                 while (t.num != TK_newline && t.num != TK_eof)
 372                         t = token_next(ts);
 373                 return err;
 374         }
 375
 376 ### Productions
 377
 378 A production either starts with an identifier which is the head
 379 non-terminal, or a vertical bar (`|`) in which case this production
 380 uses the same head as the previous one.  The identifier must be
 381 followed by a `->` mark.  All productions for a given non-terminal must
 382 be grouped together with the `nonterminal ->` given only once.
 383
 384 After this a (possibly empty) sequence of identifiers and marks form
 385 the body of the production.  A virtual symbol may be given after the
 386 body by preceding it with `$$`.  If a virtual symbol is given, the
 387 precedence of the production is that for the virtual symbol.  If none
 388 is given, the precedence is inherited from the last symbol in the
 389 production which has a precedence specified.
 390
 391 After the optional precedence may come the `${` mark.  This indicates
 392 the start of a code fragment.  If present, this must be on the same
 393 line as the start of the production.
 394
 395 All of the text from the `${` through to the matching `}$` is
 396 collected and forms the code-fragment for the production.  It must all
 397 be in one `code_node` of the literate code.  The `}$` must be
 398 at the end of a line.
 399
 400 Text in the code fragment will undergo substitutions where `$N` for
 401 some numeric `N` will be replaced with a variable holding the parse
 402 information for the particular symbol in the production.  `$0` is the
 403 head of the production, `$1` is the first symbol of the body, etc.
 404 The type of `$N` for a terminal symbol is `struct token`.  For
 405 a non-terminal, it is whatever has been declared for that symbol.
 406
 407 While building productions we will need to add to an array which needs to
 408 grow dynamically.
 409
 410 ###### functions
 411         static void array_add(void *varray, int *cnt, void *new)
 412         {
 413                 void ***array = varray;
 414                 int current = 0;
 415                 const int step = 8;
 416                 current = ((*cnt-1) | (step-1))+1;
 417                 if (*cnt == current) {
 418                         /* must grow */
 419                         current += step;
 420                         *array = realloc(*array, current * sizeof(void*));
 421                 }
 422                 (*array)[*cnt] = new;
 423                 (*cnt) += 1;
 424         }
 425
 426 Collecting the code fragment simply involves reading tokens until we
 427 hit the end token `}$`, and noting the character position of the start and
 428 the end.
 429
 430 ###### functions
 431         static struct text collect_code(struct token_state *state,
 432                                         struct token start)
 433         {
 434                 struct text code;
 435                 struct token t;
 436                 code.txt = start.txt.txt + start.txt.len;
 437                 do
 438                         t = token_next(state);
 439                 while (t.node == start.node &&
 440                        t.num != TK_close && t.num != TK_error &&
 441                        t.num != TK_eof);
 442                 if (t.num == TK_close && t.node == start.node)
 443                         code.len = t.txt.txt - code.txt;
 444                 else
 445                         code.txt = NULL;
 446                 return code;
 447         }
 448
 449 Now we have all the bit we need to parse a full production.
 450
 451 ###### includes
 452         #include <memory.h>
 453
 454 ###### grammar fields
 455         struct production **productions;
 456         int production_count;
 457
 458 ###### production fields
 459         struct symbol  *head;
 460         struct symbol **body;
 461         int             body_size;
 462         struct text     code;
 463
 464 ###### symbol fields
 465         int first_production;
 466
 467 ###### functions
 468         static char *parse_production(struct grammar *g,
 469                                       struct symbol *head,
 470                                       struct token_state *state)
 471         {
 472                 /* Head has already been parsed. */
 473                 struct token tk;
 474                 char *err;
 475                 struct production p, *pp;
 476
 477                 memset(&p, 0, sizeof(p));
 478                 p.head = head;
 479                 tk = token_next(state);
 480                 while (tk.num == TK_ident || tk.num == TK_mark) {
 481                         struct symbol *bs = sym_find(g, tk.txt);
 482                         if (bs->type == Unknown)
 483                                 bs->type = Terminal;
 484                         if (bs->type == Virtual) {
 485                                 err = "Virtual symbol not permitted in production";
 486                                 goto abort;
 487                         }
 488                         if (bs->precedence) {
 489                                 p.precedence = bs->precedence;
 490                                 p.assoc = bs->assoc;
 491                         }
 492                         array_add(&p.body, &p.body_size, bs);
 493                         tk = token_next(state);
 494                 }
 495                 if (tk.num == TK_virtual) {
 496                         struct symbol *vs;
 497                         tk = token_next(state);
 498                         if (tk.num != TK_ident) {
 499                                 err = "word required after $$";
 500                                 goto abort;
 501                         }
 502                         vs = sym_find(g, tk.txt);
 503                         if (vs->type != Virtual) {
 504                                 err = "symbol after $$ must be virtual";
 505                                 goto abort;
 506                         }
 507                         p.precedence = vs->precedence;
 508                         p.assoc = vs->assoc;
 509                         tk = token_next(state);
 510                 }
 511                 if (tk.num == TK_open) {
 512                         p.code = collect_code(state, tk);
 513                         if (p.code.txt == NULL) {
 514                                 err = "code fragment not closed properly";
 515                                 goto abort;
 516                         }
 517                         tk = token_next(state);
 518                 }
 519                 if (tk.num != TK_newline && tk.num != TK_eof) {
 520                         err = "stray tokens at end of line";
 521                         goto abort;
 522                 }
 523                 pp = malloc(sizeof(*pp));
 524                 *pp = p;
 525                 array_add(&g->productions, &g->production_count, pp);
 526                 return NULL;
 527         abort:
 528                 while (tk.num != TK_newline && tk.num != TK_eof)
 529                         tk = token_next(state);
 530                 return err;
 531         }
 532
 533 With the ability to parse production and dollar-lines, we have nearly all
 534 that we need to parse a grammar from a `code_node`.
 535
 536 The head of the first production will effectively be the `start` symbol of
 537 the grammar.  However it won't _actually_ be so.  Processing the grammar is
 538 greatly simplified if the real start symbol only has a single production,
 539 and expects `$eof` as the final terminal.  So when we find the first
 540 explicit production we insert an extra production as production zero which
 541 looks like
 542
 543 ###### Example: production 0
 544         $start -> START $eof
 545
 546 where `START` is the first non-terminal given.
 547
 548 ###### create production zero
 549         struct production *p = calloc(1,sizeof(*p));
 550         struct text start = {"$start",6};
 551         struct text eof = {"$eof",4};
 552         p->head = sym_find(g, start);
 553         p->head->type = Nonterminal;
 554         array_add(&p->body, &p->body_size, head);
 555         array_add(&p->body, &p->body_size, sym_find(g, eof));
 556         p->head->first_production = g->production_count;
 557         array_add(&g->productions, &g->production_count, p);
 558
 559 Now we are ready to read in the grammar.
 560
 561 ###### grammar_read
 562         static struct grammar *grammar_read(struct code_node *code)
 563         {
 564                 struct token_config conf = {
 565                         .word_start = "",
 566                         .word_cont = "",
 567                         .words_marks = known,
 568                         .known_count = sizeof(known)/sizeof(known[0]),
 569                         .number_chars = "",
 570                         .ignored = (1 << TK_line_comment)
 571                                  | (1 << TK_block_comment)
 572                                  | (0 << TK_number)
 573                                  | (1 << TK_string)
 574                                  | (1 << TK_multi_string)
 575                                  | (1 << TK_in)
 576                                  | (1 << TK_out),
 577                 };
 578
 579                 struct token_state *state = token_open(code, &conf);
 580                 struct token tk;
 581                 struct symbol *head = NULL;
 582                 struct grammar *g;
 583                 char *err = NULL;
 584
 585                 g = calloc(1, sizeof(*g));
 586                 symbols_init(g);
 587
 588                 for (tk = token_next(state); tk.num != TK_eof;
 589                      tk = token_next(state)) {
 590                         if (tk.num == TK_newline)
 591                                 continue;
 592                         if (tk.num == TK_ident) {
 593                                 // new non-terminal
 594                                 head = sym_find(g, tk.txt);
 595                                 if (head->type == Nonterminal)
 596                                         err = "This non-terminal has already be used.";
 597                                 else if (head->type == Virtual)
 598                                         err = "Virtual symbol not permitted in head of production";
 599                                 else {
 600                                         head->type = Nonterminal;
 601                                         head->struct_name = g->current_type;
 602                                         head->isref = g->type_isref;
 603                                         if (g->production_count == 0) {
 604                                                 ## create production zero
 605                                         }
 606                                         head->first_production = g->production_count;
 607                                         tk = token_next(state);
 608                                         if (tk.num == TK_mark &&
 609                                             text_is(tk.txt, "->"))
 610                                                 err = parse_production(g, head, state);
 611                                         else
 612                                                 err = "'->' missing in production";
 613                                 }
 614                         } else if (tk.num == TK_mark
 615                                    && text_is(tk.txt, "|")) {
 616                                 // another production for same non-term
 617                                 if (head)
 618                                         err = parse_production(g, head, state);
 619                                 else
 620                                         err = "First production must have a head";
 621                         } else if (tk.num == TK_mark
 622                                    && text_is(tk.txt, "$")) {
 623                                 err = dollar_line(state, g, 0);
 624                         } else if (tk.num == TK_mark
 625                                    && text_is(tk.txt, "$*")) {
 626                                 err = dollar_line(state, g, 1);
 627                         } else {
 628                                 err = "Unrecognised token at start of line.";
 629                         }
 630                         if (err)
 631                                 goto abort;
 632                 }
 633                 token_close(state);
 634                 return g;
 635         abort:
 636                 fprintf(stderr, "Error at line %d: %s\n",
 637                         tk.line, err);
 638                 token_close(state);
 639                 free(g);
 640                 return NULL;
 641         }
 642
 643 ## Analysing the grammar
 644
 645 The central task in analysing the grammar is to determine a set of
 646 states to drive the parsing process.  This will require finding
 647 various sets of symbols and of "items".  Some of these sets will need
 648 to attach information to each element in the set, so they are more
 649 like maps.
 650
 651 Each "item" may have a 'look-ahead' or `LA` set associated with
 652 it.  Many of these will be the same as each other.  To avoid memory
 653 wastage, and to simplify some comparisons of sets, these sets will be
 654 stored in a list of unique sets, each assigned a number.
 655
 656 Once we have the data structures in hand to manage these sets and
 657 lists, we can start setting the 'nullable' flag, build the 'FIRST'
 658 sets, and then create the item sets which define the various states.
 659
 660 ### Symbol sets.
 661
 662 Though we don't only store symbols in these sets, they are the main
 663 things we store, so they are called symbol sets or "symsets".
 664
 665 A symset has a size, an array of shorts, and an optional array of data
 666 values, which are also shorts.  If the array of data is not present,
 667 we store an impossible pointer, as `NULL` is used to indicate that no
 668 memory has been allocated yet;
 669
 670 ###### declarations
 671         struct symset {
 672                 short cnt;
 673                 unsigned short *syms, *data;
 674         };
 675         #define NO_DATA ((unsigned short *)1)
 676         const struct symset INIT_SYMSET =  { 0, NULL, NO_DATA };
 677         const struct symset INIT_DATASET = { 0, NULL, NULL };
 678
 679 The arrays of shorts are allocated in blocks of 8 and are kept sorted
 680 by using an insertion sort.  We don't explicitly record the amount of
 681 allocated space as it can be derived directly from the current `cnt` using
 682 `((cnt - 1) | 7) + 1`.
 683
 684 ###### functions
 685         static void symset_add(struct symset *s, unsigned short key, unsigned short val)
 686         {
 687                 int i;
 688                 int current = ((s->cnt-1) | 7) + 1;
 689                 if (current == s->cnt) {
 690                         current += 8;
 691                         s->syms = realloc(s->syms, sizeof(*s->syms) * current);
 692                         if (s->data != NO_DATA)
 693                                 s->data = realloc(s->data, sizeof(*s->data) * current);
 694                 }
 695                 i = s->cnt;
 696                 while (i > 0 && s->syms[i-1] > key) {
 697                         s->syms[i] = s->syms[i-1];
 698                         if (s->data != NO_DATA)
 699                                 s->data[i] = s->data[i-1];
 700                         i--;
 701                 }
 702                 s->syms[i] = key;
 703                 if (s->data != NO_DATA)
 704                         s->data[i] = val;
 705                 s->cnt += 1;
 706         }
 707
 708 Finding a symbol (or item) in a `symset` uses a simple binary search.
 709 We return the index where the value was found (so data can be accessed),
 710 or `-1` to indicate failure.
 711
 712         static int symset_find(struct symset *ss, unsigned short key)
 713         {
 714                 int lo = 0;
 715                 int hi = ss->cnt;
 716
 717                 if (hi == 0)
 718                         return -1;
 719                 while (lo + 1 < hi) {
 720                         int mid = (lo + hi) / 2;
 721                         if (ss->syms[mid] <= key)
 722                                 lo = mid;
 723                         else
 724                                 hi = mid;
 725                 }
 726                 if (ss->syms[lo] == key)
 727                         return lo;
 728                 return -1;
 729         }
 730
 731 We will often want to form the union of two symsets.  When we do, we
 732 will often want to know if anything changed (as they might mean there
 733 is more work to do).  So `symset_union` reports whether anything was
 734 added to the first set.  We use a slow quadratic approach as these
 735 sets don't really get very big.  If profiles shows this to be a problem is
 736 can be optimised later.
 737
 738         static int symset_union(struct symset *a, struct symset *b)
 739         {
 740                 int i;
 741                 int added = 0;
 742                 for (i = 0; i < b->cnt; i++)
 743                         if (symset_find(a, b->syms[i]) < 0) {
 744                                 unsigned short data = 0;
 745                                 if (b->data != NO_DATA)
 746                                         data = b->data[i];
 747                                 symset_add(a, b->syms[i], data);
 748                                 added++;
 749                         }
 750                 return added;
 751         }
 752
 753 And of course we must be able to free a symset.
 754
 755         static void symset_free(struct symset ss)
 756         {
 757                 free(ss.syms);
 758                 if (ss.data != NO_DATA)
 759                         free(ss.data);
 760         }
 761
 762 ### Symset Storage
 763
 764 Some symsets are simply stored somewhere appropriate in the `grammar`
 765 data structure, others needs a bit of indirection.  This applies
 766 particularly to "LA" sets.  These will be paired with "items" in an
 767 "itemset".  As itemsets will be stored in a symset, the "LA" set needs to be
 768 stored in the `data` field, so we need a mapping from a small (short)
 769 number to an LA `symset`.
 770
 771 As mentioned earlier this involves creating a list of unique symsets.
 772
 773 For now, we just use a linear list sorted by insertion.  A skiplist
 774 would be more efficient and may be added later.
 775
 776 ###### declarations
 777
 778         struct setlist {
 779                 struct symset ss;
 780                 int num;
 781                 struct setlist *next;
 782         };
 783
 784 ###### grammar fields
 785         struct setlist *sets;
 786         int nextset;
 787
 788 ###### functions
 789
 790         static int ss_cmp(struct symset a, struct symset b)
 791         {
 792                 int i;
 793                 int diff = a.cnt - b.cnt;
 794                 if (diff)
 795                         return diff;
 796                 for (i = 0; i < a.cnt; i++) {
 797                         diff = (int)a.syms[i] - (int)b.syms[i];
 798                         if (diff)
 799                                 return diff;
 800                 }
 801                 return 0;
 802         }
 803
 804         static int save_set(struct grammar *g, struct symset ss)
 805         {
 806                 struct setlist **sl = &g->sets;
 807                 int cmp = 1;
 808                 struct setlist *s;
 809
 810                 while (*sl && (cmp = ss_cmp((*sl)->ss, ss)) < 0)
 811                         sl = & (*sl)->next;
 812                 if (cmp == 0) {
 813                         symset_free(ss);
 814                         return (*sl)->num;
 815                 }
 816
 817                 s = malloc(sizeof(*s));
 818                 s->ss = ss;
 819                 s->num = g->nextset;
 820                 g->nextset += 1;
 821                 s->next = *sl;
 822                 *sl = s;
 823                 return s->num;
 824         }
 825
 826 Finding a set by number is currently performed by a simple linear search.
 827 If this turns out to hurt performance, we can store the sets in a dynamic
 828 array like the productions.
 829
 830         static struct symset set_find(struct grammar *g, int num)
 831         {
 832                 struct setlist *sl = g->sets;
 833                 while (sl && sl->num != num)
 834                         sl = sl->next;
 835                 return sl->ss;
 836         }
 837
 838
 839 ### Setting `nullable`
 840
 841 We set `nullable` on the head symbol for any production for which all
 842 the body symbols (if any) are nullable.  As this is a recursive
 843 definition, any change in the `nullable` setting means that we need to
 844 re-evaluate where it needs to be set.
 845
 846 We simply loop around performing the same calculations until no more
 847 changes happen.
 848
 849 ###### symbol fields
 850         int nullable;
 851
 852 ###### functions
 853         static void set_nullable(struct grammar *g)
 854         {
 855                 int check_again = 1;
 856                 while (check_again) {
 857                         int p;
 858                         check_again = 0;
 859                         for (p = 0; p < g->production_count; p++) {
 860                                 struct production *pr = g->productions[p];
 861                                 int s;
 862
 863                                 if (pr->head->nullable)
 864                                         continue;
 865                                 for (s = 0; s < pr->body_size; s++)
 866                                         if (! pr->body[s]->nullable)
 867                                                 break;
 868                                 if (s == pr->body_size) {
 869                                         pr->head->nullable = 1;
 870                                         check_again = 1;
 871                                 }
 872                         }
 873                 }
 874         }
 875
 876 ### Setting `can_eol`
 877
 878 In order to be able to ignore newline tokens when not relevant, but
 879 still include them in the parse when needed, we will need to know
 880 which states can start a "line-like" section of code.  We ignore
 881 newlines when there is an indent since the most recent start of a
 882 line-like section.
 883
 884 To know what is line-like, we first need to know which symbols can end
 885 a line-like section, which is precisely those which can end with a
 886 newline token.  These symbols don't necessarily alway end with a
 887 newline, but they can.  Hence they are not described as "lines" but
 888 only "line-like".
 889
 890 Clearly the `TK_newline` token can end with a newline.  Any symbol
 891 which is the head of a production that contains a line-ending symbol
 892 followed only by nullable symbols is also a line-ending symbol.  We
 893 use a new field `can_eol` to record this attribute of symbols, and
 894 compute it in a repetitive manner similar to `set_nullable`.
 895
 896 ###### symbol fields
 897         int can_eol;
 898
 899 ###### functions
 900         static void set_can_eol(struct grammar *g)
 901         {
 902                 int check_again = 1;
 903                 g->symtab[TK_newline]->can_eol = 1;
 904                 while (check_again) {
 905                         int p;
 906                         check_again = 0;
 907                         for (p = 0; p < g->production_count; p++) {
 908                                 struct production *pr = g->productions[p];
 909                                 int s;
 910
 911                                 if (pr->head->can_eol)
 912                                         continue;
 913
 914                                 for (s = pr->body_size - 1; s >= 0; s--) {
 915                                         if (pr->body[s]->can_eol) {
 916                                                 pr->head->can_eol = 1;
 917                                                 check_again = 1;
 918                                                 break;
 919                                         }
 920                                         if (!pr->body[s]->nullable)
 921                                                 break;
 922                                 }
 923                         }
 924                 }
 925         }
 926
 927 ### Building the `first` sets
 928
 929 When calculating what can follow a particular non-terminal, we will need to
 930 know what the "first" terminal in any subsequent non-terminal might be.  So
 931 we calculate the `first` set for every non-terminal and store them in an
 932 array.  We don't bother recording the "first" set for terminals as they are
 933 trivial.
 934
 935 As the "first" for one symbol might depend on the "first" of another,
 936 we repeat the calculations until no changes happen, just like with
 937 `set_nullable`.  This makes use of the fact that `symset_union`
 938 reports if any change happens.
 939
 940 The core of this which finds the "first" of part of a production body
 941 will be reused for computing the "follow" sets, so we split it out
 942 into a separate function.
 943
 944 ###### grammar fields
 945         struct symset *first;
 946
 947 ###### functions
 948
 949         static int add_first(struct production *pr, int start,
 950                              struct symset *target, struct grammar *g,
 951                              int *to_end)
 952         {
 953                 int s;
 954                 int changed = 0;
 955                 for (s = start; s < pr->body_size; s++) {
 956                         struct symbol *bs = pr->body[s];
 957                         if (bs->type == Terminal) {
 958                                 if (symset_find(target, bs->num) < 0) {
 959                                         symset_add(target, bs->num, 0);
 960                                         changed = 1;
 961                                 }
 962                                 break;
 963                         } else if (symset_union(target, &g->first[bs->num]))
 964                                 changed = 1;
 965                         if (!bs->nullable)
 966                                 break;
 967                 }
 968                 if (to_end)
 969                         *to_end = (s == pr->body_size);
 970                 return changed;
 971         }
 972
 973         static void build_first(struct grammar *g)
 974         {
 975                 int check_again = 1;
 976                 int s;
 977                 g->first = calloc(g->num_syms, sizeof(g->first[0]));
 978                 for (s = 0; s < g->num_syms; s++)
 979                         g->first[s] = INIT_SYMSET;
 980
 981                 while (check_again) {
 982                         int p;
 983                         check_again = 0;
 984                         for (p = 0; p < g->production_count; p++) {
 985                                 struct production *pr = g->productions[p];
 986                                 struct symset *head = &g->first[pr->head->num];
 987
 988                                 if (add_first(pr, 0, head, g, NULL))
 989                                         check_again = 1;
 990                         }
 991                 }
 992         }
 993
 994 ### Building the `follow` sets.
 995
 996 There are two different situations when we will want to generate "follow"
 997 sets.  If we are doing an SLR analysis, we want to generate a single
 998 "follow" set for each non-terminal in the grammar.  That is what is
 999 happening here.  If we are doing an LALR or LR analysis we will want
1000 to generate a separate "LA" set for each item.  We do that later
1001 in state generation.
1002
1003 There are two parts to generating a "follow" set.  Firstly we look at
1004 every place that any non-terminal appears in the body of any
1005 production, and we find the set of possible "first" symbols after
1006 there.  This is done using `add_first` above and only needs to be done
1007 once as the "first" sets are now stable and will not change.
1008
1009 ###### follow code
1010
1011         for (p = 0; p < g->production_count; p++) {
1012                 struct production *pr = g->productions[p];
1013                 int b;
1014
1015                 for (b = 0; b < pr->body_size - 1; b++) {
1016                         struct symbol *bs = pr->body[b];
1017                         if (bs->type == Terminal)
1018                                 continue;
1019                         add_first(pr, b+1, &g->follow[bs->num], g, NULL);
1020                 }
1021         }
1022
1023 The second part is to add the "follow" set of the head of a production
1024 to the "follow" sets of the final symbol in the production, and any
1025 other symbol which is followed only by `nullable` symbols.  As this
1026 depends on "follow" itself we need to repeatedly perform the process
1027 until no further changes happen.
1028
1029 ###### follow code
1030
1031         for (again = 0, p = 0;
1032              p < g->production_count;
1033              p < g->production_count-1
1034                 ? p++ : again ? (again = 0, p = 0)
1035                               : p++) {
1036                 struct production *pr = g->productions[p];
1037                 int b;
1038
1039                 for (b = pr->body_size - 1; b >= 0; b--) {
1040                         struct symbol *bs = pr->body[b];
1041                         if (bs->type == Terminal)
1042                                 break;
1043                         if (symset_union(&g->follow[bs->num],
1044                                          &g->follow[pr->head->num]))
1045                                 again = 1;
1046                         if (!bs->nullable)
1047                                 break;
1048                 }
1049         }
1050
1051 We now just need to create and initialise the `follow` list to get a
1052 complete function.
1053
1054 ###### grammar fields
1055         struct symset *follow;
1056
1057 ###### functions
1058         static void build_follow(struct grammar *g)
1059         {
1060                 int s, again, p;
1061                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
1062                 for (s = 0; s < g->num_syms; s++)
1063                         g->follow[s] = INIT_SYMSET;
1064                 ## follow code
1065         }
1066
1067 ### Building itemsets and states
1068
1069 There are three different levels of detail that can be chosen for
1070 building the itemsets and states for the LR grammar.  They are:
1071
1072 1. LR(0) or SLR(1), where no look-ahead is considered.
1073 2. LALR(1) where we build look-ahead sets with each item and merge
1074    the LA sets when we find two paths to the same "kernel" set of items.
1075 3. LR(1) where different look-ahead for any item in the set means
1076    a different state must be created.
1077
1078 ###### forward declarations
1079         enum grammar_type { LR0, LR05, SLR, LALR, LR1 };
1080
1081 We need to be able to look through existing states to see if a newly
1082 generated state already exists.  For now we use a simple sorted linked
1083 list.
1084
1085 An item is a pair of numbers: the production number and the position of
1086 "DOT", which is an index into the body of the production.
1087 As the numbers are not enormous we can combine them into a single "short"
1088 and store them in a `symset` - 4 bits for "DOT" and 12 bits for the
1089 production number (so 4000 productions with maximum of 15 symbols in the
1090 body).
1091
1092 Comparing the itemsets will be a little different to comparing symsets
1093 as we want to do the lookup after generating the "kernel" of an
1094 itemset, so we need to ignore the offset=zero items which are added during
1095 completion.
1096
1097 To facilitate this, we modify the "DOT" number so that "0" sorts to
1098 the end of the list in the symset, and then only compare items before
1099 the first "0".
1100
1101 ###### declarations
1102         static inline unsigned short item_num(int production, int index)
1103         {
1104                 return production | ((31-index) << 11);
1105         }
1106         static inline int item_prod(unsigned short item)
1107         {
1108                 return item & 0x7ff;
1109         }
1110         static inline int item_index(unsigned short item)
1111         {
1112                 return (31-(item >> 11)) & 0x1f;
1113         }
1114
1115 For LR(1) analysis we need to compare not just the itemset in a state
1116 but also the LA sets.  As we assign each unique LA set a number, we
1117 can just compare the symset and the data values together.
1118
1119 ###### functions
1120         static int itemset_cmp(struct symset a, struct symset b,
1121                                enum grammar_type type)
1122         {
1123                 int i;
1124                 int av, bv;
1125
1126                 for (i = 0;
1127                      i < a.cnt && i < b.cnt &&
1128                      item_index(a.syms[i]) > 0 &&
1129                      item_index(b.syms[i]) > 0;
1130                      i++) {
1131                         int diff = a.syms[i] - b.syms[i];
1132                         if (diff)
1133                                 return diff;
1134                         if (type == LR1) {
1135                                 diff = a.data[i] - b.data[i];
1136                                 if (diff)
1137                                         return diff;
1138                         }
1139                 }
1140                 if (i == a.cnt || item_index(a.syms[i]) == 0)
1141                         av = -1;
1142                 else
1143                         av = a.syms[i];
1144                 if (i == b.cnt || item_index(b.syms[i]) == 0)
1145                         bv = -1;
1146                 else
1147                         bv = b.syms[i];
1148                 if (av - bv)
1149                         return av - bv;
1150                 if (type < LR1 || av == -1)
1151                         return 0;
1152                 return
1153                         a.data[i] - b.data[i];
1154         }
1155
1156 And now we can build the list of itemsets.  The lookup routine returns
1157 both a success flag and a pointer to where in the list an insert
1158 should happen, so we don't need to search a second time.
1159
1160 ###### declarations
1161         struct itemset {
1162                 struct itemset *next;
1163                 short state;
1164                 struct symset items;
1165                 struct symset go_to;
1166                 char completed;
1167                 char starts_line;
1168         };
1169
1170 ###### grammar fields
1171         struct itemset *items;
1172         int states;
1173
1174 ###### functions
1175         static int itemset_find(struct grammar *g, struct itemset ***where,
1176                                 struct symset kernel, enum grammar_type type)
1177         {
1178                 struct itemset **ip;
1179
1180                 for (ip = &g->items; *ip ; ip = & (*ip)->next) {
1181                         struct itemset *i = *ip;
1182                         int diff;
1183                         diff = itemset_cmp(i->items, kernel, type);
1184                         if (diff < 0)
1185                                 continue;
1186                         if (diff > 0)
1187                                 break;
1188                         /* found */
1189                         *where = ip;
1190                         return 1;
1191                 }
1192                 *where = ip;
1193                 return 0;
1194         }
1195
1196 Adding an itemset may require merging the LA sets if LALR analysis is
1197 happening. If any new LA set add symbol that weren't in the old LA set, we
1198 clear the `completed` flag so that the dependants of this itemset will be
1199 recalculated and their LA sets updated.
1200
1201 `add_itemset` must consume the symsets it is passed, either by adding
1202 them to a data structure, of freeing them.
1203
1204         static int add_itemset(struct grammar *g, struct symset ss,
1205                                enum grammar_type type, int starts_line)
1206         {
1207                 struct itemset **where, *is;
1208                 int i;
1209                 int found = itemset_find(g, &where, ss, type);
1210                 if (!found) {
1211                         is = calloc(1, sizeof(*is));
1212                         is->state = g->states;
1213                         g->states += 1;
1214                         is->items = ss;
1215                         is->next = *where;
1216                         is->go_to = INIT_DATASET;
1217                         is->starts_line = starts_line;
1218                         *where = is;
1219                         return is->state;
1220                 }
1221                 is = *where;
1222                 if (type != LALR) {
1223                         symset_free(ss);
1224                         return is->state;
1225                 }
1226                 for (i = 0; i < ss.cnt; i++) {
1227                         struct symset temp = INIT_SYMSET, s;
1228                         if (ss.data[i] == is->items.data[i])
1229                                 continue;
1230                         s = set_find(g, is->items.data[i]);
1231                         symset_union(&temp, &s);
1232                         s = set_find(g, ss.data[i]);
1233                         if (symset_union(&temp, &s)) {
1234                                 is->items.data[i] = save_set(g, temp);
1235                                 is->completed = 0;
1236                         } else
1237                                 symset_free(temp);
1238                 }
1239                 symset_free(ss);
1240                 return is->state;
1241         }
1242
1243 #### The build
1244
1245 To build all the itemsets, we first insert the initial itemset made from the
1246 start symbol, complete each itemset, and then generate new itemsets from old
1247 until no new ones can be made.
1248
1249 Completing an itemset means finding all the items where "DOT" is followed by
1250 a nonterminal and adding "DOT=0" items for every production from that
1251 non-terminal - providing each item hasn't already been added.
1252
1253 If LA sets are needed, the LA set for each new item is found using
1254 `add_first` which was developed earlier for `FIRST` and `FOLLOW`.  This may
1255 be supplemented by the LA set for the item which produce the new item.
1256
1257 We also collect a set of all symbols which follow "DOT" (in `done`) as this
1258 is used in the next stage.
1259
1260 NOTE: precedence handling should happen here - I haven't written this yet
1261 though.
1262
1263 ###### complete itemset
1264         for (i = 0; i < is->items.cnt; i++) {
1265                 int p = item_prod(is->items.syms[i]);
1266                 int bs = item_index(is->items.syms[i]);
1267                 struct production *pr = g->productions[p];
1268                 int p2;
1269                 struct symbol *s;
1270                 struct symset LA = INIT_SYMSET;
1271                 unsigned short sn = 0;
1272
1273                 if (bs == pr->body_size)
1274                         continue;
1275                 s = pr->body[bs];
1276                 if (symset_find(&done, s->num) < 0)
1277                         symset_add(&done, s->num, 0);
1278                 if (s->type != Nonterminal)
1279                         continue;
1280                 again = 1;
1281                 if (type >= LALR) {
1282                         // Need the LA set.
1283                         int to_end;
1284                         add_first(pr, bs+1, &LA, g, &to_end);
1285                         if (to_end) {
1286                                 struct symset ss = set_find(g, is->items.data[i]);
1287                                 symset_union(&LA, &ss);
1288                         }
1289                         sn = save_set(g, LA);
1290                         LA = set_find(g, sn);
1291                 }
1292
1293                 /* Add productions for this symbol */
1294                 for (p2 = s->first_production;
1295                      p2 < g->production_count &&
1296                       g->productions[p2]->head == s;
1297                      p2++) {
1298                         int itm = item_num(p2, 0);
1299                         int pos = symset_find(&is->items, itm);
1300                         if (pos < 0) {
1301                                 symset_add(&is->items, itm, sn);
1302                                 /* Will have re-ordered, so start
1303                                  * from beginning again */
1304                                 i = -1;
1305                         } else if (type >= LALR) {
1306                                 struct symset ss = set_find(g, is->items.data[pos]);
1307                                 struct symset tmp = INIT_SYMSET;
1308
1309                                 symset_union(&tmp, &ss);
1310                                 if (symset_union(&tmp, &LA)) {
1311                                         is->items.data[pos] = save_set(g, tmp);
1312                                         i = -1;
1313                                 }else
1314                                         symset_free(tmp);
1315                         }
1316                 }
1317         }
1318
1319 For each symbol we found (and placed in `done`) we collect all the items for
1320 which this symbol is next, and create a new itemset with "DOT" advanced over
1321 the symbol.  This is then added to the collection of itemsets (or merged
1322 with a pre-existing itemset).
1323
1324 ###### derive itemsets
1325         // Now we have a completed itemset, so we need to
1326         // compute all the 'next' itemsets and create them
1327         // if they don't exist.
1328         for (i = 0; i < done.cnt; i++) {
1329                 int j;
1330                 unsigned short state;
1331                 int starts_line = 0;
1332                 struct symbol *sym = g->symtab[done.syms[i]];
1333                 struct symset newitemset = INIT_SYMSET;
1334                 if (type >= LALR)
1335                         newitemset = INIT_DATASET;
1336
1337                 if (sym->can_eol ||
1338                     (sym->nullable && is->starts_line))
1339                         starts_line = 1;
1340                 for (j = 0; j < is->items.cnt; j++) {
1341                         int itm = is->items.syms[j];
1342                         int p = item_prod(itm);
1343                         int bp = item_index(itm);
1344                         struct production *pr = g->productions[p];
1345                         unsigned short la = 0;
1346                         int pos;
1347
1348                         if (bp == pr->body_size)
1349                                 continue;
1350                         if (pr->body[bp] != sym)
1351                                 continue;
1352                         if (type >= LALR)
1353                                 la = is->items.data[j];
1354                         pos = symset_find(&newitemset, pr->head->num);
1355                         if (pos < 0)
1356                                 symset_add(&newitemset, item_num(p, bp+1), la);
1357                         else if (type >= LALR) {
1358                                 // Need to merge la set.
1359                                 int la2 = newitemset.data[pos];
1360                                 if (la != la2) {
1361                                         struct symset ss = set_find(g, la2);
1362                                         struct symset LA = INIT_SYMSET;
1363                                         symset_union(&LA, &ss);
1364                                         ss = set_find(g, la);
1365                                         if (symset_union(&LA, &ss))
1366                                                 newitemset.data[pos] = save_set(g, LA);
1367                                         else
1368                                                 symset_free(LA);
1369                                 }
1370                         }
1371                 }
1372                 state = add_itemset(g, newitemset, type, starts_line);
1373                 if (symset_find(&is->go_to, done.syms[i]) < 0)
1374                         symset_add(&is->go_to, done.syms[i], state);
1375         }
1376
1377 All that is left is to crate the initial itemset from production zero, and
1378 with `TK_eof` as the LA set.
1379
1380 ###### functions
1381         static void build_itemsets(struct grammar *g, enum grammar_type type)
1382         {
1383                 struct symset first = INIT_SYMSET;
1384                 struct itemset *is;
1385                 int again;
1386                 unsigned short la = 0;
1387                 if (type >= LALR) {
1388                         // LA set just has eof
1389                         struct symset eof = INIT_SYMSET;
1390                         symset_add(&eof, TK_eof, 0);
1391                         la = save_set(g, eof);
1392                         first = INIT_DATASET;
1393                 }
1394                 // production 0, offset 0 (with no data)
1395                 symset_add(&first, item_num(0, 0), la);
1396                 add_itemset(g, first, type, g->productions[0]->body[0]->can_eol);
1397                 for (again = 0, is = g->items;
1398                      is;
1399                      is = is->next ?: again ? (again = 0, g->items) : NULL) {
1400                         int i;
1401                         struct symset done = INIT_SYMSET;
1402                         if (is->completed)
1403                                 continue;
1404                         is->completed = 1;
1405                         again = 1;
1406                         ## complete itemset
1407                         ## derive itemsets
1408                         symset_free(done);
1409                 }
1410         }
1411
1412 ### Completing the analysis.
1413
1414 The exact process of analysis depends on which level was requested.  For
1415 `LR0` and `LR05` we don't need first and follow sets at all.  For `LALR` and
1416 `LR1` we need first, but not follow.  For `SLR` we need both.
1417
1418 We don't build the "action" tables that you might expect as the parser
1419 doesn't use them.  They will be calculated to some extent if needed for
1420 a report.
1421
1422 Once we have built everything we allocate arrays for the two lists:
1423 symbols and itemsets.  This allows more efficient access during reporting.
1424 The symbols are grouped as terminals and non-terminals and we record the
1425 changeover point in `first_nonterm`.
1426
1427 ###### grammar fields
1428         struct symbol **symtab;
1429         struct itemset **statetab;
1430         int first_nonterm;
1431
1432 ###### grammar_analyse
1433
1434         static void grammar_analyse(struct grammar *g, enum grammar_type type)
1435         {
1436                 struct symbol *s;
1437                 struct itemset *is;
1438                 int snum = TK_reserved;
1439                 for (s = g->syms; s; s = s->next)
1440                         if (s->num < 0 && s->type == Terminal) {
1441                                 s->num = snum;
1442                                 snum++;
1443                         }
1444                 g->first_nonterm = snum;
1445                 for (s = g->syms; s; s = s->next)
1446                         if (s->num < 0) {
1447                                 s->num = snum;
1448                                 snum++;
1449                         }
1450                 g->num_syms = snum;
1451                 g->symtab = calloc(g->num_syms, sizeof(g->symtab[0]));
1452                 for (s = g->syms; s; s = s->next)
1453                         g->symtab[s->num] = s;
1454
1455                 set_nullable(g);
1456                 set_can_eol(g);
1457                 if (type >= SLR)
1458                         build_first(g);
1459
1460                 if (type == SLR)
1461                         build_follow(g);
1462
1463                 build_itemsets(g, type);
1464
1465                 g->statetab = calloc(g->states, sizeof(g->statetab[0]));
1466                 for (is = g->items; is ; is = is->next)
1467                         g->statetab[is->state] = is;
1468         }
1469
1470 ## Reporting on the Grammar
1471
1472 The purpose of the report is to give the grammar developer insight into
1473 how the grammar parser will work.  It is basically a structured dump of
1474 all the tables that have been generated, plus an description of any conflicts.
1475
1476 ###### grammar_report
1477         static int grammar_report(struct grammar *g, enum grammar_type type)
1478         {
1479                 report_symbols(g);
1480                 if (g->follow)
1481                         report_follow(g);
1482                 report_itemsets(g);
1483                 return report_conflicts(g, type);
1484         }
1485
1486 Firstly we have the complete list of symbols, together with the "FIRST"
1487 set if that was generated.
1488
1489 ###### functions
1490
1491         static void report_symbols(struct grammar *g)
1492         {
1493                 int n;
1494                 if (g->first)
1495                         printf("SYMBOLS + FIRST:\n");
1496                 else
1497                         printf("SYMBOLS:\n");
1498
1499                 for (n = 0; n < g->num_syms; n++) {
1500                         struct symbol *s = g->symtab[n];
1501                         if (!s)
1502                                 continue;
1503
1504                         printf(" %c%c%3d%c: ",
1505                                s->nullable ? '.':' ',
1506                                s->can_eol ? '>':' ',
1507                                s->num, symtypes[s->type]);
1508                         prtxt(s->name);
1509                         if (s->precedence)
1510                                 printf(" (%d%s)", s->precedence,
1511                                        assoc_names[s->assoc]);
1512
1513                         if (g->first && s->type == Nonterminal) {
1514                                 int j;
1515                                 char c = ':';
1516                                 for (j = 0; j < g->first[n].cnt; j++) {
1517                                         printf("%c ", c);
1518                                         c = ',';
1519                                         prtxt(g->symtab[g->first[n].syms[j]]->name);
1520                                 }
1521                         }
1522                         printf("\n");
1523                 }
1524         }
1525
1526 Then we have to follow sets if they were computed.
1527
1528         static void report_follow(struct grammar *g)
1529         {
1530                 int n;
1531                 printf("FOLLOW:\n");
1532                 for (n = 0; n < g->num_syms; n++)
1533                         if (g->follow[n].cnt) {
1534                                 int j;
1535                                 char c = ':';
1536                                 printf("  ");
1537                                 prtxt(g->symtab[n]->name);
1538                                 for (j = 0; j < g->follow[n].cnt; j++) {
1539                                         printf("%c ", c);
1540                                         c = ',';
1541                                         prtxt(g->symtab[g->follow[n].syms[j]]->name);
1542                                 }
1543                                 printf("\n");
1544                         }
1545         }
1546
1547 And finally the item sets.  These include the GO TO tables and, for
1548 LALR and LR1, the LA set for each item.  Lots of stuff, so we break
1549 it up a bit.  First the items, with production number and associativity.
1550
1551         static void report_item(struct grammar *g, int itm)
1552         {
1553                 int p = item_prod(itm);
1554                 int dot = item_index(itm);
1555                 struct production *pr = g->productions[p];
1556                 int i;
1557
1558                 printf("    ");
1559                 prtxt(pr->head->name);
1560                 printf(" ->");
1561                 for (i = 0; i < pr->body_size; i++) {
1562                         printf(" %s", (dot == i ? ". ": ""));
1563                         prtxt(pr->body[i]->name);
1564                 }
1565                 if (dot == pr->body_size)
1566                         printf(" .");
1567                 printf(" [%d]", p);
1568                 if (pr->precedence)
1569                         printf(" (%d%s)", pr->precedence,
1570                                assoc_names[pr->assoc]);
1571                 printf("\n");
1572         }
1573
1574 The LA sets which are (possibly) reported with each item:
1575
1576         static void report_la(struct grammar *g, int lanum)
1577         {
1578                 struct symset la = set_find(g, lanum);
1579                 int i;
1580                 char c = ':';
1581
1582                 printf("        LOOK AHEAD(%d)", lanum);
1583                 for (i = 0; i < la.cnt; i++) {
1584                         printf("%c ", c);
1585                         c = ',';
1586                         prtxt(g->symtab[la.syms[i]]->name);
1587                 }
1588                 printf("\n");
1589         }
1590
1591 Then the go to sets:
1592
1593
1594         static void report_goto(struct grammar *g, struct symset gt)
1595         {
1596                 int i;
1597                 printf("    GOTO:\n");
1598
1599                 for (i = 0; i < gt.cnt; i++) {
1600                         printf("      ");
1601                         prtxt(g->symtab[gt.syms[i]]->name);
1602                         printf(" -> %d\n", gt.data[i]);
1603                 }
1604         }
1605
1606 Now we can report all the item sets complete with items, LA sets, and GO TO.
1607
1608         static void report_itemsets(struct grammar *g)
1609         {
1610                 int s;
1611                 printf("ITEM SETS(%d)\n", g->states);
1612                 for (s = 0; s < g->states; s++) {
1613                         int j;
1614                         struct itemset *is = g->statetab[s];
1615                         printf("  Itemset %d:%s\n", s, is->starts_line?" (startsline)":"");
1616                         for (j = 0; j < is->items.cnt; j++) {
1617                                 report_item(g, is->items.syms[j]);
1618                                 if (is->items.data != NO_DATA)
1619                                         report_la(g, is->items.data[j]);
1620                         }
1621                         report_goto(g, is->go_to);
1622                 }
1623         }
1624
1625 ### Reporting conflicts
1626
1627 Conflict detection varies a lot among different analysis levels.  However
1628 LR0 and LR0.5 are quite similar - having no follow sets, and SLR, LALR and
1629 LR1 are also similar as they have FOLLOW or LA sets.
1630
1631 ###### functions
1632
1633         ## conflict functions
1634
1635         static int report_conflicts(struct grammar *g, enum grammar_type type)
1636         {
1637                 int cnt = 0;
1638                 printf("Conflicts:\n");
1639                 if (type < SLR)
1640                         cnt = conflicts_lr0(g, type);
1641                 else
1642                         cnt = conflicts_slr(g, type);
1643                 if (cnt == 0)
1644                         printf(" - no conflicts\n");
1645                 return cnt;
1646         }
1647
1648 LR0 conflicts are any state which have both a reducible item and
1649 a shiftable item.
1650
1651 LR05 conflicts only occurs if two possibly reductions exist,
1652 as shifts always over-ride reductions.
1653
1654 ###### conflict functions
1655         static int conflicts_lr0(struct grammar *g, enum grammar_type type)
1656         {
1657                 int i;
1658                 int cnt = 0;
1659                 for (i = 0; i < g->states; i++) {
1660                         struct itemset *is = g->statetab[i];
1661                         int last_reduce = -1;
1662                         int prev_reduce = -1;
1663                         int last_shift = -1;
1664                         int j;
1665                         if (!is)
1666                                 continue;
1667                         for (j = 0; j < is->items.cnt; j++) {
1668                                 int itm = is->items.syms[j];
1669                                 int p = item_prod(itm);
1670                                 int bp = item_index(itm);
1671                                 struct production *pr = g->productions[p];
1672
1673                                 if (bp == pr->body_size) {
1674                                         prev_reduce = last_reduce;
1675                                         last_reduce = j;
1676                                         continue;
1677                                 }
1678                                 if (pr->body[bp]->type == Terminal)
1679                                         last_shift = j;
1680                         }
1681                         if (type == LR0 && last_reduce >= 0 && last_shift >= 0) {
1682                                 printf("  State %d has both SHIFT and REDUCE:\n", i);
1683                                 report_item(g, is->items.syms[last_shift]);
1684                                 report_item(g, is->items.syms[last_reduce]);
1685                                 cnt++;
1686                         }
1687                         if (prev_reduce >= 0) {
1688                                 printf("  State %d has 2 (or more) reducible items\n", i);
1689                                 report_item(g, is->items.syms[prev_reduce]);
1690                                 report_item(g, is->items.syms[last_reduce]);
1691                                 cnt++;
1692                         }
1693                 }
1694                 return cnt;
1695         }
1696
1697 SLR, LALR, and LR1 conflicts happen if two reducible items have over-lapping
1698 look ahead, or if a symbol in a look-ahead can be shifted.  The differ only
1699 in the source of the look ahead set.
1700
1701 We build a dataset mapping terminal to item for possible SHIFTs and then
1702 another for possible REDUCE operations. We report when we get conflicts
1703 between the two.
1704
1705         static int conflicts_slr(struct grammar *g, enum grammar_type type)
1706         {
1707                 int i;
1708                 int cnt = 0;
1709
1710                 for (i = 0; i < g->states; i++) {
1711                         struct itemset *is = g->statetab[i];
1712                         struct symset shifts = INIT_DATASET;
1713                         struct symset reduce = INIT_DATASET;
1714                         int j;
1715                         if (!is)
1716                                 continue;
1717                         /* First collect the shifts */
1718                         for (j = 0; j < is->items.cnt; j++) {
1719                                 unsigned short itm = is->items.syms[j];
1720                                 int p = item_prod(itm);
1721                                 int bp = item_index(itm);
1722                                 struct production *pr = g->productions[p];
1723
1724                                 if (bp < pr->body_size &&
1725                                     pr->body[bp]->type == Terminal) {
1726                                         /* shiftable */
1727                                         int sym = pr->body[bp]->num;
1728                                         if (symset_find(&shifts, sym) < 0)
1729                                                 symset_add(&shifts, sym, itm);
1730                                 }
1731                         }
1732                         /* Now look for reduction and conflicts */
1733                         for (j = 0; j < is->items.cnt; j++) {
1734                                 unsigned short itm = is->items.syms[j];
1735                                 int p = item_prod(itm);
1736                                 int bp = item_index(itm);
1737                                 struct production *pr = g->productions[p];
1738
1739                                 if (bp < pr->body_size)
1740                                         continue;
1741                                 /* reducible */
1742                                 struct symset la;
1743                                 if (type == SLR)
1744                                         la = g->follow[pr->head->num];
1745                                 else
1746                                         la = set_find(g, is->items.data[j]);
1747                                 int k;
1748                                 for (k = 0; k < la.cnt; k++) {
1749                                         int pos = symset_find(&shifts, la.syms[k]);
1750                                         if (pos >= 0) {
1751                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
1752                                                 prtxt(g->symtab[la.syms[k]]->name);
1753                                                 printf(":\n");
1754                                                 report_item(g, shifts.data[pos]);
1755                                                 report_item(g, itm);
1756                                                 cnt++;
1757                                         }
1758                                         pos = symset_find(&reduce, la.syms[k]);
1759                                         if (pos < 0) {
1760                                                 symset_add(&reduce, la.syms[k], itm);
1761                                                 continue;
1762                                         }
1763                                         printf("  State %d has REDUCE/REDUCE conflict on ", i);
1764                                         prtxt(g->symtab[la.syms[k]]->name);
1765                                         printf(":\n");
1766                                         report_item(g, itm);
1767                                         report_item(g, reduce.data[pos]);
1768                                         cnt++;
1769                                 }
1770                         }
1771                         symset_free(shifts);
1772                         symset_free(reduce);
1773                 }
1774                 return cnt;
1775         }
1776
1777
1778 ## Generating the parser
1779
1780 The export part of the parser is the `parse_XX` function, where the name
1781 `XX` is based on the name of the parser files.
1782
1783 This takes a `code_node`, a partially initialized `token_config`, and an
1784 optional `FILE` to send tracing to.  The `token_config` gets the list of
1785 known words added and then is used with the `code_node` to initialize the
1786 scanner.
1787
1788 `parse_XX` then call the library function `parser_run` to actually complete
1789 the parse.  This needs the `states` table and function to call the various
1790 pieces of code provided in the grammar file, so they are generated first.
1791
1792 ###### parser_generate
1793
1794         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
1795         {
1796                 gen_known(f, g);
1797                 gen_non_term(f, g);
1798                 gen_goto(f, g);
1799                 gen_states(f, g);
1800                 gen_reduce(f, g, file);
1801                 gen_free(f, g);
1802
1803                 fprintf(f, "#line 0 \"gen_parser\"\n");
1804                 fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace)\n",
1805                         name);
1806                 fprintf(f, "{\n");
1807                 fprintf(f, "\tstruct token_state *tokens;\n");
1808                 fprintf(f, "\tconfig->words_marks = known;\n");
1809                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
1810                 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
1811                 fprintf(f, "\ttokens = token_open(code, config);\n");
1812                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
1813                 fprintf(f, "\ttoken_close(tokens);\n");
1814                 fprintf(f, "\treturn rv;\n");
1815                 fprintf(f, "}\n\n");
1816         }
1817
1818 ### Table words table
1819
1820 The know words is simply an array of terminal symbols.
1821 The table of nonterminals used for tracing is a similar array.
1822
1823 ###### functions
1824
1825         static void gen_known(FILE *f, struct grammar *g)
1826         {
1827                 int i;
1828                 fprintf(f, "#line 0 \"gen_known\"\n");
1829                 fprintf(f, "static const char *known[] = {\n");
1830                 for (i = TK_reserved;
1831                      i < g->num_syms && g->symtab[i]->type == Terminal;
1832                      i++)
1833                         fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1834                                 g->symtab[i]->name.txt);
1835                 fprintf(f, "};\n\n");
1836         }
1837
1838         static void gen_non_term(FILE *f, struct grammar *g)
1839         {
1840                 int i;
1841                 fprintf(f, "#line 0 \"gen_non_term\"\n");
1842                 fprintf(f, "static const char *non_term[] = {\n");
1843                 for (i = TK_reserved;
1844                      i < g->num_syms;
1845                      i++)
1846                         if (g->symtab[i]->type == Nonterminal)
1847                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
1848                                         g->symtab[i]->name.txt);
1849                 fprintf(f, "};\n\n");
1850         }
1851
1852 ### States and the goto tables.
1853
1854 For each state we record the goto table, the reducible production if
1855 there is one, or a symbol to shift for error recovery.
1856 Some of the details of the reducible production are stored in the
1857 `do_reduce` function to come later.  Here we store the production number,
1858 the body size (useful for stack management) and the resulting symbol (useful
1859 for knowing how to free data later).
1860
1861 The go to table is stored in a simple array of `sym` and corresponding
1862 `state`.
1863
1864 ###### exported types
1865
1866         struct lookup {
1867                 short sym;
1868                 short state;
1869         };
1870         struct state {
1871                 short go_to_cnt;
1872                 const struct lookup * go_to;
1873                 short reduce_prod;
1874                 short reduce_size;
1875                 short reduce_sym;
1876                 short shift_sym;
1877                 short starts_line;
1878         };
1879
1880
1881 ###### functions
1882
1883         static void gen_goto(FILE *f, struct grammar *g)
1884         {
1885                 int i;
1886                 fprintf(f, "#line 0 \"gen_goto\"\n");
1887                 for (i = 0; i < g->states; i++) {
1888                         int j;
1889                         fprintf(f, "static const struct lookup goto_%d[] = {\n",
1890                                 i);
1891                         struct symset gt = g->statetab[i]->go_to;
1892                         for (j = 0; j < gt.cnt; j++)
1893                                 fprintf(f, "\t{ %d, %d },\n",
1894                                         gt.syms[j], gt.data[j]);
1895                         fprintf(f, "};\n");
1896                 }
1897         }
1898
1899 ###### functions
1900
1901         static void gen_states(FILE *f, struct grammar *g)
1902         {
1903                 int i;
1904                 fprintf(f, "#line 0 \"gen_states\"\n");
1905                 fprintf(f, "static const struct state states[] = {\n");
1906                 for (i = 0; i < g->states; i++) {
1907                         struct itemset *is = g->statetab[i];
1908                         int j, prod = -1, prod_len;
1909                         int shift_sym = -1;
1910                         int shift_len = 0, shift_remain = 0;
1911                         for (j = 0; j < is->items.cnt; j++) {
1912                                 int itm = is->items.syms[j];
1913                                 int p = item_prod(itm);
1914                                 int bp = item_index(itm);
1915                                 struct production *pr = g->productions[p];
1916
1917                                 if (bp < pr->body_size) {
1918                                         if (shift_sym < 0 ||
1919                                             (shift_len == bp && shift_remain > pr->body_size - bp)) {
1920                                                 shift_sym = pr->body[bp]->num;
1921                                                 shift_len = bp;
1922                                                 shift_remain = pr->body_size - bp;
1923                                         }
1924                                         continue;
1925                                 }
1926                                 /* This is what we reduce */
1927                                 if (prod < 0 || prod_len < pr->body_size) {
1928                                         prod = p;
1929                                         prod_len = pr->body_size;
1930                                 }
1931                         }
1932
1933                         if (prod >= 0)
1934                                 fprintf(f, "\t[%d] = { %d, goto_%d, %d, %d, %d, 0, %d },\n",
1935                                         i, is->go_to.cnt, i, prod,
1936                                         g->productions[prod]->body_size,
1937                                         g->productions[prod]->head->num,
1938                                         is->starts_line);
1939                         else
1940                                 fprintf(f, "\t[%d] = { %d, goto_%d, -1, -1, -1, %d, %d },\n",
1941                                         i, is->go_to.cnt, i, shift_sym,
1942                                         is->starts_line);
1943                 }
1944                 fprintf(f, "};\n\n");
1945         }
1946
1947 ### The `do_reduce` function and the code
1948
1949 When the parser engine decides to reduce a production, it calls `do_reduce`.
1950 This has two functions.
1951
1952 Firstly, if a non-NULL `trace` file is passed, it prints out details of the
1953 production being reduced.  Secondly it runs the code that was included with
1954 the production if any.
1955
1956 This code needs to be able to store data somewhere.  Rather than requiring
1957 `do_reduce` to `malloc` that "somewhere", we pass in a large buffer and have
1958 `do_reduce` return the size to be saved.
1959
1960 The code fragment requires translation when written out.  Any `$N` needs to
1961 be converted to a reference either to that buffer (if `$0`) or to the
1962 structure returned by a previous reduction.  These pointer need to be cast
1963 to the appropriate type for each access.  All this is handling in
1964 `gen_code`.
1965
1966
1967 ###### functions
1968
1969         static void gen_code(struct production *p, FILE *f, struct grammar *g)
1970         {
1971                 char *c;
1972                 fprintf(f, "\t\t\t");
1973                 for (c = p->code.txt; c < p->code.txt + p->code.len; c++) {
1974                         int n;
1975                         if (*c != '$') {
1976                                 fputc(*c, f);
1977                                 if (*c == '\n')
1978                                         fputs("\t\t\t", f);
1979                                 continue;
1980                         }
1981                         c++;
1982                         if (*c < '0' || *c > '9') {
1983                                 fputc(*c, f);
1984                                 continue;
1985                         }
1986                         n = *c - '0';
1987                         while (c[1] >= '0' && c[1] <= '9') {
1988                                 c += 1;
1989                                 n = n * 10 + *c - '0';
1990                         }
1991                         if (n == 0)
1992                                 fprintf(f, "(*(struct %.*s*%s)ret)",
1993                                         p->head->struct_name.len,
1994                                         p->head->struct_name.txt,
1995                                         p->head->isref ? "*":"");
1996                         else if (n > p->body_size)
1997                                 fprintf(f, "$%d", n);
1998                         else if (p->body[n-1]->type == Terminal)
1999                                 fprintf(f, "(*(struct token *)body[%d])",
2000                                         n-1);
2001                         else if (p->body[n-1]->struct_name.txt == NULL)
2002                                 fprintf(f, "$%d", n);
2003                         else
2004                                 fprintf(f, "(*(struct %.*s*%s)body[%d])",
2005                                         p->body[n-1]->struct_name.len,
2006                                         p->body[n-1]->struct_name.txt,
2007                                         p->body[n-1]->isref ? "*":"", n-1);
2008                 }
2009                 fputs("\n", f);
2010         }
2011
2012 ###### functions
2013
2014         static void gen_reduce(FILE *f, struct grammar *g, char *file)
2015         {
2016                 int i;
2017                 fprintf(f, "#line 0 \"gen_reduce\"\n");
2018                 fprintf(f, "static int do_reduce(int prod, void **body, void *ret)\n");
2019                 fprintf(f, "{\n");
2020                 fprintf(f, "\tint ret_size = 0;\n");
2021
2022                 fprintf(f, "\tswitch(prod) {\n");
2023                 for (i = 0; i < g->production_count; i++) {
2024                         struct production *p = g->productions[i];
2025                         fprintf(f, "\tcase %d:\n", i);
2026
2027                         if (p->code.txt)
2028                                 gen_code(p, f, g);
2029
2030                         if (p->head->struct_name.txt)
2031                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s%s);\n",
2032                                         p->head->struct_name.len,
2033                                         p->head->struct_name.txt,
2034                                         p->head->isref ? "*":"");
2035
2036                         fprintf(f, "\t\tbreak;\n");
2037                 }
2038                 fprintf(f, "\t}\n\treturn ret_size;\n}\n\n");
2039         }
2040
2041 ### `do_free`
2042
2043 As each non-terminal can potentially cause a different type of data
2044 structure to be allocated and filled in, we need to be able to free it when
2045 done.
2046
2047 It is particularly important to have fine control over freeing during error
2048 recovery where individual stack frames might need to be freed.
2049
2050 For this, the grammar author required to defined a `free_XX` function for
2051 each structure that is used by a non-terminal.  `do_free` all call whichever
2052 is appropriate given a symbol number, and will call `free` (as is
2053 appropriate for tokens` on any terminal symbol.
2054
2055 ###### functions
2056
2057         static void gen_free(FILE *f, struct grammar *g)
2058         {
2059                 int i;
2060
2061                 fprintf(f, "#line 0 \"gen_free\"\n");
2062                 fprintf(f, "static void do_free(short sym, void *asn)\n");
2063                 fprintf(f, "{\n");
2064                 fprintf(f, "\tif (!asn) return;\n");
2065                 fprintf(f, "\tif (sym < %d) {\n", g->first_nonterm);
2066                 fprintf(f, "\t\tfree(asn);\n\t\treturn;\n\t}\n");
2067                 fprintf(f, "\tswitch(sym) {\n");
2068
2069                 for (i = 0; i < g->num_syms; i++) {
2070                         struct symbol *s = g->symtab[i];
2071                         if (!s ||
2072                             s->type != Nonterminal ||
2073                             s->struct_name.txt == NULL)
2074                                 continue;
2075
2076                         fprintf(f, "\tcase %d:\n", s->num);
2077                         if (s->isref)
2078                                 fprintf(f, "\t\tfree_%.*s(*(void**)asn);\n",
2079                                         s->struct_name.len,
2080                                         s->struct_name.txt);
2081                         else
2082                                 fprintf(f, "\t\tfree_%.*s(asn);\n",
2083                                         s->struct_name.len,
2084                                         s->struct_name.txt);
2085                         fprintf(f, "\t\tbreak;\n");
2086                 }
2087                 fprintf(f, "\t}\n}\n\n");
2088         }
2089
2090 ## The main routine.
2091
2092 There are three key parts to the "main" function of parsergen: processing
2093 the arguments, loading the grammar file, and dealing with the grammar.
2094
2095 ### Argument processing.
2096
2097 Command line options allow the selection of analysis mode, name of output
2098 file, and whether or not a report should be generated.  By default we create
2099 a report only if no code output was requested.
2100
2101 The `parse_XX` function name uses the basename of the output file which
2102 should not have a suffix (`.c`).  `.c` is added to the given name for the
2103 code, and `.h` is added for the header (if header text is specifed in the
2104 grammar file).
2105
2106 ###### includes
2107         #include <getopt.h>
2108
2109 ###### declarations
2110         static const struct option long_options[] = {
2111                 { "LR0",        0, NULL, '0' },
2112                 { "LR05",       0, NULL, '5' },
2113                 { "SLR",        0, NULL, 'S' },
2114                 { "LALR",       0, NULL, 'L' },
2115                 { "LR1",        0, NULL, '1' },
2116                 { "tag",        1, NULL, 't' },
2117                 { "report",     0, NULL, 'R' },
2118                 { "output",     1, NULL, 'o' },
2119                 { NULL,         0, NULL, 0   }
2120         };
2121         const char *options = "05SL1t:Ro:";
2122
2123 ###### process arguments
2124         int opt;
2125         char *outfile = NULL;
2126         char *infile;
2127         char *name;
2128         char *tag = NULL;
2129         int report = 1;
2130         enum grammar_type type = LR05;
2131         while ((opt = getopt_long(argc, argv, options,
2132                                   long_options, NULL)) != -1) {
2133                 switch(opt) {
2134                 case '0':
2135                         type = LR0; break;
2136                 case '5':
2137                         type = LR05; break;
2138                 case 'S':
2139                         type = SLR; break;
2140                 case 'L':
2141                         type = LALR; break;
2142                 case '1':
2143                         type = LR1; break;
2144                 case 'R':
2145                         report = 2; break;
2146                 case 'o':
2147                         outfile = optarg; break;
2148                 case 't':
2149                         tag = optarg; break;
2150                 default:
2151                         fprintf(stderr, "Usage: parsergen ...\n");
2152                         exit(1);
2153                 }
2154         }
2155         if (optind < argc)
2156                 infile = argv[optind++];
2157         else {
2158                 fprintf(stderr, "No input file given\n");
2159                 exit(1);
2160         }
2161         if (outfile && report == 1)
2162                 report = 0;
2163         name = outfile;
2164         if (name && strchr(name, '/'))
2165                 name = strrchr(name, '/')+1;
2166
2167         if (optind < argc) {
2168                 fprintf(stderr, "Excess command line arguments\n");
2169                 exit(1);
2170         }
2171
2172 ### Loading the grammar file
2173
2174 To be able to run `mdcode` and `scanner` on the grammar we need to memory
2175 map it.
2176
2177 One we have extracted the code (with `mdcode`) we expect to file three
2178 sections: header, code, and grammar.  Anything else is an error.
2179
2180 "header" and "code" are optional, though it is hard to build a working
2181 parser with neither. "grammar" must be provided.
2182
2183 ###### includes
2184         #include <fcntl.h>
2185         #include <sys/mman.h>
2186         #include <errno.h>
2187
2188 ###### functions
2189         static int errs;
2190         static void pr_err(char *msg)
2191         {
2192                 errs++;
2193                 fprintf(stderr, "%s\n", msg);
2194         }
2195
2196 ###### load file
2197         struct section *table;
2198         int fd;
2199         int len;
2200         char *file;
2201         fd = open(infile, O_RDONLY);
2202         if (fd < 0) {
2203                 fprintf(stderr, "parsergen: cannot open %s: %s\n",
2204                         infile, strerror(errno));
2205                 exit(1);
2206         }
2207         len = lseek(fd, 0, 2);
2208         file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2209         table = code_extract(file, file+len, pr_err);
2210
2211         struct code_node *hdr = NULL;
2212         struct code_node *code = NULL;
2213         struct code_node *gram = NULL;
2214         for (s = table; s; s = s->next) {
2215                 struct text sec = s->section;
2216                 if (tag && !strip_tag(&sec, tag))
2217                         continue;
2218                 if (text_is(sec, "header"))
2219                         hdr = s->code;
2220                 else if (text_is(sec, "code"))
2221                         code = s->code;
2222                 else if (text_is(sec, "grammar"))
2223                         gram = s->code;
2224                 else {
2225                         fprintf(stderr, "Unknown content section: %.*s\n",
2226                                 s->section.len, s->section.txt);
2227                         rv |= 2;
2228                 }
2229         }
2230
2231 ### Processing the input
2232
2233 As we need to append an extention to a filename and then open it for
2234 writing, and we need to do this twice, it helps to have a separate function.
2235
2236 ###### functions
2237
2238         static FILE *open_ext(char *base, char *ext)
2239         {
2240                 char *fn = malloc(strlen(base) + strlen(ext) + 1);
2241                 FILE *f;
2242                 strcat(strcpy(fn, base), ext);
2243                 f = fopen(fn, "w");
2244                 free(fn);
2245                 return f;
2246         }
2247
2248 If we can read the grammar, then we analyse and optionally report on it.  If
2249 the report finds conflicts we will exit with an error status.
2250
2251 ###### process input
2252         struct grammar *g = NULL;
2253         if (gram == NULL) {
2254                 fprintf(stderr, "No grammar section provided\n");
2255                 rv |= 2;
2256         } else {
2257                 g = grammar_read(gram);
2258                 if (!g) {
2259                         fprintf(stderr, "Failure to parse grammar\n");
2260                         rv |= 2;
2261                 }
2262         }
2263         if (g) {
2264                 grammar_analyse(g, type);
2265                 if (report)
2266                         if (grammar_report(g, type))
2267                                 rv |= 1;
2268         }
2269
2270 If a headers section is defined, we write it out and include a declaration
2271 for the `parse_XX` function so it can be used from separate code.
2272
2273         if (rv == 0 && hdr && outfile) {
2274                 FILE *f = open_ext(outfile, ".h");
2275                 if (f) {
2276                         code_node_print(f, hdr, infile);
2277                         fprintf(f, "void *parse_%s(struct code_node *code, struct token_config *config, FILE *trace);\n",
2278                                 name);
2279                         fclose(f);
2280                 } else {
2281                         fprintf(stderr, "Cannot create %s.h\n",
2282                                 outfile);
2283                         rv |= 4;
2284                 }
2285         }
2286
2287 And if all goes well and an output file was provided, we create the `.c`
2288 file with the code section (if any) and the parser tables and function.
2289
2290         if (rv == 0 && outfile) {
2291                 FILE *f = open_ext(outfile, ".c");
2292                 if (f) {
2293                         if (code)
2294                                 code_node_print(f, code, infile);
2295                         gen_parser(f, g, infile, name);
2296                         fclose(f);
2297                 } else {
2298                         fprintf(stderr, "Cannot create %s.c\n",
2299                                 outfile);
2300                         rv |= 4;
2301                 }
2302         }
2303
2304 And that about wraps it up.  We need to set the locale so that UTF-8 is
2305 recognised properly, and link with `libicuuc` is `libmdcode` requires that.
2306
2307 ###### File: parsergen.mk
2308         parsergen : parsergen.o libscanner.o libmdcode.o
2309                 $(CC) $(CFLAGS) -o parsergen parsergen.o libscanner.o libmdcode.o -licuuc
2310
2311 ###### includes
2312         #include <locale.h>
2313
2314 ###### main
2315
2316         int main(int argc, char *argv[])
2317         {
2318                 struct section *s;
2319                 int rv = 0;
2320
2321                 setlocale(LC_ALL,"");
2322
2323                 ## process arguments
2324                 ## load file
2325                 ## process input
2326
2327                 return rv;
2328         }
2329
2330 ## The SHIFT/REDUCE parser
2331
2332 Having analysed the grammar and generated all the table, we only need the
2333 shift/reduce engine to bring it all together.
2334
2335 ### Goto table lookup
2336
2337 The parser generator has nicely provided us with goto tables sorted by
2338 symbol number.  We need a binary search function to find a symbol in the
2339 table.
2340
2341 ###### parser functions
2342
2343         static int search(const struct state *l, int sym)
2344         {
2345                 int lo = 0;
2346                 int hi = l->go_to_cnt;
2347
2348                 if (hi == 0)
2349                         return -1;
2350                 while (lo + 1 < hi) {
2351                         int mid = (lo + hi) / 2;
2352                         if (l->go_to[mid].sym <= sym)
2353                                 lo = mid;
2354                         else
2355                                 hi = mid;
2356                 }
2357                 if (l->go_to[lo].sym == sym)
2358                         return l->go_to[lo].state;
2359                 else
2360                         return -1;
2361         }
2362
2363 ### The state stack.
2364
2365 The core data structure for the parser is the stack.  This tracks all the
2366 symbols that have been recognised or partially recognised.
2367
2368 The stack usually won't grow very large - maybe a few tens of entries.  So
2369 we dynamically resize and array as required but never bother to shrink it
2370 down again.
2371
2372 We keep the stack as two separate allocations.  One, `asn_stack` stores the
2373 "abstract syntax nodes" which are created by each reduction.  When we call
2374 `do_reduce` we need to pass an array of the `asn`s of the body of the
2375 production, and by keeping a separate `asn` stack, we can just pass a
2376 pointer into this stack.
2377
2378 The other allocation stores all other stack fields of which there are four.
2379 The `state` is the most important one and guides the parsing process.  The
2380 `sym` is nearly unnecessary.  However when we want to free entries from the
2381 `asn_stack`, it helps to know what type they are so we can call the right
2382 freeing function.  The symbol leads us to the right free function through
2383 `do_free`.
2384
2385 The `indents` count and the `starts_indented` flag track the line
2386 indents in the symbol.  These are used to allow indent information to
2387 guide parsing and error recovery.
2388
2389 As well as the stack of frames we have a `next` frame which is
2390 assembled from the incoming token and other information prior to
2391 pushing it onto the stack.
2392
2393 ###### parser functions
2394
2395         struct parser {
2396                 struct frame {
2397                         short state;
2398                         short sym;
2399                         short starts_indented;
2400                         short indents;
2401                         short newline_permitted;
2402                 } *stack, next;
2403                 void **asn_stack;
2404                 int stack_size;
2405                 int tos;
2406         };
2407
2408 #### Shift and pop
2409
2410 Two operations are needed on the stack - shift (which is like push) and pop.
2411
2412 Shift applies not only to terminals but also to non-terminals.  When we
2413 reduce a production we will pop off entries corresponding to the body
2414 symbols, then push on an item for the head of the production.  This last is
2415 exactly the same process as shifting in a terminal so we use the same
2416 function for both.
2417
2418 To simplify other code we arrange for `shift` to fail if there is no `goto`
2419 state for the symbol.  This is useful in basic parsing due to our design
2420 that we shift when we can, and reduce when we cannot.  So the `shift`
2421 function reports if it could.
2422
2423 So `shift` finds the next state.  If that succeed it extends the allocations
2424 if needed and pushes all the information onto the stacks.
2425
2426 ###### parser functions
2427
2428         static int shift(struct parser *p,
2429                          void *asn,
2430                          const struct state states[])
2431         {
2432                 // Push an entry onto the stack
2433                 int newstate = search(&states[p->next.state], p->next.sym);
2434                 if (newstate < 0)
2435                         return 0;
2436                 if (p->tos >= p->stack_size) {
2437                         p->stack_size += 10;
2438                         p->stack = realloc(p->stack, p->stack_size
2439                                            * sizeof(p->stack[0]));
2440                         p->asn_stack = realloc(p->asn_stack, p->stack_size
2441                                            * sizeof(p->asn_stack[0]));
2442                 }
2443                 p->stack[p->tos] = p->next;
2444                 p->asn_stack[p->tos] = asn;
2445                 p->tos++;
2446                 p->next.state = newstate;
2447                 p->next.indents = 0;
2448                 p->next.starts_indented = 0;
2449                 // if new state doesn't start a line, we inherit newline_permitted status
2450                 if (states[newstate].starts_line)
2451                         p->next.newline_permitted = 1;
2452                 return 1;
2453         }
2454
2455 `pop` simply moves the top of stack (`tos`) back down the required amount
2456 and frees any `asn` entries that need to be freed.  It is called _after_ we
2457 reduce a production, just before we `shift` the nonterminal in.
2458
2459 ###### parser functions
2460
2461         static void pop(struct parser *p, int num,
2462                         void(*do_free)(short sym, void *asn))
2463         {
2464                 int i;
2465                 p->tos -= num;
2466                 for (i = 0; i < num; i++) {
2467                         p->next.indents += p->stack[p->tos+i].indents;
2468                         do_free(p->stack[p->tos+i].sym,
2469                                 p->asn_stack[p->tos+i]);
2470                 }
2471
2472                 if (num) {
2473                         p->next.state = p->stack[p->tos].state;
2474                         p->next.starts_indented = p->stack[p->tos].starts_indented;
2475                         p->next.newline_permitted = p->stack[p->tos].newline_permitted;
2476                         if (p->next.indents > p->next.starts_indented)
2477                                 p->next.newline_permitted = 0;
2478                 }
2479         }
2480
2481 ### Memory allocation
2482
2483 The `scanner` returns tokens in a local variable - we want them in allocated
2484 memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
2485 a reduce is in a large buffer.  Both of these require some allocation and
2486 copying, hence `memdup` and `tokcopy`.
2487
2488 ###### parser includes
2489         #include <memory.h>
2490
2491 ###### parser functions
2492
2493         void *memdup(void *m, int len)
2494         {
2495                 void *ret;
2496                 if (len == 0)
2497                         return NULL;
2498                 ret = malloc(len);
2499                 memcpy(ret, m, len);
2500                 return ret;
2501         }
2502
2503         static struct token *tok_copy(struct token tk)
2504         {
2505                 struct token *new = malloc(sizeof(*new));
2506                 *new = tk;
2507                 return new;
2508         }
2509
2510 ### The heart of the parser.
2511
2512 Now we have the parser.  If we can shift, we do.  If not and we can reduce
2513 we do.  If the production we reduced was production zero, then we have
2514 accepted the input and can finish.
2515
2516 We return whatever `asn` was returned by reducing production zero.
2517
2518 If we can neither shift nor reduce we have an error to handle.  We pop
2519 single entries off the stack until we can shift the `TK_error` symbol, then
2520 drop input tokens until we find one we can shift into the new error state.
2521
2522 When we find `TK_in` and `TK_out` tokens which report indents we need
2523 to handle them directly as the grammar cannot express what we want to
2524 do with them.
2525
2526 `TK_in` tokens are easy: we simply update the `next` stack frame to
2527 record how many indents there are and that the next token started with
2528 an indent.
2529
2530 `TK_out` tokens must either be counted off against any pending indent,
2531 or must force reductions until there is a pending indent which isn't
2532 at the start of a production.
2533
2534 ###### parser includes
2535         #include "parser.h"
2536 ###### parser_run
2537         void *parser_run(struct token_state *tokens,
2538                          const struct state states[],
2539                          int (*do_reduce)(int, void**, void*),
2540                          void (*do_free)(short, void*),
2541                          FILE *trace, const char *non_term[], int knowns)
2542         {
2543                 struct parser p = { 0 };
2544                 struct token *tk = NULL;
2545                 int accepted = 0;
2546                 void *ret;
2547
2548                 p.next.newline_permitted = states[0].starts_line;
2549                 while (!accepted) {
2550                         struct token *err_tk;
2551                         if (!tk)
2552                                 tk = tok_copy(token_next(tokens));
2553                         p.next.sym = tk->num;
2554                         if (trace)
2555                                 parser_trace(trace, &p, tk, states, non_term, knowns);
2556
2557                         if (p.next.sym == TK_in) {
2558                                 p.next.starts_indented = 1;
2559                                 p.next.indents = 1;
2560                                 free(tk);
2561                                 tk = NULL;
2562                                 continue;
2563                         }
2564                         if (p.next.sym == TK_out) {
2565                                 if (p.stack[p.tos-1].indents > p.stack[p.tos-1].starts_indented ||
2566                                     (p.stack[p.tos-1].indents == 1 &&
2567                                      states[p.next.state].reduce_size > 1)) {
2568                                         p.stack[p.tos-1].indents -= 1;
2569                                         if (p.stack[p.tos-1].indents == p.stack[p.tos-1].starts_indented) {
2570                                                 // no internal indent any more, reassess 'newline_permitted'
2571                                                 if (states[p.stack[p.tos-1].state].starts_line)
2572                                                         p.stack[p.tos-1].newline_permitted = 1;
2573                                                 else if (p.tos > 1)
2574                                                         p.stack[p.tos-1].newline_permitted = p.stack[p.tos-2].newline_permitted;
2575                                         }
2576                                         free(tk);
2577                                         tk = NULL;
2578                                         continue;
2579                                 }
2580                                 // fall through and force a REDUCE (as 'shift'
2581                                 // will fail).
2582                         }
2583                         if (p.next.sym == TK_newline) {
2584                                 if (!p.tos || ! p.stack[p.tos-1].newline_permitted) {
2585                                         free(tk);
2586                                         tk = NULL;
2587                                         continue;
2588                                 }
2589                         }
2590                         if (shift(&p, tk, states)) {
2591                                 tk = NULL;
2592                                 continue;
2593                         }
2594                         if (states[p.next.state].reduce_prod >= 0) {
2595                                 void **body;
2596                                 int prod = states[p.next.state].reduce_prod;
2597                                 int size = states[p.next.state].reduce_size;
2598                                 int bufsize;
2599                                 static char buf[16*1024];
2600                                 p.next.sym = states[p.next.state].reduce_sym;
2601
2602                                 body = p.asn_stack +
2603                                         (p.tos - states[p.next.state].reduce_size);
2604
2605                                 bufsize = do_reduce(prod, body, buf);
2606
2607                                 pop(&p, size, do_free);
2608                                 shift(&p, memdup(buf, bufsize), states);
2609                                 if (prod == 0)
2610                                         accepted = 1;
2611                                 continue;
2612                         }
2613                         if (tk->num == TK_out) {
2614                                 // Indent problem - synthesise tokens to get us
2615                                 // out of here.
2616                                 fprintf(stderr, "Synthesize %d to handle indent problem\n", states[p.next.state].shift_sym);
2617                                 p.next.sym = states[p.next.state].shift_sym;
2618                                 shift(&p, tok_copy(*tk), states);
2619                                 // FIXME need to report this error somehow
2620                                 continue;
2621                         }
2622                         /* Error. We walk up the stack until we
2623                          * find a state which will accept TK_error.
2624                          * We then shift in TK_error and see what state
2625                          * that takes us too.
2626                          * Then we discard input tokens until
2627                          * we find one that is acceptable.
2628                          */
2629
2630                         err_tk = tok_copy(*tk);
2631                         p.next.sym = TK_error;
2632                         while (shift(&p, err_tk, states) == 0
2633                                && p.tos > 0)
2634                                 // discard this state
2635                                 pop(&p, 1, do_free);
2636                         if (p.tos == 0) {
2637                                 free(err_tk);
2638                                 // no state accepted TK_error
2639                                 break;
2640                         }
2641                         while (search(&states[p.next.state], tk->num) < 0 &&
2642                                tk->num != TK_eof) {
2643                                 free(tk);
2644                                 tk = tok_copy(token_next(tokens));
2645                                 if (tk->num == TK_in)
2646                                         p.next.indents += 1;
2647                                 if (tk->num == TK_out) {
2648                                         if (p.next.indents == 0)
2649                                                 break;
2650                                         p.next.indents -= 1;
2651                                 }
2652                         }
2653                         if (p.tos == 0 && tk->num == TK_eof)
2654                                 break;
2655                 }
2656                 free(tk);
2657                 if (accepted)
2658                         ret = p.asn_stack[0];
2659                 else
2660                         pop(&p, p.tos, do_free);
2661                 free(p.asn_stack);
2662                 free(p.stack);
2663                 return ret;
2664         }
2665
2666 ###### exported functions
2667         void *parser_run(struct token_state *tokens,
2668                          const struct state states[],
2669                          int (*do_reduce)(int, void**, void*),
2670                          void (*do_free)(short, void*),
2671                          FILE *trace, const char *non_term[], int knowns);
2672
2673 ### Tracing
2674
2675 Being able to visualize the parser in action can be invaluable when
2676 debugging the parser code, or trying to understand how the parse of a
2677 particular grammar progresses.  The stack contains all the important
2678 state, so just printing out the stack every time around the parse loop
2679 can make it possible to see exactly what is happening.
2680
2681 This doesn't explicitly show each SHIFT and REDUCE action.  However they
2682 are easily deduced from the change between consecutive lines, and the
2683 details of each state can be found by cross referencing the states list
2684 in the stack with the "report" that parsergen can generate.
2685
2686 For terminal symbols, we just dump the token.  For non-terminals we
2687 print the name of the symbol.  The look ahead token is reported at the
2688 end inside square brackets.
2689
2690 ###### parser functions
2691
2692         static char *reserved_words[] = {
2693                 [TK_error]        = "ERROR",
2694                 [TK_in]           = "IN",
2695                 [TK_out]          = "OUT",
2696                 [TK_newline]      = "NEWLINE",
2697                 [TK_eof]          = "$eof",
2698         };
2699         static void parser_trace_state(FILE *trace, struct frame *f, const struct state states[])
2700         {
2701                 fprintf(trace, "(%d", f->state);
2702                 if (f->indents)
2703                         fprintf(trace, "%c%d", f->starts_indented?':':'.',
2704                                 f->indents);
2705                 if (states[f->state].starts_line)
2706                         fprintf(trace, "s");
2707                 if (f->newline_permitted)
2708                         fprintf(trace, "n");
2709                 fprintf(trace, ") ");
2710         }
2711
2712         void parser_trace(FILE *trace, struct parser *p,
2713                           struct token *tk, const struct state states[],
2714                           const char *non_term[], int knowns)
2715         {
2716                 int i;
2717                 for (i = 0; i < p->tos; i++) {
2718                         int sym = p->stack[i].sym;
2719                         parser_trace_state(trace, &p->stack[i], states);
2720                         if (sym < TK_reserved &&
2721                             reserved_words[sym] != NULL)
2722                                 fputs(reserved_words[sym], trace);
2723                         else if (sym < TK_reserved + knowns) {
2724                                 struct token *t = p->asn_stack[i];
2725                                 text_dump(trace, t->txt, 20);
2726                         } else
2727                                 fputs(non_term[sym - TK_reserved - knowns],
2728                                       trace);
2729                         fputs(" ", trace);
2730                 }
2731                 parser_trace_state(trace, &p->next, states);
2732                 fprintf(trace, " [");
2733                 if (tk->num < TK_reserved &&
2734                     reserved_words[tk->num] != NULL)
2735                         fputs(reserved_words[tk->num], trace);
2736                 else
2737                         text_dump(trace, tk->txt, 20);
2738                 fputs("]\n", trace);
2739         }
2740
2741 # A Worked Example
2742
2743 The obvious example for a parser is a calculator.
2744
2745 As `scanner` provides number parsing function using `libgmp` is it not much
2746 work to perform arbitrary rational number calculations.
2747
2748 This calculator takes one expression, or an equality test per line.  The
2749 results are printed and in any equality test fails, the program exits with
2750 an error.
2751
2752 Embedding mdcode inside mdcode is rather horrible.  I'd like to find a
2753 better approach, but as the grammar file must have 3 components I need
2754 something like this.
2755
2756 ###### File: parsergen.mk
2757         calc.c calc.h : parsergen parsergen.mdc
2758                 ./parsergen --tag calc -o calc parsergen.mdc
2759         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
2760                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
2761
2762 # calc: header
2763
2764         #include "number.h"
2765         // what do we use for a demo-grammar?  A calculator of course.
2766         struct number {
2767                 mpq_t val;
2768                 char tail[2];
2769                 int err;
2770         };
2771
2772 # calc: code
2773
2774         #include <stdlib.h>
2775         #include <unistd.h>
2776         #include <fcntl.h>
2777         #include <sys/mman.h>
2778         #include <stdio.h>
2779         #include <malloc.h>
2780         #include <gmp.h>
2781         #include "mdcode.h"
2782         #include "scanner.h"
2783         #include "number.h"
2784         #include "parser.h"
2785
2786         #include "calc.h"
2787
2788         static void free_number(struct number *n)
2789         {
2790                 mpq_clear(n->val);
2791                 free(n);
2792         }
2793
2794         int main(int argc, char *argv[])
2795         {
2796                 int fd = open(argv[1], O_RDONLY);
2797                 int len = lseek(fd, 0, 2);
2798                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2799                 struct section *s = code_extract(file, file+len, NULL);
2800                 struct token_config config = {
2801                         .ignored = (1 << TK_line_comment)
2802                                  | (1 << TK_block_comment)
2803                                  | (1 << TK_in)
2804                                  | (1 << TK_out),
2805                         .number_chars = ".,_+-",
2806                         .word_start = "",
2807                         .word_cont = "",
2808                 };
2809                 parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
2810                 exit(0);
2811         }
2812
2813 # calc: grammar
2814
2815         Session -> Session Line
2816                 | Line
2817
2818         Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
2819                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
2820                                         gmp_printf("  or as a decimal: %Fg\n", fl);
2821                                         mpf_clear(fl);
2822                                         }
2823                                      }$
2824                 | Expression = Expression NEWLINE ${
2825                         if (mpq_equal($1.val, $3.val))
2826                                 gmp_printf("Both equal %Qd\n", $1.val);
2827                         else {
2828                                 gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
2829                                         $1.val, $3.val);
2830                                 exit(1);
2831                         }
2832                 }$
2833                 | NEWLINE ${ printf("Blank line\n"); }$
2834                 | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
2835
2836         $number
2837         Expression -> Expression + Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$
2838                 | Expression - Term ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
2839                 | Term ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2840
2841         Term -> Term * Factor ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
2842                 | Term / Factor ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
2843                 | Factor ${ mpq_init($0.val); mpq_set($0.val, $1.val); }$
2844
2845         Factor -> NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
2846                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$