parsergen: allow "$void" to remove current type.

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index ddf03ab27d640ab38f8a07c342f9c5ee9dc66324..0099f5785db32072118b228473d8aa94ed9bf09f 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -40,6 +40,7 @@ There are several distinct sections.
         #include <stdlib.h>
         #include <stdio.h>
         ## parser includes
+       ## parser functions
         ## parser
  ###### File: calc.cgm
         ## demo grammar
@@ -234,6 +235,12 @@ The data type name is simply stored and applied to the head of all
  subsequent productions.  It must be the name of a structure, so `$expr`
  maps to `struct expr`.
  
+Any productions given before the first data type will have no data type
+and can carry no information.  In order to allow other non-terminals to
+have no type, the data type `$void` can be given.  This does *not* mean
+that `struct void` will be used, but rather than no type will be
+associated with future non-terminals.
+
  The precedence line must contain a list of symbols - typically
  terminal symbols, but not necessarily.  It can only contain symbols
  that have not been seen yet, so precedence declaration must precede
@@ -281,6 +288,8 @@ production inherits from the last symbol which has a precedence.
                         assoc = Non;
                 else {
                         g->current_type = t.txt;
+                       if (text_is(t.txt, "void"))
+                               g->current_type.txt = NULL;
                         t = token_next(ts);
                         if (t.num != TK_newline) {
                                 err = "Extra tokens after type name";
@@ -528,7 +537,7 @@ Now we are ready to read in the grammar.
                         .ignored = (1 << TK_line_comment)
                                  | (1 << TK_block_comment)
                                  | (0 << TK_number)
-                                | (0 << TK_string)
+                                | (1 << TK_string)
                                  | (1 << TK_multi_string)
                                  | (1 << TK_indent)
                                  | (1 << TK_undent),
@@ -636,7 +645,7 @@ allocated space as it can be derived directly from the current `cnt` using
  `((cnt - 1) | 7) + 1`.
  
  ###### functions
-       static void symset_add(struct symset *s, int key, int val)
+       static void symset_add(struct symset *s, unsigned short key, unsigned short val)
         {
                 int i;
                 int current = ((s->cnt-1) | 7) + 1;
@@ -663,7 +672,7 @@ Finding a symbol (or item) in a `symset` uses a simple binary search.
  We return the index where the value was found (so data can be accessed),
  or `-1` to indicate failure.
  
-       static int symset_find(struct symset *ss, int key)
+       static int symset_find(struct symset *ss, unsigned short key)
         {
                 int lo = 0;
                 int hi = ss->cnt;
@@ -695,7 +704,7 @@ can be optimised later.
                 int added = 0;
                 for (i = 0; i < b->cnt; i++)
                         if (symset_find(a, b->syms[i]) < 0) {
-                               int data = 0;
+                               unsigned short data = 0;
                                 if (b->data != NO_DATA)
                                         data = b->data[i];
                                 symset_add(a, b->syms[i], data);
@@ -1003,7 +1012,7 @@ the list in the symset, and then only compare items before the first "0".
  ###### declarations
         static inline unsigned short item_num(int production, int index)
         {
-               return production | (((index-1)&0x1f) << 11);
+               return production | ((31-index) << 11);
         }
         static inline int item_prod(unsigned short item)
         {
@@ -1011,7 +1020,7 @@ the list in the symset, and then only compare items before the first "0".
         }
         static inline int item_index(unsigned short item)
         {
-               return ((item >> 11)+1) & 0x1f;
+               return (31-(item >> 11)) & 0x1f;
         }
  
  For LR(1) analysis we need to compare not just the itemset in a state
@@ -1168,7 +1177,7 @@ though.
                 int p2;
                 struct symbol *s;
                 struct symset LA = INIT_SYMSET;
-               int sn = 0;
+               unsigned short sn = 0;
  
                 if (bs == pr->body_size)
                         continue;
@@ -1226,7 +1235,7 @@ with a pre-existing itemset).
         // if they don't exist.
         for (i = 0; i < done.cnt; i++) {
                 int j;
-               int state;
+               unsigned short state;
                 struct symset newitemset = INIT_SYMSET;
                 if (type >= LALR)
                         newitemset = INIT_DATASET;
@@ -1236,7 +1245,7 @@ with a pre-existing itemset).
                         int p = item_prod(itm);
                         int bp = item_index(itm);
                         struct production *pr = g->productions[p];
-                       int la = 0;
+                       unsigned short la = 0;
                         int pos;
  
                         if (bp == pr->body_size)
@@ -1277,7 +1286,7 @@ with `TK_eof` as the LA set.
                 struct symset first = INIT_SYMSET;
                 struct itemset *is;
                 int again;
-               int la = 0;
+               unsigned short la = 0;
                 if (type >= LALR) {
                         // LA set just has eof
                         struct symset eof = INIT_SYMSET;
@@ -1296,6 +1305,7 @@ with `TK_eof` as the LA set.
                         if (is->completed)
                                 continue;
                         is->completed = 1;
+                       again = 1;
                         ## complete itemset
                         ## derive itemsets
                         symset_free(done);
@@ -1607,7 +1617,7 @@ between the two.
                                 continue;
                         /* First collect the shifts */
                         for (j = 0; j < is->items.cnt; j++) {
-                               int itm = is->items.syms[j];
+                               unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
@@ -1622,7 +1632,7 @@ between the two.
                         }
                         /* Now look for reduction and conflicts */
                         for (j = 0; j < is->items.cnt; j++) {
-                               int itm = is->items.syms[j];
+                               unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
                                 int bp = item_index(itm);
                                 struct production *pr = g->productions[p];
@@ -1685,6 +1695,7 @@ pieces of code provided in the grammar file, so they are generated first.
         static void gen_parser(FILE *f, struct grammar *g, char *file, char *name)
         {
                 gen_known(f, g);
+               gen_non_term(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
                 gen_reduce(f, g, file);
@@ -1699,7 +1710,7 @@ pieces of code provided in the grammar file, so they are generated first.
                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                 fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                 fprintf(f, "\ttokens = token_open(code, config);\n");
-               fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace);\n");
+               fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config->known_count);\n");
                 fprintf(f, "\ttoken_close(tokens);\n");
                 fprintf(f, "\treturn rv;\n");
                 fprintf(f, "}\n\n");
@@ -1708,6 +1719,7 @@ pieces of code provided in the grammar file, so they are generated first.
  ### Table words table
  
  The know words is simply an array of terminal symbols.
+The table of nonterminals used for tracing is a similar array.
  
  ###### functions
  
@@ -1724,6 +1736,20 @@ The know words is simply an array of terminal symbols.
                 fprintf(f, "};\n\n");
         }
  
+       static void gen_non_term(FILE *f, struct grammar *g)
+       {
+               int i;
+               fprintf(f, "#line 0 \"gen_non_term\"\n");
+               fprintf(f, "static const char *non_term[] = {\n");
+               for (i = TK_reserved;
+                    i < g->num_syms;
+                    i++)
+                       if (g->symtab[i]->type == Nonterminal)
+                               fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
+                                       g->symtab[i]->name.txt);
+               fprintf(f, "};\n\n");
+       }
+
  ### States and the goto tables.
  
  For each state we record the goto table and the reducible production if
@@ -1867,10 +1893,10 @@ to the appropriate type for each access.  All this is handling in
  
         static void gen_reduce(FILE *f, struct grammar *g, char *file)
         {
-               int i, j;
+               int i;
                 fprintf(f, "#line 0 \"gen_reduce\"\n");
                 fprintf(f, "static int do_reduce(int prod, int depth, void **body,\n");
-               fprintf(f, "                      void *ret, FILE *trace)\n");
+               fprintf(f, "                      void *ret)\n");
                 fprintf(f, "{\n");
                 fprintf(f, "\tint ret_size = 0;\n");
  
@@ -1882,21 +1908,6 @@ to the appropriate type for each access.  All this is handling in
                         if (p->code.txt)
                                 gen_code(p, f, g);
  
-                       fprintf(f, "\t\tif (trace) {\n");
-                       fprintf(f, "\t\t\tfprintf(trace, \"[%%2d]%.*s ->\", depth);\n",
-                               p->head->name.len, p->head->name.txt);
-                       for (j = 0; j < p->body_size; j++) {
-                               if (p->body[j]->type == Terminal) {
-                                       fputs("\t\t\tfputs(\" \", trace);\n", f);
-                                       fprintf(f, "\t\t\ttext_dump(trace, (*(struct token*)body[%d]).txt, 20);\n", j);
-                               } else {
-                                       fprintf(f, "\t\t\tfprintf(trace, \" %.*s\");\n",
-                                               p->body[j]->name.len,
-                                               p->body[j]->name.txt);
-                               }
-                       }
-                       fprintf(f, "\t\t}\n");
-
                         if (p->head->struct_name.txt)
                                 fprintf(f, "\t\tret_size = sizeof(struct %.*s);\n",
                                         p->head->struct_name.len,
@@ -2195,7 +2206,7 @@ The parser generator has nicely provided us with goto tables sorted by
  symbol number.  We need a binary search function to find a symbol in the
  table.
  
-###### parser
+###### parser functions
  
         static int search(const struct state *l, int sym)
         {
@@ -2239,7 +2250,7 @@ The `state` is the most important one and guides the parsing process.  The
  freeing function.  The symbol leads us to the right free function through
  `do_free`.
  
-###### parser
+###### parser functions
  
         struct parser {
                 int state;
@@ -2270,7 +2281,7 @@ function reports if it could.
  So `shift` finds the next state.  If that succeed it extends the allocations
  if needed and pushed all the information onto the stacks.
  
-###### parser
+###### parser functions
  
         static int shift(struct parser *p,
                          int sym, void *asn,
@@ -2299,7 +2310,7 @@ if needed and pushed all the information onto the stacks.
  and frees and `asn` entries that need to be freed.  It is called _after_ we
  reduce a production, just before we `shift` the nonterminal in.
  
-###### parser
+###### parser functions
  
         static void pop(struct parser *p, int num,
                         void(*do_free)(short sym, void *asn))
@@ -2323,7 +2334,7 @@ copying, hence `memdup` and `tokcopy`.
  ###### parser includes
         #include <memory.h>
  
-###### parser
+###### parser functions
  
         void *memdup(void *m, int len)
         {
@@ -2359,9 +2370,9 @@ We return whatever `asn` was returned by reducing production zero.
  ###### parser
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
-                        int (*do_reduce)(int, int, void**, void*, FILE*),
+                        int (*do_reduce)(int, int, void**, void*),
                          void (*do_free)(short, void*),
-                        FILE *trace)
+                        FILE *trace, const char *non_term[], int knowns)
         {
                 struct parser p = { 0 };
                 struct token *tk;
@@ -2370,12 +2381,10 @@ We return whatever `asn` was returned by reducing production zero.
  
                 tk = tok_copy(token_next(tokens));
                 while (!accepted) {
+                       if (trace)
+                               parser_trace(trace, &p, tk, states, non_term, knowns);
+
                         if (shift(&p, tk->num, tk, states)) {
-                               if (trace) {
-                                       fputs("Shift ", trace);
-                                       text_dump(trace, tk->txt, 20);
-                                       fputs("\n", trace);
-                               }
                                 tk = tok_copy(token_next(tokens));
                                 continue;
                         }
@@ -2391,9 +2400,7 @@ We return whatever `asn` was returned by reducing production zero.
                                         (p.tos - states[p.state].reduce_size);
  
                                 bufsize = do_reduce(prod, p.tos, body,
-                                                   buf, trace);
-                               if (trace)
-                                       fputs("\n", trace);
+                                                   buf);
  
                                 pop(&p, size, do_free);
                                 shift(&p, sym, memdup(buf, bufsize), states);
@@ -2408,7 +2415,7 @@ We return whatever `asn` was returned by reducing production zero.
                          * Then we discard input tokens until
                          * we find one that is acceptable.
                          */
-                       while (shift(&p, TK_error, tk, states) < 0
+                       while (shift(&p, TK_error, tk, states) == 0
                                && p.tos > 0)
                                 // discard this state
                                 pop(&p, 1, do_free);
@@ -2434,10 +2441,49 @@ We return whatever `asn` was returned by reducing production zero.
  ###### exported functions
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
-                        int (*do_reduce)(int, int, void**, void*, FILE*),
+                        int (*do_reduce)(int, int, void**, void*),
                          void (*do_free)(short, void*),
-                        FILE *trace);
+                        FILE *trace, const char *non_term[], int knowns);
+
+### Tracing
+
+Being able to visualize the parser in action can be invaluable when
+debugging the parser code, or trying to understand how the parse of a
+particular grammar progresses.  The stack contains all the important
+state, so just printing out the stack every time around the parse loop
+can make it possible to see exactly what is happening.
  
+This doesn't explicitly show each SHIFT and REDUCE action.  However they
+are easily deduced from the change between consecutive lines, and the
+details of each state can be found by cross referencing the states list
+in the stack with the "report" that parsergen can generate.
+
+For terminal symbols, we just dump the token.  For non-terminals we
+print the name of the symbol.  The look ahead token is reported at the
+end inside square brackets.
+
+###### parser functions
+
+       void parser_trace(FILE *trace, struct parser *p,
+                         struct token *tk, const struct state states[],
+                         const char *non_term[], int knowns)
+       {
+               int i;
+               for (i = 0; i < p->tos; i++) {
+                       int sym = p->stack[i].sym;
+                       fprintf(trace, "(%d) ", p->stack[i].state);
+                       if (sym < TK_reserved + knowns) {
+                               struct token *t = p->asn_stack[i];
+                               text_dump(trace, t->txt, 20);
+                       } else
+                               fputs(non_term[sym - TK_reserved - knowns],
+                                     trace);
+                       fputs(" ", trace);
+               }
+               fprintf(trace, "(%d) [", p->state);
+               text_dump(trace, tk->txt, 20);
+               fputs("]\n", trace);
+       }
  
  # A Worked Example
  
@@ -2523,23 +2569,23 @@ something like this.
         Session -> Session Line
                 | Line
  
-       Line -> Expression NEWLINE ${ gmp_printf( "Answer = %Qd\n", $1.val);
+       Line -> Expression NEWLINE ${ gmp_printf("Answer = %Qd\n", $1.val);
                                         { mpf_t fl; mpf_init2(fl, 20); mpf_set_q(fl, $1.val);
-                                       gmp_printf( "  or as a decimal: %Fg\n", fl);
+                                       gmp_printf("  or as a decimal: %Fg\n", fl);
                                         mpf_clear(fl);
                                         }
                                      }$
                 | Expression = Expression NEWLINE ${
                         if (mpq_equal($1.val, $3.val))
-                               gmp_printf( "Both equal %Qd\n", $1.val);
+                               gmp_printf("Both equal %Qd\n", $1.val);
                         else {
-                               gmp_printf( "NOT EQUAL: %Qd\n      != : %Qd\n",
+                               gmp_printf("NOT EQUAL: %Qd\n      != : %Qd\n",
                                         $1.val, $3.val);
                                 exit(1);
                         }
                 }$
-               |  NEWLINE ${ printf( "Blank line\n"); }$
-               | ERROR NEWLINE ${ printf( "Skipped a bad line\n"); }$
+               |  NEWLINE ${ printf("Blank line\n"); }$
+               | ERROR NEWLINE ${ printf("Skipped a bad line\n"); }$
  
         $number
         Expression -> Expression +  Term ${ mpq_init($0.val); mpq_add($0.val, $1.val, $3.val); }$