parergen: fix bug in deriving itemsets

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index e0a2167ef1eecf41936c4762d8935bd464190e86..a6486df26025ab252fd4fc446aea2c3dda02e9aa 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -561,6 +561,7 @@ Now we have all the bits we need to parse a full production.
         abort:
                 while (tk.num != TK_newline && tk.num != TK_eof)
                         tk = token_next(state);
+               free(p.body);
                 return err;
         }
  
@@ -695,7 +696,7 @@ used as a terminal anywhere that a terminal is expected.
                                         s->name.len, s->name.txt);
                         }
                         if (errs) {
-                               free(g);
+                               free(g); // FIXME free content
                                 g = NULL;
                         }
                 }
@@ -704,7 +705,7 @@ used as a terminal anywhere that a terminal is expected.
                 fprintf(stderr, "Error at line %d: %s\n",
                         tk.line, err);
                 token_close(state);
-               free(g);
+               free(g); // FIXME free content
                 return NULL;
         }
  
@@ -1071,6 +1072,9 @@ and we find the set of possible "first" symbols after there.  This is
  done using `add_first` above and only needs to be done once as the
  "first" sets are now stable and will not change.
  
+###### grammar fields
+       struct symset *follow;
+
  ###### follow code
  
         for (p = 0; p < g->production_count; p++) {
@@ -1097,10 +1101,10 @@ combine these two functions into a single loop.
  
  ###### follow code
  
-       for (again = 0, p = 0;
+       for (check_again = 0, p = 0;
              p < g->production_count;
              p < g->production_count-1
-               ? p++ : again ? (again = 0, p = 0)
+               ? p++ : check_again ? (check_again = 0, p = 0)
                               : p++) {
                 struct production *pr = g->productions[p];
                 int b;
@@ -1111,7 +1115,7 @@ combine these two functions into a single loop.
                                 break;
                         if (symset_union(&g->follow[bs->num],
                                          &g->follow[pr->head->num]))
-                               again = 1;
+                               check_again = 1;
                         if (!bs->nullable)
                                 break;
                 }
@@ -1120,13 +1124,10 @@ combine these two functions into a single loop.
  We now just need to create and initialise the `follow` list to get a
  complete function.
  
-###### grammar fields
-       struct symset *follow;
-
  ###### functions
         static void build_follow(struct grammar *g)
         {
-               int s, again, p;
+               int s, check_again, p;
                 g->follow = calloc(g->num_syms, sizeof(g->follow[0]));
                 for (s = 0; s < g->num_syms; s++)
                         g->follow[s] = INIT_SYMSET;
@@ -1407,7 +1408,7 @@ so the item is ineffective.
                         continue;
                 if (s->line_like)
                         is->starts_line = 1;
-               again = 1;
+               check_again = 1;
                 if (type >= LALR) {
                         // Need the LA set.
                         int to_end;
@@ -1487,18 +1488,21 @@ itemsets (or merged with a pre-existing itemset).
                                 continue;
                         if (pr->body[bp] != sym)
                                 continue;
+
+                       bp += 1;
                         if (type >= LALR)
                                 la = is->items.data[j];
-                       pos = symset_find(&newitemset, pr->head->num);
-                       if (bp + 1 == pr->body_size &&
+                       if (bp == pr->body_size &&
                             pr->precedence > 0 &&
                             pr->precedence > precedence) {
                                 // new itemset is reducible and has a precedence.
                                 precedence = pr->precedence;
                                 assoc = pr->assoc;
                         }
+                       pos = symset_find(&newitemset, item_num(p, bp));
+
                         if (pos < 0)
-                               symset_add(&newitemset, item_num(p, bp+1), la);
+                               symset_add(&newitemset, item_num(p, bp), la);
                         else if (type >= LALR) {
                                 // Need to merge la set.
                                 int la2 = newitemset.data[pos];
@@ -1527,7 +1531,7 @@ with `TK_eof` as the LA set.
         {
                 struct symset first = INIT_SYMSET;
                 struct itemset *is;
-               int again;
+               int check_again;
                 unsigned short la = 0;
                 if (type >= LALR) {
                         // LA set just has eof
@@ -1539,15 +1543,15 @@ with `TK_eof` as the LA set.
                 // production 0, offset 0 (with no data)
                 symset_add(&first, item_num(0, 0), la);
                 add_itemset(g, first, Non, 0, type);
-               for (again = 0, is = g->items;
+               for (check_again = 0, is = g->items;
                      is;
-                    is = is->next ?: again ? (again = 0, g->items) : NULL) {
+                    is = is->next ?: check_again ? (check_again = 0, g->items) : NULL) {
                         int i;
                         struct symset done = INIT_SYMSET;
                         if (is->completed)
                                 continue;
                         is->completed = 1;
-                       again = 1;
+                       check_again = 1;
                         ## complete itemset
                         ## derive itemsets
                         symset_free(done);
@@ -2072,8 +2076,6 @@ The go to table is stored in a simple array of `sym` and corresponding
                 }
         }
  
-###### functions
-
         static void gen_states(FILE *f, struct grammar *g)
         {
                 int i;
@@ -2205,7 +2207,7 @@ transformed, and will cause an error when the code is compiled.
                         c = *name;
                 }
                 if (namlen == 0) {
-                       if (name == *namep)
+                       if (name == *namep || n > p->body_size)
                                 return -1;
                         *namep = name;
                         return n;
@@ -2214,8 +2216,10 @@ transformed, and will cause an error when the code is compiled.
                 for (i = 0; i < p->body_size; i++) {
                         if (!subseq_match(nam, namlen, p->body[i]->name))
                                 continue;
-                       if (slen == 0 || p->body[i]->name.len < slen)
+                       if (slen == 0 || p->body[i]->name.len < slen) {
                                 s = i;
+                               slen = p->body[i]->name.len;
+                       }
                         if (s >= 0 && p->body[i] != p->body[s] &&
                             p->body[i]->name.len == p->body[s]->name.len)
                                 /* not unique, so s cannot be used */
@@ -2223,7 +2227,7 @@ transformed, and will cause an error when the code is compiled.
                 }
                 if (s < 0)
                         return -1;
-               if (n == 0);
+               if (n == 0)
                         n = 1;
                 for (i = 0; i < p->body_size; i++)
                         if (p->body[i] == p->body[s]) {
@@ -2231,7 +2235,7 @@ transformed, and will cause an error when the code is compiled.
                                 if (n == 0)
                                         break;
                         }
-               if (n > 1)
+               if (n > 0 || i == p->body_size)
                         return -1;
                 *namep = name;
                 return i + 1;
@@ -2302,15 +2306,15 @@ transformed, and will cause an error when the code is compiled.
  ###### functions
  
         static void gen_reduce(FILE *f, struct grammar *g, char *file,
-                              struct code_node *code)
+                              struct code_node *pre_reduce)
         {
                 int i;
                 fprintf(f, "#line 1 \"gen_reduce\"\n");
                 fprintf(f, "static int do_reduce(int prod, void **body, struct token_config *config, void *ret)\n");
                 fprintf(f, "{\n");
                 fprintf(f, "\tint ret_size = 0;\n");
-               if (code)
-                       code_node_print(f, code, file);
+               if (pre_reduce)
+                       code_node_print(f, pre_reduce, file);
  
                 fprintf(f, "#line 4 \"gen_reduce\"\n");
                 fprintf(f, "\tswitch(prod) {\n");
@@ -2445,7 +2449,7 @@ grammar file).
                 case 't':
                         tag = optarg; break;
                 default:
-                       fprintf(stderr, "Usage: parsergen ...\n");
+                       fprintf(stderr, "Usage: parsergen -[05SL1R] [-t tag] [-o output] input\n");
                         exit(1);
                 }
         }
@@ -2661,6 +2665,35 @@ table.
                         return -1;
         }
  
+### Memory allocation
+
+The `scanner` returns tokens in a local variable - we want them in allocated
+memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
+a reduce is in a large buffer.  Both of these require some allocation and
+copying, hence `memdup` and `tok_copy`.
+
+###### parser includes
+       #include <memory.h>
+
+###### parser functions
+
+       void *memdup(void *m, int len)
+       {
+               void *ret;
+               if (len == 0)
+                       return NULL;
+               ret = malloc(len);
+               memcpy(ret, m, len);
+               return ret;
+       }
+
+       static struct token *tok_copy(struct token tk)
+       {
+               struct token *new = malloc(sizeof(*new));
+               *new = tk;
+               return new;
+       }
+
  ### The state stack.
  
  The core data structure for the parser is the stack.  This tracks all
@@ -2833,35 +2866,6 @@ before we `shift` the nonterminal in.
                 return indents;
         }
  
-### Memory allocation
-
-The `scanner` returns tokens in a local variable - we want them in allocated
-memory so they can live in the `asn_stack`.  Similarly the `asn` produced by
-a reduce is in a large buffer.  Both of these require some allocation and
-copying, hence `memdup` and `tokcopy`.
-
-###### parser includes
-       #include <memory.h>
-
-###### parser functions
-
-       void *memdup(void *m, int len)
-       {
-               void *ret;
-               if (len == 0)
-                       return NULL;
-               ret = malloc(len);
-               memcpy(ret, m, len);
-               return ret;
-       }
-
-       static struct token *tok_copy(struct token tk)
-       {
-               struct token *new = malloc(sizeof(*new));
-               *new = tk;
-               return new;
-       }
-
  ### The heart of the parser.
  
  Now we have the parser.  For each token we might shift it, trigger a