]> ocean-lang.org Git - ocean/blobdiff - csrc/parsergen.mdc
parsergen: change how reserved_words are stored
[ocean] / csrc / parsergen.mdc
index 66a41b5f5a297abd1a90510c23d1d40e5e89e676..da9335ed0df1794fe49c35030272e333956259a3 100644 (file)
@@ -169,17 +169,18 @@ table of known symbols and the resulting parser will report them as
 different token types that `scanner` can report.
 
 ###### declarations
-       static char *reserved_words[] = {
-               [TK_error]        = "ERROR",
-               [TK_number]       = "NUMBER",
-               [TK_ident]        = "IDENTIFIER",
-               [TK_mark]         = "MARK",
-               [TK_string]       = "STRING",
-               [TK_multi_string] = "MULTI_STRING",
-               [TK_in]           = "IN",
-               [TK_out]          = "OUT",
-               [TK_newline]      = "NEWLINE",
-               [TK_eof]          = "$eof",
+
+       static struct { int num; char *name; } reserved_words[] = {
+               { TK_error,        "ERROR" },
+               { TK_number,       "NUMBER" },
+               { TK_ident,        "IDENTIFIER" },
+               { TK_mark,         "MARK" },
+               { TK_string,       "STRING" },
+               { TK_multi_string, "MULTI_STRING" },
+               { TK_in,           "IN" },
+               { TK_out,          "OUT" },
+               { TK_newline,      "NEWLINE" },
+               { TK_eof,          "$eof" },
        };
 ###### symbol fields
        short num;
@@ -235,13 +236,11 @@ symbol, but its type might be `Unknown`.
                for (i = 0; i < entries; i++) {
                        struct text t;
                        struct symbol *s;
-                       t.txt = reserved_words[i];
-                       if (!t.txt)
-                               continue;
+                       t.txt = reserved_words[i].name;
                        t.len = strlen(t.txt);
                        s = sym_find(g, t);
                        s->type = Terminal;
-                       s->num = i;
+                       s->num = reserved_words[i].num;
                }
        }
 
@@ -408,14 +407,15 @@ be in one `code_node` of the literate code.  The `}$` must be
 at the end of a line.
 
 Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc.  The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
+`$<N`,for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production.  `$0` is the head of the
+production, `$1` is the first symbol of the body, etc.  The type of `$N`
+for a terminal symbol is `struct token`.  For a non-terminal, it is
+whatever has been declared for that symbol.  The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed.  The effect of using '<' is that the
+variable is cleareed to all-zeros.
 
 Symbols that are left-recursive are a little special.  These are symbols
 that both the head of a production and the first body symbol of the same
@@ -1167,15 +1167,15 @@ the end of the list in the symset, and then only compare items before
 the first "0".
 
 ###### declarations
-       static inline unsigned short item_num(int production, int index)
+       static inline unsigned short item_num(int production, int dot)
        {
-               return production | ((31-index) << 11);
+               return production | ((31-dot) << 11);
        }
        static inline int item_prod(unsigned short item)
        {
                return item & 0x7ff;
        }
-       static inline int item_index(unsigned short item)
+       static inline int item_dot(unsigned short item)
        {
                return (31-(item >> 11)) & 0x1f;
        }
@@ -1193,8 +1193,8 @@ can just compare the symset and the data values together.
 
                for (i = 0;
                     i < a.cnt && i < b.cnt &&
-                    item_index(a.syms[i]) > 0 &&
-                    item_index(b.syms[i]) > 0;
+                    item_dot(a.syms[i]) > 0 &&
+                    item_dot(b.syms[i]) > 0;
                     i++) {
                        int diff = a.syms[i] - b.syms[i];
                        if (diff)
@@ -1205,11 +1205,11 @@ can just compare the symset and the data values together.
                                        return diff;
                        }
                }
-               if (i == a.cnt || item_index(a.syms[i]) == 0)
+               if (i == a.cnt || item_dot(a.syms[i]) == 0)
                        av = -1;
                else
                        av = a.syms[i];
-               if (i == b.cnt || item_index(b.syms[i]) == 0)
+               if (i == b.cnt || item_dot(b.syms[i]) == 0)
                        bv = -1;
                else
                        bv = b.syms[i];
@@ -1365,7 +1365,7 @@ into the go to set, so the item is ineffective.
 ###### complete itemset
        for (i = 0; i < is->items.cnt; i++) {
                int p = item_prod(is->items.syms[i]);
-               int bs = item_index(is->items.syms[i]);
+               int bs = item_dot(is->items.syms[i]);
                struct production *pr = g->productions[p];
                int p2;
                struct symbol *s;
@@ -1470,7 +1470,7 @@ with a pre-existing itemset).
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
-                       int bp = item_index(itm);
+                       int bp = item_dot(itm);
                        struct production *pr = g->productions[p];
                        unsigned short la = 0;
                        int pos;
@@ -1695,7 +1695,7 @@ it up a bit.  First the items, with production number and associativity.
        static void report_item(struct grammar *g, int itm)
        {
                int p = item_prod(itm);
-               int dot = item_index(itm);
+               int dot = item_dot(itm);
                struct production *pr = g->productions[p];
                int i;
 
@@ -1824,7 +1824,7 @@ as shifts always over-ride reductions.
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp == pr->body_size) {
@@ -1881,7 +1881,7 @@ but handled internally.
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
                                struct symbol *s;
 
@@ -1902,7 +1902,7 @@ but handled internally.
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -2117,7 +2117,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -2169,13 +2169,105 @@ structure returned by a previous reduction.  These pointers need to be cast
 to the appropriate type for each access.  All this is handled in
 `gen_code`.
 
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed.  This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'.  This is particularly useful for references (or pointers), but
+can be used with structures too.  The `<` implies that the value it
+being moved out, so the object will not be automatically freed.  It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number.  A number by itself (other than zero) selects a
+symbol from the body of the production.  A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order.  If a number follows the letters, then a later occurrence of
+that symbol is chosen.  So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`.  If there is no unique shortest system, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
 
 ###### functions
 
+       static int textchr(struct text t, char c, int s)
+       {
+               int i;
+               for (i = s; i < t.len; i++)
+                       if (t.txt[i] == c)
+                               return i;
+               return -1;
+       }
+
+       static int subseq_match(char *seq, int slen, struct text name)
+       {
+               int st = 0;
+               while (slen > 0) {
+                       st = textchr(name, *seq, st);
+                       if (st < 0)
+                               return 0;
+                       slen -= 1;
+                       seq += 1;
+                       st += 1;
+               }
+               return 1;
+       }
+
+       static int choose_sym(char **namep, int len, struct production *p)
+       {
+               char *name = *namep;
+               char *nam = name;
+               int namlen;
+               int n = 0;
+               int i, s, slen;
+               char c;
+
+               c = *name;
+               while (len > 0 &&
+                      ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
+                       name += 1;
+                       len -= 1;
+                       c = *name;
+               }
+               namlen = name-nam;
+               while (len > 0 && (c >= '0' && c <= '9')) {
+                       name += 1;
+                       len -= 1;
+                       n = n * 10 + (c - '0');
+                       c = *name;
+               }
+               if (namlen == 0) {
+                       if (name == *namep)
+                               return -1;
+                       *namep = name;
+                       return n;
+               }
+               slen = 0; s = -1;
+               for (i = 0; i < p->body_size; i++) {
+                       if (!subseq_match(nam, namlen, p->body[i]->name))
+                               continue;
+                       if (slen == 0 || p->body[i]->name.len < slen)
+                               s = i;
+                       if (s >= 0 && p->body[i] != p->body[s] &&
+                           p->body[i]->name.len == p->body[s]->name.len)
+                               /* not unique, so s cannot be used */
+                               s = -1;
+               }
+               if (s < 0)
+                       return -1;
+               if (n == 0);
+                       n = 1;
+               for (i = 0; i < p->body_size; i++)
+                       if (p->body[i] == p->body[s]) {
+                               n -= 1;
+                               if (n == 0)
+                                       break;
+                       }
+               if (n > 1)
+                       return -1;
+               *namep = name;
+               return i + 1;
+       }
+
        static void gen_code(struct production *p, FILE *f, struct grammar *g)
        {
                char *c;
@@ -2197,24 +2289,19 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                use = 1;
                                c++;
                        }
-                       if (*c < '0' || *c > '9') {
+                       n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+                       if (n < 0) {
+                               fputc('$', f);
                                if (use)
                                        fputc('<', f);
                                fputc(*c, f);
                                continue;
                        }
-                       n = *c - '0';
-                       while (c[1] >= '0' && c[1] <= '9') {
-                               c += 1;
-                               n = n * 10 + *c - '0';
-                       }
                        if (n == 0)
                                fprintf(f, "(*(struct %.*s*%s)ret)",
                                        p->head->struct_name.len,
                                        p->head->struct_name.txt,
                                        p->head->isref ? "*":"");
-                       else if (n > p->body_size)
-                               fprintf(f, "$%d", n);
                        else if (p->body[n-1]->type == Terminal)
                                fprintf(f, "(*(struct token *)body[%d])",
                                        n-1);
@@ -2227,6 +2314,7 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                        p->body[n-1]->isref ? "*":"", n-1);
                                used[n-1] = use;
                        }
+                       c -= 1;
                }
                fputs("\n", f);
                for (i = 0; i < p->body_size; i++) {