]> ocean-lang.org Git - ocean/blobdiff - csrc/parsergen.mdc
parsergen.mdc: remove tracking of left-recursive symbols.
[ocean] / csrc / parsergen.mdc
index 66a41b5f5a297abd1a90510c23d1d40e5e89e676..2da1d65bb7b2f962af024ef6b7d7f20a9e2e1f3e 100644 (file)
@@ -169,17 +169,18 @@ table of known symbols and the resulting parser will report them as
 different token types that `scanner` can report.
 
 ###### declarations
 different token types that `scanner` can report.
 
 ###### declarations
-       static char *reserved_words[] = {
-               [TK_error]        = "ERROR",
-               [TK_number]       = "NUMBER",
-               [TK_ident]        = "IDENTIFIER",
-               [TK_mark]         = "MARK",
-               [TK_string]       = "STRING",
-               [TK_multi_string] = "MULTI_STRING",
-               [TK_in]           = "IN",
-               [TK_out]          = "OUT",
-               [TK_newline]      = "NEWLINE",
-               [TK_eof]          = "$eof",
+
+       static struct { int num; char *name; } reserved_words[] = {
+               { TK_error,        "ERROR" },
+               { TK_number,       "NUMBER" },
+               { TK_ident,        "IDENTIFIER" },
+               { TK_mark,         "MARK" },
+               { TK_string,       "STRING" },
+               { TK_multi_string, "MULTI_STRING" },
+               { TK_in,           "IN" },
+               { TK_out,          "OUT" },
+               { TK_newline,      "NEWLINE" },
+               { TK_eof,          "$eof" },
        };
 ###### symbol fields
        short num;
        };
 ###### symbol fields
        short num;
@@ -235,13 +236,11 @@ symbol, but its type might be `Unknown`.
                for (i = 0; i < entries; i++) {
                        struct text t;
                        struct symbol *s;
                for (i = 0; i < entries; i++) {
                        struct text t;
                        struct symbol *s;
-                       t.txt = reserved_words[i];
-                       if (!t.txt)
-                               continue;
+                       t.txt = reserved_words[i].name;
                        t.len = strlen(t.txt);
                        s = sym_find(g, t);
                        s->type = Terminal;
                        t.len = strlen(t.txt);
                        s = sym_find(g, t);
                        s->type = Terminal;
-                       s->num = i;
+                       s->num = reserved_words[i].num;
                }
        }
 
                }
        }
 
@@ -408,29 +407,15 @@ be in one `code_node` of the literate code.  The `}$` must be
 at the end of a line.
 
 Text in the code fragment will undergo substitutions where `$N` or
 at the end of a line.
 
 Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc.  The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
-
-Symbols that are left-recursive are a little special.  These are symbols
-that both the head of a production and the first body symbol of the same
-production.  They are problematic when they appear in other productions
-elsewhere than at the end, and when indenting is used to describe
-structure.  In this case there is no way for a smaller indent to ensure
-the left-recursive symbol cannot be extended.  When it appears at the
-end of a production, that production can be reduced to ensure the symbol
-isn't extended.  So we record left-recursive symbols while reading the
-grammar, and produce a warning when reporting the grammar if they are
-found in an unsuitable place.
-
-A symbol that is only left recursive in a production where it is
-followed by newline does not cause these problems because the newline
-will effectively terminate it.
+`$<N`,for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production.  `$0` is the head of the
+production, `$1` is the first symbol of the body, etc.  The type of `$N`
+for a terminal symbol is `struct token`.  For a non-terminal, it is
+whatever has been declared for that symbol.  The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed.  The effect of using '<' is that the
+variable is cleareed to all-zeros.
 
 While building productions we will need to add to an array which needs to
 grow dynamically.
 
 While building productions we will need to add to an array which needs to
 grow dynamically.
@@ -492,7 +477,6 @@ Now we have all the bits we need to parse a full production.
 
 ###### symbol fields
        int first_production;
 
 ###### symbol fields
        int first_production;
-       int left_recursive;
 
 ###### functions
        static char *parse_production(struct grammar *g,
 
 ###### functions
        static char *parse_production(struct grammar *g,
@@ -554,10 +538,6 @@ Now we have all the bits we need to parse a full production.
                        }
                        tk = token_next(state);
                }
                        }
                        tk = token_next(state);
                }
-               if (p.body_size >= 2 &&
-                   p.body[0] == p.head &&
-                   p.body[1]->num != TK_newline)
-                       p.head->left_recursive = 1;
 
                if (tk.num != TK_newline && tk.num != TK_eof) {
                        err = "stray tokens at end of line";
 
                if (tk.num != TK_newline && tk.num != TK_eof) {
                        err = "stray tokens at end of line";
@@ -1167,15 +1147,15 @@ the end of the list in the symset, and then only compare items before
 the first "0".
 
 ###### declarations
 the first "0".
 
 ###### declarations
-       static inline unsigned short item_num(int production, int index)
+       static inline unsigned short item_num(int production, int dot)
        {
        {
-               return production | ((31-index) << 11);
+               return production | ((31-dot) << 11);
        }
        static inline int item_prod(unsigned short item)
        {
                return item & 0x7ff;
        }
        }
        static inline int item_prod(unsigned short item)
        {
                return item & 0x7ff;
        }
-       static inline int item_index(unsigned short item)
+       static inline int item_dot(unsigned short item)
        {
                return (31-(item >> 11)) & 0x1f;
        }
        {
                return (31-(item >> 11)) & 0x1f;
        }
@@ -1193,8 +1173,8 @@ can just compare the symset and the data values together.
 
                for (i = 0;
                     i < a.cnt && i < b.cnt &&
 
                for (i = 0;
                     i < a.cnt && i < b.cnt &&
-                    item_index(a.syms[i]) > 0 &&
-                    item_index(b.syms[i]) > 0;
+                    item_dot(a.syms[i]) > 0 &&
+                    item_dot(b.syms[i]) > 0;
                     i++) {
                        int diff = a.syms[i] - b.syms[i];
                        if (diff)
                     i++) {
                        int diff = a.syms[i] - b.syms[i];
                        if (diff)
@@ -1205,11 +1185,11 @@ can just compare the symset and the data values together.
                                        return diff;
                        }
                }
                                        return diff;
                        }
                }
-               if (i == a.cnt || item_index(a.syms[i]) == 0)
+               if (i == a.cnt || item_dot(a.syms[i]) == 0)
                        av = -1;
                else
                        av = a.syms[i];
                        av = -1;
                else
                        av = a.syms[i];
-               if (i == b.cnt || item_index(b.syms[i]) == 0)
+               if (i == b.cnt || item_dot(b.syms[i]) == 0)
                        bv = -1;
                else
                        bv = b.syms[i];
                        bv = -1;
                else
                        bv = b.syms[i];
@@ -1365,7 +1345,7 @@ into the go to set, so the item is ineffective.
 ###### complete itemset
        for (i = 0; i < is->items.cnt; i++) {
                int p = item_prod(is->items.syms[i]);
 ###### complete itemset
        for (i = 0; i < is->items.cnt; i++) {
                int p = item_prod(is->items.syms[i]);
-               int bs = item_index(is->items.syms[i]);
+               int bs = item_dot(is->items.syms[i]);
                struct production *pr = g->productions[p];
                int p2;
                struct symbol *s;
                struct production *pr = g->productions[p];
                int p2;
                struct symbol *s;
@@ -1470,7 +1450,7 @@ with a pre-existing itemset).
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
-                       int bp = item_index(itm);
+                       int bp = item_dot(itm);
                        struct production *pr = g->productions[p];
                        unsigned short la = 0;
                        int pos;
                        struct production *pr = g->productions[p];
                        unsigned short la = 0;
                        int pos;
@@ -1622,7 +1602,7 @@ all the tables that have been generated, plus a description of any conflicts.
                if (g->follow)
                        report_follow(g);
                report_itemsets(g);
                if (g->follow)
                        report_follow(g);
                report_itemsets(g);
-               return report_conflicts(g, type) + report_left_recursive(g);
+               return report_conflicts(g, type);
        }
 
 Firstly we have the complete list of symbols, together with the
        }
 
 Firstly we have the complete list of symbols, together with the
@@ -1695,7 +1675,7 @@ it up a bit.  First the items, with production number and associativity.
        static void report_item(struct grammar *g, int itm)
        {
                int p = item_prod(itm);
        static void report_item(struct grammar *g, int itm)
        {
                int p = item_prod(itm);
-               int dot = item_index(itm);
+               int dot = item_dot(itm);
                struct production *pr = g->productions[p];
                int i;
 
                struct production *pr = g->productions[p];
                int i;
 
@@ -1824,7 +1804,7 @@ as shifts always over-ride reductions.
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp == pr->body_size) {
                                struct production *pr = g->productions[p];
 
                                if (bp == pr->body_size) {
@@ -1881,7 +1861,7 @@ but handled internally.
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
                                struct symbol *s;
 
                                struct production *pr = g->productions[p];
                                struct symbol *s;
 
@@ -1902,7 +1882,7 @@ but handled internally.
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -1944,42 +1924,6 @@ but handled internally.
        }
 
 
        }
 
 
-### Reporting non-final left-recursive symbols.
-
-Left recursive symbols are a problem for parses that honour indentation
-when they appear other than at the end of the production.  So we need to
-report these when asked.
-
-###### functions
-
-       static int report_left_recursive(struct grammar *g)
-       {
-               int p;
-               int bad_left_recursive = 0;
-
-               for (p = 0; p < g->production_count; p++) {
-                       struct production *pr = g->productions[p];
-                       int sn;
-
-                       for (sn = 0; sn < pr->body_size - 1; sn++) {
-                               struct symbol *s = pr->body[sn];
-
-                               if (s->left_recursive == 1 &&
-                                   s != pr->head) {
-                                       if (!bad_left_recursive) {
-                                               bad_left_recursive = 1;
-                                               printf("Misplaced left recursive symbols.\n");
-                                       }
-                                       printf("  ");
-                                       prtxt(s->name);
-                                       printf(" in production [%d]\n", p);
-                                       s->left_recursive = 2;
-                               }
-                       }
-               }
-               return bad_left_recursive;
-       }
-
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
@@ -2117,7 +2061,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -2169,13 +2113,105 @@ structure returned by a previous reduction.  These pointers need to be cast
 to the appropriate type for each access.  All this is handled in
 `gen_code`.
 
 to the appropriate type for each access.  All this is handled in
 `gen_code`.
 
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed.  This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'.  This is particularly useful for references (or pointers), but
+can be used with structures too.  The `<` implies that the value it
+being moved out, so the object will not be automatically freed.  It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number.  A number by itself (other than zero) selects a
+symbol from the body of the production.  A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order.  If a number follows the letters, then a later occurrence of
+that symbol is chosen.  So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`.  If there is no unique shortest system, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
 
 ###### functions
 
 
 ###### functions
 
+       static int textchr(struct text t, char c, int s)
+       {
+               int i;
+               for (i = s; i < t.len; i++)
+                       if (t.txt[i] == c)
+                               return i;
+               return -1;
+       }
+
+       static int subseq_match(char *seq, int slen, struct text name)
+       {
+               int st = 0;
+               while (slen > 0) {
+                       st = textchr(name, *seq, st);
+                       if (st < 0)
+                               return 0;
+                       slen -= 1;
+                       seq += 1;
+                       st += 1;
+               }
+               return 1;
+       }
+
+       static int choose_sym(char **namep, int len, struct production *p)
+       {
+               char *name = *namep;
+               char *nam = name;
+               int namlen;
+               int n = 0;
+               int i, s, slen;
+               char c;
+
+               c = *name;
+               while (len > 0 &&
+                      ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
+                       name += 1;
+                       len -= 1;
+                       c = *name;
+               }
+               namlen = name-nam;
+               while (len > 0 && (c >= '0' && c <= '9')) {
+                       name += 1;
+                       len -= 1;
+                       n = n * 10 + (c - '0');
+                       c = *name;
+               }
+               if (namlen == 0) {
+                       if (name == *namep)
+                               return -1;
+                       *namep = name;
+                       return n;
+               }
+               slen = 0; s = -1;
+               for (i = 0; i < p->body_size; i++) {
+                       if (!subseq_match(nam, namlen, p->body[i]->name))
+                               continue;
+                       if (slen == 0 || p->body[i]->name.len < slen)
+                               s = i;
+                       if (s >= 0 && p->body[i] != p->body[s] &&
+                           p->body[i]->name.len == p->body[s]->name.len)
+                               /* not unique, so s cannot be used */
+                               s = -1;
+               }
+               if (s < 0)
+                       return -1;
+               if (n == 0);
+                       n = 1;
+               for (i = 0; i < p->body_size; i++)
+                       if (p->body[i] == p->body[s]) {
+                               n -= 1;
+                               if (n == 0)
+                                       break;
+                       }
+               if (n > 1)
+                       return -1;
+               *namep = name;
+               return i + 1;
+       }
+
        static void gen_code(struct production *p, FILE *f, struct grammar *g)
        {
                char *c;
        static void gen_code(struct production *p, FILE *f, struct grammar *g)
        {
                char *c;
@@ -2197,24 +2233,19 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                use = 1;
                                c++;
                        }
                                use = 1;
                                c++;
                        }
-                       if (*c < '0' || *c > '9') {
+                       n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+                       if (n < 0) {
+                               fputc('$', f);
                                if (use)
                                        fputc('<', f);
                                fputc(*c, f);
                                continue;
                        }
                                if (use)
                                        fputc('<', f);
                                fputc(*c, f);
                                continue;
                        }
-                       n = *c - '0';
-                       while (c[1] >= '0' && c[1] <= '9') {
-                               c += 1;
-                               n = n * 10 + *c - '0';
-                       }
                        if (n == 0)
                                fprintf(f, "(*(struct %.*s*%s)ret)",
                                        p->head->struct_name.len,
                                        p->head->struct_name.txt,
                                        p->head->isref ? "*":"");
                        if (n == 0)
                                fprintf(f, "(*(struct %.*s*%s)ret)",
                                        p->head->struct_name.len,
                                        p->head->struct_name.txt,
                                        p->head->isref ? "*":"");
-                       else if (n > p->body_size)
-                               fprintf(f, "$%d", n);
                        else if (p->body[n-1]->type == Terminal)
                                fprintf(f, "(*(struct token *)body[%d])",
                                        n-1);
                        else if (p->body[n-1]->type == Terminal)
                                fprintf(f, "(*(struct token *)body[%d])",
                                        n-1);
@@ -2227,6 +2258,7 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
                                        p->body[n-1]->isref ? "*":"", n-1);
                                used[n-1] = use;
                        }
                                        p->body[n-1]->isref ? "*":"", n-1);
                                used[n-1] = use;
                        }
+                       c -= 1;
                }
                fputs("\n", f);
                for (i = 0; i < p->body_size; i++) {
                }
                fputs("\n", f);
                for (i = 0; i < p->body_size; i++) {
@@ -2902,7 +2934,7 @@ checks if a given token is in any of these look-ahead sets.
                void *ret = NULL;
 
                shift(&p, TK_eof, 0, 1, NULL, states);
                void *ret = NULL;
 
                shift(&p, TK_eof, 0, 1, NULL, states);
-               while (!accepted) {
+               while (!accepted && p.tos > 0) {
                        struct token *err_tk;
                        struct frame *tos = &p.stack[p.tos-1];
                        if (!tk)
                        struct token *err_tk;
                        struct frame *tos = &p.stack[p.tos-1];
                        if (!tk)