]> ocean-lang.org Git - ocean/blobdiff - csrc/parsergen.mdc
parsergen.mdc: remove tracking of left-recursive symbols.
[ocean] / csrc / parsergen.mdc
index 742996e16222854cecedd235f9449987cc6c5e08..2da1d65bb7b2f962af024ef6b7d7f20a9e2e1f3e 100644 (file)
@@ -169,17 +169,18 @@ table of known symbols and the resulting parser will report them as
 different token types that `scanner` can report.
 
 ###### declarations
-       static char *reserved_words[] = {
-               [TK_error]        = "ERROR",
-               [TK_number]       = "NUMBER",
-               [TK_ident]        = "IDENTIFIER",
-               [TK_mark]         = "MARK",
-               [TK_string]       = "STRING",
-               [TK_multi_string] = "MULTI_STRING",
-               [TK_in]           = "IN",
-               [TK_out]          = "OUT",
-               [TK_newline]      = "NEWLINE",
-               [TK_eof]          = "$eof",
+
+       static struct { int num; char *name; } reserved_words[] = {
+               { TK_error,        "ERROR" },
+               { TK_number,       "NUMBER" },
+               { TK_ident,        "IDENTIFIER" },
+               { TK_mark,         "MARK" },
+               { TK_string,       "STRING" },
+               { TK_multi_string, "MULTI_STRING" },
+               { TK_in,           "IN" },
+               { TK_out,          "OUT" },
+               { TK_newline,      "NEWLINE" },
+               { TK_eof,          "$eof" },
        };
 ###### symbol fields
        short num;
@@ -235,13 +236,11 @@ symbol, but its type might be `Unknown`.
                for (i = 0; i < entries; i++) {
                        struct text t;
                        struct symbol *s;
-                       t.txt = reserved_words[i];
-                       if (!t.txt)
-                               continue;
+                       t.txt = reserved_words[i].name;
                        t.len = strlen(t.txt);
                        s = sym_find(g, t);
                        s->type = Terminal;
-                       s->num = i;
+                       s->num = reserved_words[i].num;
                }
        }
 
@@ -418,21 +417,6 @@ means that the value (usually a reference) is being moved out, so it
 will not automatically be freed.  The effect of using '<' is that the
 variable is cleareed to all-zeros.
 
-Symbols that are left-recursive are a little special.  These are symbols
-that both the head of a production and the first body symbol of the same
-production.  They are problematic when they appear in other productions
-elsewhere than at the end, and when indenting is used to describe
-structure.  In this case there is no way for a smaller indent to ensure
-the left-recursive symbol cannot be extended.  When it appears at the
-end of a production, that production can be reduced to ensure the symbol
-isn't extended.  So we record left-recursive symbols while reading the
-grammar, and produce a warning when reporting the grammar if they are
-found in an unsuitable place.
-
-A symbol that is only left recursive in a production where it is
-followed by newline does not cause these problems because the newline
-will effectively terminate it.
-
 While building productions we will need to add to an array which needs to
 grow dynamically.
 
@@ -493,7 +477,6 @@ Now we have all the bits we need to parse a full production.
 
 ###### symbol fields
        int first_production;
-       int left_recursive;
 
 ###### functions
        static char *parse_production(struct grammar *g,
@@ -555,10 +538,6 @@ Now we have all the bits we need to parse a full production.
                        }
                        tk = token_next(state);
                }
-               if (p.body_size >= 2 &&
-                   p.body[0] == p.head &&
-                   p.body[1]->num != TK_newline)
-                       p.head->left_recursive = 1;
 
                if (tk.num != TK_newline && tk.num != TK_eof) {
                        err = "stray tokens at end of line";
@@ -1168,15 +1147,15 @@ the end of the list in the symset, and then only compare items before
 the first "0".
 
 ###### declarations
-       static inline unsigned short item_num(int production, int index)
+       static inline unsigned short item_num(int production, int dot)
        {
-               return production | ((31-index) << 11);
+               return production | ((31-dot) << 11);
        }
        static inline int item_prod(unsigned short item)
        {
                return item & 0x7ff;
        }
-       static inline int item_index(unsigned short item)
+       static inline int item_dot(unsigned short item)
        {
                return (31-(item >> 11)) & 0x1f;
        }
@@ -1194,8 +1173,8 @@ can just compare the symset and the data values together.
 
                for (i = 0;
                     i < a.cnt && i < b.cnt &&
-                    item_index(a.syms[i]) > 0 &&
-                    item_index(b.syms[i]) > 0;
+                    item_dot(a.syms[i]) > 0 &&
+                    item_dot(b.syms[i]) > 0;
                     i++) {
                        int diff = a.syms[i] - b.syms[i];
                        if (diff)
@@ -1206,11 +1185,11 @@ can just compare the symset and the data values together.
                                        return diff;
                        }
                }
-               if (i == a.cnt || item_index(a.syms[i]) == 0)
+               if (i == a.cnt || item_dot(a.syms[i]) == 0)
                        av = -1;
                else
                        av = a.syms[i];
-               if (i == b.cnt || item_index(b.syms[i]) == 0)
+               if (i == b.cnt || item_dot(b.syms[i]) == 0)
                        bv = -1;
                else
                        bv = b.syms[i];
@@ -1366,7 +1345,7 @@ into the go to set, so the item is ineffective.
 ###### complete itemset
        for (i = 0; i < is->items.cnt; i++) {
                int p = item_prod(is->items.syms[i]);
-               int bs = item_index(is->items.syms[i]);
+               int bs = item_dot(is->items.syms[i]);
                struct production *pr = g->productions[p];
                int p2;
                struct symbol *s;
@@ -1471,7 +1450,7 @@ with a pre-existing itemset).
                for (j = 0; j < is->items.cnt; j++) {
                        int itm = is->items.syms[j];
                        int p = item_prod(itm);
-                       int bp = item_index(itm);
+                       int bp = item_dot(itm);
                        struct production *pr = g->productions[p];
                        unsigned short la = 0;
                        int pos;
@@ -1623,7 +1602,7 @@ all the tables that have been generated, plus a description of any conflicts.
                if (g->follow)
                        report_follow(g);
                report_itemsets(g);
-               return report_conflicts(g, type) + report_left_recursive(g);
+               return report_conflicts(g, type);
        }
 
 Firstly we have the complete list of symbols, together with the
@@ -1696,7 +1675,7 @@ it up a bit.  First the items, with production number and associativity.
        static void report_item(struct grammar *g, int itm)
        {
                int p = item_prod(itm);
-               int dot = item_index(itm);
+               int dot = item_dot(itm);
                struct production *pr = g->productions[p];
                int i;
 
@@ -1825,7 +1804,7 @@ as shifts always over-ride reductions.
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp == pr->body_size) {
@@ -1882,7 +1861,7 @@ but handled internally.
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
                                struct symbol *s;
 
@@ -1903,7 +1882,7 @@ but handled internally.
                        for (j = 0; j < is->items.cnt; j++) {
                                unsigned short itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -1945,42 +1924,6 @@ but handled internally.
        }
 
 
-### Reporting non-final left-recursive symbols.
-
-Left recursive symbols are a problem for parses that honour indentation
-when they appear other than at the end of the production.  So we need to
-report these when asked.
-
-###### functions
-
-       static int report_left_recursive(struct grammar *g)
-       {
-               int p;
-               int bad_left_recursive = 0;
-
-               for (p = 0; p < g->production_count; p++) {
-                       struct production *pr = g->productions[p];
-                       int sn;
-
-                       for (sn = 0; sn < pr->body_size - 1; sn++) {
-                               struct symbol *s = pr->body[sn];
-
-                               if (s->left_recursive == 1 &&
-                                   s != pr->head) {
-                                       if (!bad_left_recursive) {
-                                               bad_left_recursive = 1;
-                                               printf("Misplaced left recursive symbols.\n");
-                                       }
-                                       printf("  ");
-                                       prtxt(s->name);
-                                       printf(" in production [%d]\n", p);
-                                       s->left_recursive = 2;
-                               }
-                       }
-               }
-               return bad_left_recursive;
-       }
-
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
@@ -2118,7 +2061,7 @@ The go to table is stored in a simple array of `sym` and corresponding
                        for (j = 0; j < is->items.cnt; j++) {
                                int itm = is->items.syms[j];
                                int p = item_prod(itm);
-                               int bp = item_index(itm);
+                               int bp = item_dot(itm);
                                struct production *pr = g->productions[p];
 
                                if (bp < pr->body_size)
@@ -2991,7 +2934,7 @@ checks if a given token is in any of these look-ahead sets.
                void *ret = NULL;
 
                shift(&p, TK_eof, 0, 1, NULL, states);
-               while (!accepted) {
+               while (!accepted && p.tos > 0) {
                        struct token *err_tk;
                        struct frame *tos = &p.stack[p.tos-1];
                        if (!tk)