parsergen: move EOL handling out of shift()

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 80027aa74e49989428ea370172c3a783c8ebfd40..d9467442eeab1dfec4720f7a2b4f5c6666d43867 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -1876,9 +1876,10 @@ optional `FILE` to send tracing to.  The `token_config` gets the list of
  known words added and then is used with the `code_node` to initialize the
  scanner.
  
  known words added and then is used with the `code_node` to initialize the
  scanner.
  
-`parse_XX` then calls the library function `parser_run` to actually complete
-the parse.  This needs the `states` table and functions to call the various
-pieces of code provided in the grammar file, so they are generated first.
+`parse_XX` then calls the library function `parser_run` to actually
+complete the parse.  This needs the `states` table, the `reductions`
+table and functions to call the various pieces of code provided in the
+grammar file, so they are generated first.
  
  ###### parser_generate
  
  
  ###### parser_generate
  
@@ -1889,6 +1890,7 @@ pieces of code provided in the grammar file, so they are generated first.
                 gen_non_term(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
                 gen_non_term(f, g);
                 gen_goto(f, g);
                 gen_states(f, g);
+               gen_reductions(f, g);
                 gen_reduce(f, g, file, pre_reduce);
                 gen_free(f, g);
  
                 gen_reduce(f, g, file, pre_reduce);
                 gen_free(f, g);
  
@@ -1900,7 +1902,7 @@ pieces of code provided in the grammar file, so they are generated first.
                 fprintf(f, "\tconfig->words_marks = known;\n");
                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                 fprintf(f, "\ttokens = token_open(code, config);\n");
                 fprintf(f, "\tconfig->words_marks = known;\n");
                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
                 fprintf(f, "\ttokens = token_open(code, config);\n");
-               fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
+               fprintf(f, "\tvoid *rv = parser_run(tokens, states, reductions, do_reduce, do_free, trace, non_term, config);\n");
                 fprintf(f, "\ttoken_close(tokens);\n");
                 fprintf(f, "\treturn rv;\n");
                 fprintf(f, "}\n\n");
                 fprintf(f, "\ttoken_close(tokens);\n");
                 fprintf(f, "\treturn rv;\n");
                 fprintf(f, "}\n\n");
@@ -1941,14 +1943,16 @@ The table of nonterminals used for tracing is a similar array.
                 fprintf(f, "};\n\n");
         }
  
                 fprintf(f, "};\n\n");
         }
  
-### States and the goto tables.
+### States, reductions, and the go to tables.
  
  
-For each state we record the goto table and details of the reducible
-production if there is one.
-Some of the details of the reducible production are stored in the
-`do_reduce` function to come later.  Here we store the production
-number, the body size (useful for stack management), and the resulting
-symbol (useful for knowing how to free data later).
+For each state we record the go to table and the reducible production if
+there is one, the details of which are in a separate table of
+reductions.  Some of the details of the reducible production are stored
+in the `do_reduce` function to come later.  In the go to table we store
+the production number and in the reductions table: the body size (useful
+for stack management), the resulting symbol (useful for knowing how to
+free data later), and the size of the resulting asn object (useful for
+preallocation space.
  
  The go to table is stored in a simple array of `sym` and corresponding
  `state`.
  
  The go to table is stored in a simple array of `sym` and corresponding
  `state`.
@@ -1959,13 +1963,15 @@ The go to table is stored in a simple array of `sym` and corresponding
                 short sym;
                 short state;
         };
                 short sym;
                 short state;
         };
+       struct reduction {
+               short size;
+               short sym;
+               short result_size;
+       };
         struct state {
         struct state {
+               short reduce_prod;
                 short go_to_cnt;
                 const struct lookup * go_to;
                 short go_to_cnt;
                 const struct lookup * go_to;
-               short reduce_prod;
-               short reduce_size;
-               short reduce_sym;
-               short result_size;
         };
  
  ###### functions
         };
  
  ###### functions
@@ -1989,6 +1995,26 @@ The go to table is stored in a simple array of `sym` and corresponding
                 }
         }
  
                 }
         }
  
+       static void gen_reductions(FILE *f, struct grammar *g)
+       {
+               int i;
+               fprintf(f, "#line 0 \"gen_reductions\"\n");
+               fprintf(f, "static const struct reduction reductions[] = {\n");
+               for (i = 0; i < g->production_count; i++) {
+                       struct production *pr = g->productions[i];
+                       struct symbol *hd = pr->head;
+                       fprintf(f, "\t{%d, %d, ", pr->body_size, hd->num);
+                       if (hd->struct_name.txt == NULL)
+                               fprintf(f, "0 },\n");
+                       else
+                               fprintf(f, "sizeof(struct %.*s%s) },\n",
+                                       hd->struct_name.len,
+                                       hd->struct_name.txt,
+                                       hd->isref ? "*" : "");
+               }
+               fprintf(f, "};\n\n");
+       }
+
         static void gen_states(FILE *f, struct grammar *g)
         {
                 int i;
         static void gen_states(FILE *f, struct grammar *g)
         {
                 int i;
@@ -2013,24 +2039,10 @@ The go to table is stored in a simple array of `sym` and corresponding
                                 }
                         }
                         if (is->go_to.cnt)
                                 }
                         }
                         if (is->go_to.cnt)
-                               fprintf(f, "\t[%d] = { %d, goto_%d, ",
-                                       i, is->go_to.cnt, i);
+                               fprintf(f, "\t[%d] = { %d, %d, goto_%d },\n",
+                                       i, prod, is->go_to.cnt, i);
                         else
                         else
-                               fprintf(f, "\t[%d] = { 0, NULL, ", i);
-                       if (prod >= 0) {
-                               struct production *pr = g->productions[prod];
-                               struct symbol *hd = pr->head;
-                               fprintf(f, "%d, %d, %d, ", 
-                                       prod, pr->body_size, pr->head->num);
-                               if (hd->struct_name.txt == NULL)
-                                       fprintf(f, "0 },\n");
-                               else
-                                       fprintf(f, "sizeof(struct %.*s%s) },\n",
-                                               hd->struct_name.len,
-                                               hd->struct_name.txt,
-                                               hd->isref ? "*" : "");
-                       } else
-                               fprintf(f, "-1, -1, -1, -1 },\n");
+                               fprintf(f, "\t[%d] = { %d, 0, NULL },\n", i, prod);
                 }
                 fprintf(f, "};\n\n");
         }
                 }
                 fprintf(f, "};\n\n");
         }
@@ -2564,9 +2576,9 @@ recognised properly, and link with `libicuuc` as `libmdcode` requires that.
  Having analysed the grammar and generated all the tables, we only need
  the shift/reduce engine to bring it all together.
  
  Having analysed the grammar and generated all the tables, we only need
  the shift/reduce engine to bring it all together.
  
-### Goto table lookup
+### Go to table lookup
  
  
-The parser generator has nicely provided us with goto tables sorted by
+The parser generator has nicely provided us with go to tables sorted by
  symbol number.  We need a binary search function to find a symbol in the
  table.
  
  symbol number.  We need a binary search function to find a symbol in the
  table.
  
@@ -2665,7 +2677,7 @@ is exactly the same process as shifting in a terminal so we use the same
  function for both.  In both cases we provide the symbol.  The state is
  deduced from the current top-of-stack state and the new symbol.
  
  function for both.  In both cases we provide the symbol.  The state is
  deduced from the current top-of-stack state and the new symbol.
  
-To simplify other code we arrange for `shift` to fail if there is no `goto`
+To simplify other code we arrange for `shift` to fail if there is no `go to`
  state for the symbol.  This is useful in basic parsing due to our design
  that we shift when we can, and reduce when we cannot.  So the `shift`
  function reports if it could.
  state for the symbol.  This is useful in basic parsing due to our design
  that we shift when we can, and reduce when we cannot.  So the `shift`
  function reports if it could.
@@ -2676,25 +2688,6 @@ stack is empty, it always chooses zero as the next state.
  So `shift` finds the next state.  If that succeeds it extends the
  allocations if needed and pushes all the information onto the stacks.
  
  So `shift` finds the next state.  If that succeeds it extends the
  allocations if needed and pushes all the information onto the stacks.
  
-An extra complication is added to `shift` by the `EOL` token.  This
-token must be generated when a `NEWLINE` is seen, but an `EOL` is
-expected.  When this happens, the `NEWLINE` is NOT consumed, so multiple
-EOL can appear before a NEWLINE.  To indicate that the token was shifted
-by not consumed, `shift` can return the special value `2`.  The token
-number for `EOL` cannot be statically declared, so when the parser
-starts we need to look through the array of non-terminals to find the
-EOL.
-
-###### parser state
-       int tk_eol;
-
-###### find eol
-       p.tk_eol = 0;
-       while (strcmp(non_term[p.tk_eol], "EOL") != 0)
-               p.tk_eol += 1;
-       p.tk_eol += TK_reserved + config->known_count;
-
-
  ###### parser functions
  
         static int shift(struct parser *p,
  ###### parser functions
  
         static int shift(struct parser *p,
@@ -2702,27 +2695,12 @@ EOL.
                          const struct state states[])
         {
                 struct frame next = {0};
                          const struct state states[])
         {
                 struct frame next = {0};
-               int ret;
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
                                  sym)
                         : 0;
                 int newstate = p->tos
                         ? search(&states[p->stack[p->tos-1].state],
                                  sym)
                         : 0;
-               if (newstate >= 0)
-                       ret = 1;
-               else if (sym != TK_newline)
+               if (newstate < 0)
                         return 0;
                         return 0;
-               else {
-                       // have a NEWLINE, might need an EOL
-                       sym = p->tk_eol;
-                       newstate = p->tos
-                               ? search(&states[p->stack[p->tos-1].state],
-                                        sym)
-                               : 0;
-                       if (newstate < 0)
-                               return 0;
-                       ret = 2;
-                       asn = tok_copy(*(struct token*)asn);
-               }
  
                 if (p->tos >= p->stack_size) {
                         p->stack_size += 10;
  
                 if (p->tos >= p->stack_size) {
                         p->stack_size += 10;
@@ -2737,7 +2715,7 @@ EOL.
                 p->stack[p->tos] = next;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
                 p->stack[p->tos] = next;
                 p->asn_stack[p->tos] = asn;
                 p->tos++;
-               return ret;
+               return 1;
         }
  
  `pop` primarily moves the top of stack (`tos`) back down the required
         }
  
  `pop` primarily moves the top of stack (`tos`) back down the required
@@ -2785,18 +2763,37 @@ might also be ignored.  Ignoring tokens is combined with shifting.
         }
  
  Indents are ignored unless they can be shifted onto the stack
         }
  
  Indents are ignored unless they can be shifted onto the stack
-immediately.  The end of an indented section - the OUT token - is
-ignored precisely when the indent was ignored.  To keep track of this we
-need a small stack of flags, which is easily stored as bits in an
-`unsigned long`.  This will never overflow and the scanner only allows
-20 levels of indentation.
+immediately or nothing can be shifted (in which case we reduce, and try
+again).  The end of an indented section - the OUT token - is ignored
+precisely when the indent was ignored.  To keep track of this we need a
+small stack of flags, which is easily stored as bits in an `unsigned
+long`.  This will never overflow and the scanner only allows 20 levels
+of indentation.
  
  ###### parser state
         unsigned long ignored_indents;
  
  
  ###### parser state
         unsigned long ignored_indents;
  
-NEWLINE/EOL is ignored when in an indented section of text which was not
+NEWLINE is ignored when in an indented section of text which was not
  explicitly expected by the grammar.  So if the most recent indent is
  explicitly expected by the grammar.  So if the most recent indent is
-ignored, so is any EOL token.
+ignored, so is any NEWLINE token.
+
+If a NEWLINE is seen but it cannot be shifted, we try to shift an EOL
+token instead.  If that succeeds, we make a new copy of the NEWLINE
+token and continue.  This allows a NEWLINE to appear to be preceded by
+an indefinite number of EOL tokens.
+
+The token number for `EOL` cannot be statically declared, so when the
+parser starts we need to look through the array of non-terminals to find
+the EOL.
+
+###### parser state
+       int tk_eol;
+
+###### find eol
+       p.tk_eol = 0;
+       while (strcmp(non_term[p.tk_eol], "EOL") != 0)
+               p.tk_eol += 1;
+       p.tk_eol += TK_reserved + config->known_count;
  
  For other tokens, we shift the next token if that is possible, otherwise
  we try to reduce a production.
  
  For other tokens, we shift the next token if that is possible, otherwise
  we try to reduce a production.
@@ -2814,23 +2811,25 @@ we try to reduce a production.
                 continue;
         }
  
                 continue;
         }
  
-       switch (shift(&p, tk->num, tk, states)) {
-       case 1:
+       if (shift(&p, tk->num, tk, states)) {
                 if (tk->num == TK_out)
                         p.ignored_indents >>= 1;
                 if (tk->num == TK_in)
                         p.ignored_indents <<= 1;
  
                 if (tk->num == TK_out)
                         p.ignored_indents >>= 1;
                 if (tk->num == TK_in)
                         p.ignored_indents <<= 1;
  
+               parser_trace_action(trace, "Shift");
                 tk = NULL;
                 tk = NULL;
-               /* fallthrough */
-       case 2:
-               parser_trace_action(trace, tk ? "ShiftEOL" : "Shift");
                 ## did shift
                 continue;
                 ## did shift
                 continue;
+       } else if (tk->num == TK_newline &&
+                  shift(&p, p.tk_eol, tk, states)) {
+               tk = tok_copy(*tk);
+               parser_trace_action(trace, "ShiftEOL");
+               continue;
         }
  
         }
  
-       if (tk->num == TK_in) {
-               /* No indent expected here, so ignore IN */
+       if (tk->num == TK_in && states[p.stack[p.tos-1].state].go_to_cnt > 0) {
+               /* No indent expected here and reduce is not mandatory, so ignore IN */
                 free(tk);
                 tk = NULL;
                 p.ignored_indents <<= 1;
                 free(tk);
                 tk = NULL;
                 p.ignored_indents <<= 1;
@@ -2860,16 +2859,16 @@ report that that input has been accepted.
                 void *res;
                 const struct state *nextstate = &states[tos->state];
                 int prod = nextstate->reduce_prod;
                 void *res;
                 const struct state *nextstate = &states[tos->state];
                 int prod = nextstate->reduce_prod;
-               int size = nextstate->reduce_size;
-               int res_size = nextstate->result_size;
+               int size = reductions[prod].size;
+               int res_size = reductions[prod].result_size;
  
                 body = p.asn_stack + (p.tos - size);
                 res = res_size ? calloc(1, res_size) : NULL;
                 res_size = do_reduce(prod, body, config, res);
  
                 body = p.asn_stack + (p.tos - size);
                 res = res_size ? calloc(1, res_size) : NULL;
                 res_size = do_reduce(prod, body, config, res);
-               if (res_size != nextstate->result_size)
+               if (res_size != reductions[prod].result_size)
                         abort();
                 pop(&p, size, do_free);
                         abort();
                 pop(&p, size, do_free);
-               if (!shift(&p, nextstate->reduce_sym, res, states)) {
+               if (!shift(&p, reductions[prod].sym, res, states)) {
                         accepted = 1;
                         ret = res;
                         parser_trace_action(trace, "Accept");
                         accepted = 1;
                         ret = res;
                         parser_trace_action(trace, "Accept");
@@ -2928,6 +2927,7 @@ dropping tokens until either we manage to shift one, or reach end-of-file.
  
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
  
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
+                        const struct reduction reductions[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
                          void (*do_free)(short, void*),
                          FILE *trace, const char *non_term[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
                          void (*do_free)(short, void*),
                          FILE *trace, const char *non_term[],
@@ -2949,6 +2949,7 @@ dropping tokens until either we manage to shift one, or reach end-of-file.
  ###### exported functions
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
  ###### exported functions
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
+                        const struct reduction reductions[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
                          void (*do_free)(short, void*),
                          FILE *trace, const char *non_term[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
                          void (*do_free)(short, void*),
                          FILE *trace, const char *non_term[],