parsergen - fix newline parsing (again)

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index c25a87e1f621fd3ffa95d4a809c2276a235b2476..78ff5435b8086fcd720674cfcb72c13b3a99d919 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -877,7 +877,7 @@ changes happen.
                 }
         }
  
                 }
         }
  
-### Setting `can_eol` and `line_like`
+### Setting `line_like`
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
  
  In order to be able to ignore newline tokens when not relevant, but
  still include them in the parse when needed, we will need to know
@@ -885,30 +885,26 @@ which states can start a "line-like" section of code.  We ignore
  newlines when there is an indent since the most recent start of a
  line-like symbol.
  
  newlines when there is an indent since the most recent start of a
  line-like symbol.
  
-To know which symbols are line-like, we first need to know which
-symbols start with a NEWLINE token.  Any symbol which is followed by a
-NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
-Certainly when trying to parse one of these we must take note of NEWLINEs.
+A "line_like" symbol is simply any symbol that can derive a NEWLINE.
+If a symbol cannot derive a NEWLINE, then it is only part of a line -
+so is word-like.  If it can derive a NEWLINE, then we consider it to
+be like a line.
  
  
-Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
-which is the head of a production that contains a starts-with-NEWLINE
-symbol preceeded only by nullable symbols is also a
-starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
-this attribute of symbols, and compute it in a repetitive manner
-similar to `set_nullable`.
  
  
-Once we have that, we can determine which symbols are `line_like` by
-seeing which are followed by a `can_eol` symbol in any production.
+Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
+is the head of a production that contains a line_like symbol is also a
+line-like symbol.  We use a new field `line_like` to record this
+attribute of symbols, and compute it in a repetitive manner similar to
+`set_nullable`.
  
  ###### symbol fields
  
  ###### symbol fields
-       int can_eol;
         int line_like;
  
  ###### functions
         int line_like;
  
  ###### functions
-       static void set_can_eol(struct grammar *g)
+       static void set_line_like(struct grammar *g)
         {
                 int check_again = 1;
         {
                 int check_again = 1;
-               g->symtab[TK_newline]->can_eol = 1;
+               g->symtab[TK_newline]->line_like = 1;
                 while (check_again) {
                         int p;
                         check_again = 0;
                 while (check_again) {
                         int p;
                         check_again = 0;
@@ -916,35 +912,20 @@ seeing which are followed by a `can_eol` symbol in any production.
                                 struct production *pr = g->productions[p];
                                 int s;
  
                                 struct production *pr = g->productions[p];
                                 int s;
  
-                               if (pr->head->can_eol)
+                               if (pr->head->line_like)
                                         continue;
  
                                 for (s = 0 ; s < pr->body_size; s++) {
                                         continue;
  
                                 for (s = 0 ; s < pr->body_size; s++) {
-                                       if (pr->body[s]->can_eol) {
-                                               pr->head->can_eol = 1;
+                                       if (pr->body[s]->line_like) {
+                                               pr->head->line_like = 1;
                                                 check_again = 1;
                                                 break;
                                         }
                                                 check_again = 1;
                                                 break;
                                         }
-                                       if (!pr->body[s]->nullable)
-                                               break;
                                 }
                         }
                 }
         }
  
                                 }
                         }
                 }
         }
  
-       static void set_line_like(struct grammar *g)
-       {
-               int p;
-               for (p = 0; p < g->production_count; p++) {
-                       struct production *pr = g->productions[p];
-                       int s;
-
-                       for (s = 1; s < pr->body_size; s++)
-                               if (pr->body[s]->can_eol)
-                                       pr->body[s-1]->line_like = 1;
-               }
-       }
-
  ### Building the `first` sets
  
  When calculating what can follow a particular non-terminal, we will need to
  ### Building the `first` sets
  
  When calculating what can follow a particular non-terminal, we will need to
@@ -1180,9 +1161,10 @@ need to be consider for completion again.  So  a `completed` flag is needed.
  
  For correct handling of `TK_newline` when parsing, we will need to
  know which states (itemsets) can occur at the start of a line, so we
  
  For correct handling of `TK_newline` when parsing, we will need to
  know which states (itemsets) can occur at the start of a line, so we
-will record a `starts_line` flag too.
+will record a `starts_line` flag too whenever DOT is at the start of a
+`line_like` symbol.
  
  
-Finally, for handling `TK_out` we need to know where production in the
+Finally, for handling `TK_out` we need to know whether productions in the
  current state started *before* the most recent indent.  A state
  doesn't usually keep details of individual productions, so we need to
  add one extra detail. `min_prefix` is the smallest non-zero number of
  current state started *before* the most recent indent.  A state
  doesn't usually keep details of individual productions, so we need to
  add one extra detail. `min_prefix` is the smallest non-zero number of
@@ -1301,7 +1283,7 @@ be supplemented by the LA set for the item which produce the new item.
  
  We also collect a set of all symbols which follow "DOT" (in `done`) as this
  is used in the next stage.
  
  We also collect a set of all symbols which follow "DOT" (in `done`) as this
  is used in the next stage.
-If any of these symbols are flagged as starting a line, then this
+If any of these symbols are flagged as `line_like`, then this
  state must be a `starts_line` state so now is a good time to record that.
  
  When itemsets are created we assign a precedence to the itemset from
  state must be a `starts_line` state so now is a good time to record that.
  
  When itemsets are created we assign a precedence to the itemset from
@@ -1532,7 +1514,6 @@ changeover point in `first_nonterm`.
                         g->symtab[s->num] = s;
  
                 set_nullable(g);
                         g->symtab[s->num] = s;
  
                 set_nullable(g);
-               set_can_eol(g);
                 set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
                 set_line_like(g);
                 if (type >= SLR)
                         build_first(g);
@@ -1583,9 +1564,8 @@ show if it can end in a newline (`>`), if it is considered to be
                         if (!s)
                                 continue;
  
                         if (!s)
                                 continue;
  
-                       printf(" %c%c%c%3d%c: ",
+                       printf(" %c%c%3d%c: ",
                                s->nullable ? '.':' ',
                                s->nullable ? '.':' ',
-                              s->can_eol ? '>':' ',
                                s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
                                s->line_like ? '<':' ',
                                s->num, symtypes[s->type]);
                         prtxt(s->name);
@@ -1796,6 +1776,15 @@ terminals to items where that terminal could be shifted and another
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
  which maps terminals to items that could be reduced when the terminal
  is in look-ahead.  We report when we get conflicts between the two.
  
+As a special case, if we find a SHIFT/REDUCE conflict, where a
+terminal that could be shifted is in the lookahead set of some
+reducible item, then we check if the reducible item also has
+`TK_newline` in its lookahead set.  If it does, then a newline will
+force a reduction, but anything else can reasonably be shifted, so
+that isn't really a conflict.  Such apparent conflicts do not get
+reported.  This will not affect a "traditional" grammar that does not
+include newlines as tokens.
+
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
                 int i;
         static int conflicts_slr(struct grammar *g, enum grammar_type type)
         {
                 int i;
@@ -1823,7 +1812,7 @@ is in look-ahead.  We report when we get conflicts between the two.
                                                 symset_add(&shifts, sym, itm);
                                 }
                         }
                                                 symset_add(&shifts, sym, itm);
                                 }
                         }
-                       /* Now look for reduction and conflicts */
+                       /* Now look for reductions and conflicts */
                         for (j = 0; j < is->items.cnt; j++) {
                                 unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
                         for (j = 0; j < is->items.cnt; j++) {
                                 unsigned short itm = is->items.syms[j];
                                 int p = item_prod(itm);
@@ -1841,7 +1830,7 @@ is in look-ahead.  We report when we get conflicts between the two.
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
                                 int k;
                                 for (k = 0; k < la.cnt; k++) {
                                         int pos = symset_find(&shifts, la.syms[k]);
-                                       if (pos >= 0) {
+                                       if (pos >= 0 && symset_find(&la, TK_newline) < 0) {
                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
                                                 prtxt(g->symtab[la.syms[k]]->name);
                                                 printf(":\n");
                                                 printf("  State %d has SHIFT/REDUCE conflict on ", i);
                                                 prtxt(g->symtab[la.syms[k]]->name);
                                                 printf(":\n");
@@ -2713,9 +2702,32 @@ the most recent start of line.  This is how a newline forcible
  terminates any line-like structure - we try to reduce down to at most
  one symbol for each line where newlines are allowed.
  
  terminates any line-like structure - we try to reduce down to at most
  one symbol for each line where newlines are allowed.
  
+When, during error handling, we discard tokens read in, we want to keep
+discarding until we see one that is recognised.  If we had a full set
+of LR(1) grammar states, this would mean looking in the look-ahead set,
+but we don't keep a full look-ahead set.  We only record the subset
+that leads to SHIFT.  We can, however, deduce the look-ahead set by
+looking at the SHIFT subsets for all states that we can get to by
+reducing zero or more times.  So we need a little function which
+checks if a given token is in any of these look-ahead sets.
+
  ###### parser includes
         #include "parser.h"
  ###### parser includes
         #include "parser.h"
+
  ###### parser_run
  ###### parser_run
+
+       static int in_lookahead(struct token *tk, const struct state *states, int state)
+       {
+               while (state >= 0) {
+                       if (search(&states[state], tk->num) >= 0)
+                               return 1;
+                       if (states[state].reduce_prod < 0)
+                               return 0;
+                       state = search(&states[state], states[state].reduce_sym);
+               }
+               return 0;
+       }
+
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
         void *parser_run(struct token_state *tokens,
                          const struct state states[],
                          int (*do_reduce)(int, void**, struct token_config*, void*),
@@ -2852,7 +2864,7 @@ one symbol for each line where newlines are allowed.
                                 break;
                         }
                         tos = &p.stack[p.tos-1];
                                 break;
                         }
                         tos = &p.stack[p.tos-1];
-                       while (search(&states[tos->state], tk->num) < 0 &&
+                       while (!in_lookahead(tk, states, tos->state) &&
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
                                tk->num != TK_eof) {
                                 free(tk);
                                 tk = tok_copy(token_next(tokens));
@@ -2865,11 +2877,7 @@ one symbol for each line where newlines are allowed.
                                         // FIXME update since_indent here
                                 }
                         }
                                         // FIXME update since_indent here
                                 }
                         }
-                       if (p.tos == 0 && tk->num == TK_eof)
-                               break;
-                       tos = &p.stack[p.tos-1];
                         tos->indents += indents;
                         tos->indents += indents;
-                       exit(1);
                 }
                 free(tk);
                 pop(&p, p.tos, NULL, do_free);
                 }
                 free(tk);
                 pop(&p, p.tos, NULL, do_free);
@@ -2981,6 +2989,9 @@ an error.
                 ./parsergen --tag calc -o calc parsergen.mdc
         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
                 ./parsergen --tag calc -o calc parsergen.mdc
         calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
                 $(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+       calctest : calc
+               ./calc parsergen.mdc
+       tests :: calctest
  
  # calc: header
  
  
  # calc: header
  
@@ -3001,6 +3012,7 @@ an error.
         #include <stdio.h>
         #include <malloc.h>
         #include <gmp.h>
         #include <stdio.h>
         #include <malloc.h>
         #include <gmp.h>
+       #include <string.h>
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
         #include "mdcode.h"
         #include "scanner.h"
         #include "number.h"
@@ -3014,12 +3026,19 @@ an error.
                 free(n);
         }
  
                 free(n);
         }
  
+       static int text_is(struct text t, char *s)
+       {
+               return (strlen(s) == t.len &&
+                       strncmp(s, t.txt, t.len) == 0);
+       }
+
         int main(int argc, char *argv[])
         {
                 int fd = open(argv[1], O_RDONLY);
                 int len = lseek(fd, 0, 2);
                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
         int main(int argc, char *argv[])
         {
                 int fd = open(argv[1], O_RDONLY);
                 int len = lseek(fd, 0, 2);
                 char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
-               struct section *s = code_extract(file, file+len, NULL);
+               struct section *table = code_extract(file, file+len, NULL);
+               struct section *s;
                 struct token_config config = {
                         .ignored = (1 << TK_line_comment)
                                  | (1 << TK_block_comment)
                 struct token_config config = {
                         .ignored = (1 << TK_line_comment)
                                  | (1 << TK_block_comment)
@@ -3029,12 +3048,14 @@ an error.
                         .word_start = "",
                         .word_cont = "",
                 };
                         .word_start = "",
                         .word_cont = "",
                 };
-               parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
-               while (s) {
-                       struct section *t = s->next;
-                       code_free(s->code);
-                       free(s);
-                       s = t;
+               for (s = table; s; s = s->next)
+                       if (text_is(s->section, "example: input"))
+                               parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+               while (table) {
+                       struct section *t = table->next;
+                       code_free(table->code);
+                       free(table);
+                       table = t;
                 }
                 exit(0);
         }
                 }
                 exit(0);
         }
@@ -3072,3 +3093,14 @@ an error.
                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+
+# example: input
+
+       355/113
+       3.1415926535 - 355/113
+       2 + 4 * 5
+       1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
+       10 * 9 / 2
+       1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+
+       error