parsergen: sort virtual symbols to send of list

[ocean] / csrc / parsergen.mdc
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index df73f513ceec725931af6c78832d234a0913e45a..13b946fb0e992580c8c7b2d410a003415bb3eea3 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -400,6 +400,21 @@ The `<` may be included for symbols declared as storing a reference
  (not a structure) and means that the reference is being moved out, so
  it will not automatically be freed.
  
+Symbols that are left-recursive are a little special.  These are symbols
+that both the head of a production and the first body symbol of the same
+production.  They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure.  In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended.  When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended.  So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
+
  While building productions we will need to add to an array which needs to
  grow dynamically.
  
@@ -460,6 +475,7 @@ Now we have all the bits we need to parse a full production.
  
  ###### symbol fields
         int first_production;
+       int left_recursive;
  
  ###### functions
         static char *parse_production(struct grammar *g,
@@ -519,6 +535,11 @@ Now we have all the bits we need to parse a full production.
                         }
                         tk = token_next(state);
                 }
+               if (p.body_size >= 2 &&
+                   p.body[0] == p.head &&
+                   p.body[1]->num != TK_newline)
+                       p.head->left_recursive = 1;
+
                 if (tk.num != TK_newline && tk.num != TK_eof) {
                         err = "stray tokens at end of line";
                         goto abort;
@@ -636,6 +657,11 @@ to produce errors that the parser is better positioned to handle.
                         } else if (tk.num == TK_mark
                                    && text_is(tk.txt, "$*")) {
                                 err = dollar_line(state, g, 1);
+                       } else if (tk.num == TK_mark
+                                  && text_is(tk.txt, "//")) {
+                               while (tk.num != TK_newline &&
+                                      tk.num != TK_eof)
+                                       tk = token_next(state);
                         } else {
                                 err = "Unrecognised token at start of line.";
                         }
@@ -1334,11 +1360,11 @@ into the go to set, so the item is ineffective.
                         continue;
                 if (symset_find(&done, s->num) < 0) {
                         symset_add(&done, s->num, 0);
-                       if (s->line_like)
-                               is->starts_line = 1;
                 }
                 if (s->type != Nonterminal)
                         continue;
+               if (s->line_like)
+                       is->starts_line = 1;
                 again = 1;
                 if (type >= LALR) {
                         // Need the LA set.
@@ -1519,6 +1545,11 @@ changeover point in `first_nonterm`.
                                 snum++;
                         }
                 g->first_nonterm = snum;
+               for (s = g->syms; s; s = s->next)
+                       if (s->num < 0 && s->type != Virtual) {
+                               s->num = snum;
+                               snum++;
+                       }
                 for (s = g->syms; s; s = s->next)
                         if (s->num < 0) {
                                 s->num = snum;
@@ -1557,7 +1588,7 @@ all the tables that have been generated, plus a description of any conflicts.
                 if (g->follow)
                         report_follow(g);
                 report_itemsets(g);
-               return report_conflicts(g, type);
+               return report_conflicts(g, type) + report_left_recursive(g);
         }
  
  Firstly we have the complete list of symbols, together with the
@@ -1878,6 +1909,43 @@ but handled internally.
                 return cnt;
         }
  
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production.  So we need to
+report these when asked.
+
+###### functions
+
+       static int report_left_recursive(struct grammar *g)
+       {
+               int p;
+               int bad_left_recursive = 0;
+
+               for (p = 0; p < g->production_count; p++) {
+                       struct production *pr = g->productions[p];
+                       int sn;
+
+                       for (sn = 0; sn < pr->body_size - 1; sn++) {
+                               struct symbol *s = pr->body[sn];
+
+                               if (s->left_recursive == 1 &&
+                                   s != pr->head) {
+                                       if (!bad_left_recursive) {
+                                               bad_left_recursive = 1;
+                                               printf("Misplaced left recursive symbols.\n");
+                                       }
+                                       printf("  ");
+                                       prtxt(s->name);
+                                       printf(" in production [%d]\n", p);
+                                       s->left_recursive = 2;
+                               }
+                       }
+               }
+               return bad_left_recursive;
+       }
+
  ## Generating the parser
  
  The exported part of the parser is the `parse_XX` function, where the name
@@ -1911,7 +1979,6 @@ pieces of code provided in the grammar file, so they are generated first.
                 fprintf(f, "\tstruct token_state *tokens;\n");
                 fprintf(f, "\tconfig->words_marks = known;\n");
                 fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-               fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
                 fprintf(f, "\ttokens = token_open(code, config);\n");
                 fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
                 fprintf(f, "\ttoken_close(tokens);\n");
@@ -1949,7 +2016,7 @@ numbers right.
                 for (i = TK_reserved;
                      i < g->num_syms;
                      i++)
-                       if (g->symtab[i]->type != Terminal)
+                       if (g->symtab[i]->type == Nonterminal)
                                 fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
                                         g->symtab[i]->name.txt);
                 fprintf(f, "};\n\n");
@@ -3029,7 +3096,7 @@ end inside square brackets.
                         fputs(reserved_words[tk->num], trace);
                 else
                         text_dump(trace, tk->txt, 20);
-               fputs("]", trace);
+               fprintf(trace, ":%d:%d]", tk->line, tk->col);
         }
  
         void parser_trace_action(FILE *trace, char *action)
@@ -3060,7 +3127,7 @@ an error.
  
  # calc: header
  
-       #include "number.h"
+       #include "parse_number.h"
         // what do we use for a demo-grammar?  A calculator of course.
         struct number {
                 mpq_t val;
@@ -3080,7 +3147,6 @@ an error.
         #include <string.h>
         #include "mdcode.h"
         #include "scanner.h"
-       #include "number.h"
         #include "parser.h"
  
         #include "calc.h"
@@ -3106,7 +3172,6 @@ an error.
                 struct section *s;
                 struct token_config config = {
                         .ignored = (1 << TK_line_comment)
-                                | (1 << TK_block_comment)
                                  | (1 << TK_in)
                                  | (1 << TK_out),
                         .number_chars = ".,_+-",
@@ -3128,7 +3193,7 @@ an error.
  # calc: grammar
  
         $LEFT + -
-       $LEFT * /
+       $LEFT * / //
  
         Session -> Session Line
                 | Line
@@ -3156,6 +3221,16 @@ an error.
                 | Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
                 | Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
                 | Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+               | Expression // Expression ${ {
+                       mpz_t z0, z1, z2;
+                       mpq_init($0.val);
+                       mpz_init(z0); mpz_init(z1); mpz_init(z2);
+                       mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+                       mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+                       mpz_tdiv_q(z0, z1, z2);
+                       mpq_set_z($0.val, z0);
+                       mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+               } }$
                 | NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
                 | ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
  
@@ -3168,4 +3243,6 @@ an error.
         10 * 9 / 2
         1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
  
+       355//113
+
         error