parsergen: add more power to symbol references in generated code

author NeilBrown <neil@brown.name>

Sun, 11 Oct 2020 03:49:07 +0000 (14:49 +1100)

committer NeilBrown <neil@brown.name>

Sun, 11 Oct 2020 03:49:07 +0000 (14:49 +1100)
author NeilBrown <neil@brown.name>
Sun, 11 Oct 2020 03:49:07 +0000 (14:49 +1100)
committer NeilBrown <neil@brown.name>
Sun, 11 Oct 2020 03:49:07 +0000 (14:49 +1100)
diff --git a/csrc/indent_test.mdc b/csrc/indent_test.mdc

index 087df32c4957ee5f7fa5bcb441162377ce77eb4b..fddb64b7520a15931b430075406c2853f056fef3 100644 (file)
--- a/csrc/indent_test.mdc
+++ b/csrc/indent_test.mdc
@@ -139,8 +139,8 @@ Program -> Statementlist ${ print_statement($1, 0); }$
  $*statement
         Newlines -> NEWLINE
                 | Newlines NEWLINE
-       Statementlist ->  Statements ${ $0 = $<1; }$
-               | Newlines Statements ${ $0 = $<2; }$
+       Statementlist ->  Statements ${ $0 = $<S; }$
+               | Newlines Statements ${ $0 = $<S1; }$
  
         Statements -> Statements Statement ${
                                 {
@@ -159,20 +159,20 @@ $*statement
                 | Newlines {
         Close -> }
                 | Newlines }
-       Block -> Open Statementlist Close ${ $0 = $<2; }$
-               | Open SimpleStatements } ${ $0 = $<2; }$
-               | : SimpleStatements ${ $0 = $<2; }$
-               | : StatementBlock ${ $0 = $<2; }$
-       StatementBlock -> Statementlist $$OUT ${ $0 = $<1; }$
+       Block -> Open Statementlist Close ${ $0 = $<S; }$
+               | Open SimpleStatements } ${ $0 = $<S; }$
+               | : SimpleStatements ${ $0 = $<SS; }$
+               | : StatementBlock ${ $0 = $<SB; }$
+       StatementBlock -> Statementlist $$OUT ${ $0 = $<Sl; }$
  
         SimpleStatements -> SimpleStatements ; SimpleStatement ${
                         {
                                 struct statement **s;
-                               $0 = $<1;
+                               $0 = $<SSs;
                                 s = &$0;
                                 while (*s)
                                         s = &(*s)->next;
-                               *s = $<3;
+                               *s = $<SS;
                         }
                         }$
                 | SimpleStatement ${ $0 = $<1; }$
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc

index 52c0c637c3154ef085d4a948d5604da93827e6a9..742996e16222854cecedd235f9449987cc6c5e08 100644 (file)
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -408,14 +408,15 @@ be in one `code_node` of the literate code.  The `}$` must be
  at the end of a line.
  
  Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding the
-parse information for the particular symbol in the production.  `$0` is
-the head of the production, `$1` is the first symbol of the body, etc.
-The type of `$N` for a terminal symbol is `struct token`.  For a
-non-terminal, it is whatever has been declared for that symbol.  The `<`
-may be included and means that the value (usually a reference) is being
-moved out, so it will not automatically be freed.  The effect of using
-'<' is that the variable is cleareed to all-zeros.
+`$<N`, for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production.  `$0` is the head of the
+production, `$1` is the first symbol of the body, etc.  The type of `$N`
+for a terminal symbol is `struct token`.  For a non-terminal, it is
+whatever has been declared for that symbol.  The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed.  The effect of using '<' is that the
+variable is cleared to all-zeros.
  
  Symbols that are left-recursive are a little special.  These are symbols
 that are both the head of a production and the first body symbol of the same
@@ -2176,8 +2177,98 @@ being moved out, so the object will not be automatically freed.  It is
  equivalent to assigning `NULL` to the pointer or filling a structure
  with zeros.
  
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number.  A number by itself (other than zero) selects a
+symbol from the body of the production.  A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order.  If a number follows the letters, then a later occurrence of
+that symbol is chosen.  So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`.  If there is no unique shortest symbol, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
+
  ###### functions
  
+       /*
+        * Find character 'c' in text 't', starting the search at index 's'.
+        * Returns the index of the first match at or after 's', or -1 if
+        * 'c' does not occur in that range.
+        */
+       static int textchr(struct text t, char c, int s)
+       {
+               int pos = s;
+
+               while (pos < t.len) {
+                       if (t.txt[pos] == c)
+                               return pos;
+                       pos += 1;
+               }
+               return -1;
+       }
+
+       /*
+        * Test whether the 'slen' characters at 'seq' appear in 'name' in
+        * the same order, though not necessarily adjacent to each other.
+        * Returns 1 if they do (trivially so when 'slen' is not positive),
+        * and 0 otherwise.
+        */
+       static int subseq_match(char *seq, int slen, struct text name)
+       {
+               int pos = 0;
+               int k;
+
+               for (k = 0; k < slen; k++) {
+                       /* Each character must be found after the previous one. */
+                       pos = textchr(name, seq[k], pos);
+                       if (pos < 0)
+                               return 0;
+                       pos += 1;
+               }
+               return 1;
+       }
+
+       /*
+        * Resolve the symbol reference that starts at '*namep' and is at
+        * most 'len' bytes long, against the body of production 'p'.
+        *
+        * The reference is a run of letters, optionally followed by a run
+        * of digits.
+        *  - Digits alone give a body position directly (0 is passed
+        *    through unchanged so the caller can treat it specially).
+        *  - Letters select the shortest body symbol whose name contains
+        *    those letters in order.  A following number N selects the
+        *    Nth occurrence of that symbol; no number (or 0) means the first.
+        *
+        * On success '*namep' is advanced past the reference and the
+        * 1-based body position (or 0) is returned.  If nothing can be
+        * parsed, the shortest match is not unique, or the number is out
+        * of range, -1 is returned and '*namep' is left untouched.
+        */
+       static int choose_sym(char **namep, int len, struct production *p)
+       {
+               char *name = *namep;
+               char *nam = name;
+               int namlen;
+               int n = 0;
+               int i, s, slen;
+
+               /* 'len' is always checked before '*name' is read, so we never
+                * look beyond the end of the code text.
+                */
+               while (len > 0 &&
+                      ((*name >= 'A' && *name <= 'Z') ||
+                       (*name >= 'a' && *name <= 'z'))) {
+                       name += 1;
+                       len -= 1;
+               }
+               namlen = name - nam;
+               while (len > 0 && *name >= '0' && *name <= '9') {
+                       /* Once 'n' exceeds body_size it is already out of
+                        * range, so stop accumulating to avoid overflow.
+                        */
+                       if (n <= p->body_size)
+                               n = n * 10 + (*name - '0');
+                       name += 1;
+                       len -= 1;
+               }
+               if (namlen == 0) {
+                       if (name == *namep)
+                               /* neither letters nor digits */
+                               return -1;
+                       if (n > p->body_size)
+                               /* number too large: leave reference alone */
+                               return -1;
+                       *namep = name;
+                       return n;
+               }
+               slen = 0; s = -1;
+               for (i = 0; i < p->body_size; i++) {
+                       if (!subseq_match(nam, namlen, p->body[i]->name))
+                               continue;
+                       if (slen == 0 || p->body[i]->name.len < slen) {
+                               /* new shortest candidate */
+                               s = i;
+                               slen = p->body[i]->name.len;
+                       }
+                       if (s >= 0 && p->body[i] != p->body[s] &&
+                           p->body[i]->name.len == p->body[s]->name.len)
+                               /* not unique, so s cannot be used */
+                               s = -1;
+               }
+               if (s < 0)
+                       return -1;
+               if (n == 0)
+                       n = 1;
+               /* Count occurrences of the chosen symbol to find the n'th. */
+               for (i = 0; i < p->body_size; i++)
+                       if (p->body[i] == p->body[s]) {
+                               n -= 1;
+                               if (n == 0)
+                                       break;
+                       }
+               if (n > 0)
+                       /* fewer occurrences than requested */
+                       return -1;
+               *namep = name;
+               return i + 1;
+       }
+
         static void gen_code(struct production *p, FILE *f, struct grammar *g)
         {
                 char *c;
@@ -2199,24 +2290,19 @@ with zeros.
                                 use = 1;
                                 c++;
                         }
-                       if (*c < '0' || *c > '9') {
+                       n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+                       if (n < 0) {
+                               fputc('$', f);
                                 if (use)
                                         fputc('<', f);
                                 fputc(*c, f);
                                 continue;
                         }
-                       n = *c - '0';
-                       while (c[1] >= '0' && c[1] <= '9') {
-                               c += 1;
-                               n = n * 10 + *c - '0';
-                       }
                         if (n == 0)
                                 fprintf(f, "(*(struct %.*s*%s)ret)",
                                         p->head->struct_name.len,
                                         p->head->struct_name.txt,
                                         p->head->isref ? "*":"");
-                       else if (n > p->body_size)
-                               fprintf(f, "$%d", n);
                         else if (p->body[n-1]->type == Terminal)
                                 fprintf(f, "(*(struct token *)body[%d])",
                                         n-1);
@@ -2229,6 +2315,7 @@ with zeros.
                                         p->body[n-1]->isref ? "*":"", n-1);
                                 used[n-1] = use;
                         }
+                       c -= 1;
                 }
                 fputs("\n", f);
                 for (i = 0; i < p->body_size; i++) {
author	NeilBrown <neil@brown.name>
	Sun, 11 Oct 2020 03:49:07 +0000 (14:49 +1100)
committer	NeilBrown <neil@brown.name>
	Sun, 11 Oct 2020 03:49:07 +0000 (14:49 +1100)
csrc/indent_test.mdc		patch \| blob \| history
csrc/parsergen.mdc		patch \| blob \| history