marking discussed later, and sometimes we won't know what type a symbol
is yet.
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
###### forward declarations
enum symtype { Unknown, Virtual, Terminal, Nonterminal };
char *symtypes = "UVTN";
###### symbol fields
enum symtype type;
+###### grammar fields
+ int terminals_declared;
Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
table of known symbols and the resulting parser will report them as
### Data types and precedence.
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line. If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` then the line declares terminals without
precedence, otherwise it specifies a data type.
The data type name is simply stored and applied to the head of all
struct token t = token_next(ts);
char *err;
enum assoc assoc;
+ int term = 0;
int found;
if (t.num != TK_ident) {
assoc = Right;
else if (text_is(t.txt, "NON"))
assoc = Non;
- else {
+ else if (text_is(t.txt, "TERM")) {
+ term = 1;
+ g->terminals_declared = 1;
+ } else {
g->current_type = t.txt;
g->type_isref = isref;
if (text_is(t.txt, "void"))
goto abort;
}
- // This is a precedence line, need some symbols.
+ // This is a precedence or TERM line, need some symbols.
found = 0;
g->prec_levels += 1;
t = token_next(ts);
err = "$$ must be followed by a word";
goto abort;
}
+ if (term) {
+ err = "Virtual symbols not permitted on $TERM line";
+ goto abort;
+ }
} else if (t.num != TK_ident &&
t.num != TK_mark) {
err = "Illegal token in precedence line";
}
s = sym_find(g, t.txt);
if (s->type != Unknown) {
- err = "Symbols in precedence line must not already be known.";
+ err = "Symbols in precedence/TERM line must not already be known.";
goto abort;
}
s->type = type;
- s->precedence = g->prec_levels;
- s->assoc = assoc;
+ if (!term) {
+ s->precedence = g->prec_levels;
+ s->assoc = assoc;
+ }
found += 1;
t = token_next(ts);
}
if (found == 0)
- err = "No symbols given on precedence line";
+ err = "No symbols given on precedence/TERM line";
goto abort;
return NULL;
abort:
at the end of a line.
Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc. The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
+`$<N`, for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production. `$0` is the head of the
+production, `$1` is the first symbol of the body, etc. The type of `$N`
+for a terminal symbol is `struct token`. For a non-terminal, it is
+whatever has been declared for that symbol. The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed. The effect of using '<' is that the
+variable is cleared to all-zeros.
Symbols that are left-recursive are a little special. These are symbols
that both the head of a production and the first body symbol of the same
tk = token_next(state);
while (tk.num == TK_ident || tk.num == TK_mark) {
struct symbol *bs = sym_find(g, tk.txt);
- if (bs->type == Unknown)
- bs->type = Terminal;
+ if (bs->type == Unknown) {
+ if (!g->terminals_declared)
+ bs->type = Terminal;
+ }
if (bs->type == Virtual) {
err = "Virtual symbol not permitted in production";
goto abort;
goto abort;
}
token_close(state);
+ if (g->terminals_declared) {
+ struct symbol *s;
+ int errs = 0;
+ for (s = g->syms; s; s = s->next) {
+ if (s->type != Unknown)
+ continue;
+ errs += 1;
+ fprintf(stderr, "Token %.*s not declared\n",
+ s->name.len, s->name.txt);
+ }
+ if (errs) {
+ free(g);
+ g = NULL;
+ }
+ }
return g;
abort:
fprintf(stderr, "Error at line %d: %s\n",
to the appropriate type for each access. All this is handled in
`gen_code`.
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed. This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'. This is particularly useful for references (or pointers), but
+can be used with structures too. The `<` implies that the value is
+being moved out, so the object will not be automatically freed. It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number. A number by itself (other than zero) selects a
+symbol from the body of the production. A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order. If a number follows the letters, then a later occurrence of
+that symbol is chosen. So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`. If there is no unique shortest symbol, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
###### functions
+ /* Locate character `c` in text `t`, starting the search at index `s`.
+  * Returns the index of the first match at or after `s`, or -1 when
+  * `c` does not occur in that range.
+  */
+ static int textchr(struct text t, char c, int s)
+ {
+ 	int pos = s;
+
+ 	while (pos < t.len) {
+ 		if (t.txt[pos] == c)
+ 			return pos;
+ 		pos += 1;
+ 	}
+ 	return -1;
+ }
+
+ /* Test whether the `slen` characters starting at `seq` all appear in
+  * `name`, in the same order but not necessarily adjacent.
+  * Returns 1 on a match and 0 otherwise; an empty sequence matches.
+  */
+ static int subseq_match(char *seq, int slen, struct text name)
+ {
+ 	int from = 0;
+ 	int k;
+
+ 	for (k = 0; k < slen; k++) {
+ 		int at = textchr(name, seq[k], from);
+
+ 		if (at < 0)
+ 			return 0;
+ 		/* The next character must be found after this one. */
+ 		from = at + 1;
+ 	}
+ 	return 1;
+ }
+
+ /* Resolve the symbol selector that follows a `$` or `$<` in a code
+  * fragment.
+  *
+  * `*namep` points at the first character after the `$`/`$<` and `len`
+  * is the number of bytes available from there.  The selector is an
+  * optional run of ASCII letters followed by an optional decimal number:
+  *  - digits only: the number itself selects the symbol (0 is the head,
+  *    1.. index the body);
+  *  - letters: selects the shortest body symbol whose name contains the
+  *    letters as an ordered subsequence; a trailing number N picks the
+  *    N'th occurrence of that symbol in the body (default: the first).
+  *
+  * Returns 0 for the head, a 1-based index into p->body, or -1 when the
+  * selector is empty, out of range, ambiguous (no unique shortest
+  * symbol) or asks for more occurrences than exist.  `*namep` is
+  * advanced past the selector only on success.
+  */
+ static int choose_sym(char **namep, int len, struct production *p)
+ {
+ 	char *name = *namep;
+ 	char *nam = name;
+ 	int namlen;
+ 	int n = 0;
+ 	int i, s, slen;
+
+ 	/* Test `len` before dereferencing so we never read past the text. */
+ 	while (len > 0 &&
+ 	       ((*name >= 'A' && *name <= 'Z') ||
+ 	        (*name >= 'a' && *name <= 'z'))) {
+ 		name += 1;
+ 		len -= 1;
+ 	}
+ 	namlen = name - nam;
+ 	while (len > 0 && *name >= '0' && *name <= '9') {
+ 		/* Once n exceeds body_size it is already "too large";
+ 		 * stop accumulating so a long digit string cannot
+ 		 * overflow the int.
+ 		 */
+ 		if (n <= p->body_size)
+ 			n = n * 10 + (*name - '0');
+ 		name += 1;
+ 		len -= 1;
+ 	}
+ 	if (namlen == 0) {
+ 		/* Purely numeric: need at least one digit, and the
+ 		 * index must be the head (0) or lie within the body.
+ 		 */
+ 		if (name == *namep || n > p->body_size)
+ 			return -1;
+ 		*namep = name;
+ 		return n;
+ 	}
+
+ 	/* Find the unique shortest body symbol matching the letters.
+ 	 * slen tracks the shortest matching length seen so far (0 means
+ 	 * none yet); s is its index, or -1 while that length is shared
+ 	 * by two different symbols.
+ 	 */
+ 	slen = 0;
+ 	s = -1;
+ 	for (i = 0; i < p->body_size; i++) {
+ 		if (!subseq_match(nam, namlen, p->body[i]->name))
+ 			continue;
+ 		if (slen == 0 || p->body[i]->name.len < slen) {
+ 			s = i;
+ 			slen = p->body[i]->name.len;
+ 		} else if (s >= 0 && p->body[i] != p->body[s] &&
+ 			   p->body[i]->name.len == slen) {
+ 			/* not unique, so s cannot be used */
+ 			s = -1;
+ 		}
+ 	}
+ 	if (s < 0)
+ 		return -1;
+
+ 	/* No number (or an explicit 0) means the first occurrence. */
+ 	if (n == 0)
+ 		n = 1;
+ 	for (i = 0; i < p->body_size; i++) {
+ 		if (p->body[i] == p->body[s]) {
+ 			n -= 1;
+ 			if (n == 0)
+ 				break;
+ 		}
+ 	}
+ 	/* n is still positive if the body holds fewer occurrences than
+ 	 * were asked for; i would then be body_size, so do not use it.
+ 	 */
+ 	if (n > 0)
+ 		return -1;
+ 	*namep = name;
+ 	return i + 1;
+ }
+
static void gen_code(struct production *p, FILE *f, struct grammar *g)
{
char *c;
use = 1;
c++;
}
- if (*c < '0' || *c > '9') {
+ n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+ if (n < 0) {
+ fputc('$', f);
if (use)
fputc('<', f);
fputc(*c, f);
continue;
}
- n = *c - '0';
- while (c[1] >= '0' && c[1] <= '9') {
- c += 1;
- n = n * 10 + *c - '0';
- }
if (n == 0)
fprintf(f, "(*(struct %.*s*%s)ret)",
p->head->struct_name.len,
p->head->struct_name.txt,
p->head->isref ? "*":"");
- else if (n > p->body_size)
- fprintf(f, "$%d", n);
else if (p->body[n-1]->type == Terminal)
fprintf(f, "(*(struct token *)body[%d])",
n-1);
p->body[n-1]->isref ? "*":"", n-1);
used[n-1] = use;
}
+ c -= 1;
}
fputs("\n", f);
for (i = 0; i < p->body_size; i++) {