marking discussed later, and sometimes we won't know what type a symbol
is yet.
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
###### forward declarations
enum symtype { Unknown, Virtual, Terminal, Nonterminal };
char *symtypes = "UVTN";
###### symbol fields
enum symtype type;
+###### grammar fields
+ int terminals_declared;
Symbols can be either `TK_ident` or `TK_mark`. They are saved in a
table of known symbols and the resulting parser will report them as
different token types that `scanner` can report.
###### declarations
- static char *reserved_words[] = {
- [TK_error] = "ERROR",
- [TK_number] = "NUMBER",
- [TK_ident] = "IDENTIFIER",
- [TK_mark] = "MARK",
- [TK_string] = "STRING",
- [TK_multi_string] = "MULTI_STRING",
- [TK_in] = "IN",
- [TK_out] = "OUT",
- [TK_newline] = "NEWLINE",
- [TK_eof] = "$eof",
+
+ static struct { int num; char *name; } reserved_words[] = {
+ { TK_error, "ERROR" },
+ { TK_number, "NUMBER" },
+ { TK_ident, "IDENTIFIER" },
+ { TK_mark, "MARK" },
+ { TK_string, "STRING" },
+ { TK_multi_string, "MULTI_STRING" },
+ { TK_in, "IN" },
+ { TK_out, "OUT" },
+ { TK_newline, "NEWLINE" },
+ { TK_eof, "$eof" },
};
###### symbol fields
short num;
for (i = 0; i < entries; i++) {
struct text t;
struct symbol *s;
- t.txt = reserved_words[i];
- if (!t.txt)
- continue;
+ t.txt = reserved_words[i].name;
t.len = strlen(t.txt);
s = sym_find(g, t);
s->type = Terminal;
- s->num = i;
+ s->num = reserved_words[i].num;
}
}
### Data types and precedence.
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line. If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` the the line declares terminals without
precedence, otherwise it specifies a data type.
The data type name is simply stored and applied to the head of all
struct token t = token_next(ts);
char *err;
enum assoc assoc;
+ int term = 0;
int found;
if (t.num != TK_ident) {
assoc = Right;
else if (text_is(t.txt, "NON"))
assoc = Non;
- else {
+ else if (text_is(t.txt, "TERM")) {
+ term = 1;
+ g->terminals_declared = 1;
+ } else {
g->current_type = t.txt;
g->type_isref = isref;
if (text_is(t.txt, "void"))
goto abort;
}
- // This is a precedence line, need some symbols.
+ // This is a precedence or TERM line, need some symbols.
found = 0;
g->prec_levels += 1;
t = token_next(ts);
err = "$$ must be followed by a word";
goto abort;
}
+ if (term) {
+ err = "Virtual symbols not permitted on $TERM line";
+ goto abort;
+ }
} else if (t.num != TK_ident &&
t.num != TK_mark) {
err = "Illegal token in precedence line";
}
s = sym_find(g, t.txt);
if (s->type != Unknown) {
- err = "Symbols in precedence line must not already be known.";
+ err = "Symbols in precedence/TERM line must not already be known.";
goto abort;
}
s->type = type;
- s->precedence = g->prec_levels;
- s->assoc = assoc;
+ if (!term) {
+ s->precedence = g->prec_levels;
+ s->assoc = assoc;
+ }
found += 1;
t = token_next(ts);
}
if (found == 0)
- err = "No symbols given on precedence line";
+ err = "No symbols given on precedence/TERM line";
goto abort;
return NULL;
abort:
at the end of a line.
Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc. The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
-
-Symbols that are left-recursive are a little special. These are symbols
-that both the head of a production and the first body symbol of the same
-production. They are problematic when they appear in other productions
-elsewhere than at the end, and when indenting is used to describe
-structure. In this case there is no way for a smaller indent to ensure
-the left-recursive symbol cannot be extended. When it appears at the
-end of a production, that production can be reduced to ensure the symbol
-isn't extended. So we record left-recursive symbols while reading the
-grammar, and produce a warning when reporting the grammar if they are
-found in an unsuitable place.
-
-A symbol that is only left recursive in a production where it is
-followed by newline does not cause these problems because the newline
-will effectively terminate it.
+`$<N`,for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production. `$0` is the head of the
+production, `$1` is the first symbol of the body, etc. The type of `$N`
+for a terminal symbol is `struct token`. For a non-terminal, it is
+whatever has been declared for that symbol. The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed. The effect of using '<' is that the
+variable is cleareed to all-zeros.
While building productions we will need to add to an array which needs to
grow dynamically.
###### symbol fields
int first_production;
- int left_recursive;
###### functions
static char *parse_production(struct grammar *g,
tk = token_next(state);
while (tk.num == TK_ident || tk.num == TK_mark) {
struct symbol *bs = sym_find(g, tk.txt);
- if (bs->type == Unknown)
- bs->type = Terminal;
+ if (bs->type == Unknown) {
+ if (!g->terminals_declared)
+ bs->type = Terminal;
+ }
if (bs->type == Virtual) {
err = "Virtual symbol not permitted in production";
goto abort;
}
tk = token_next(state);
}
- if (p.body_size >= 2 &&
- p.body[0] == p.head &&
- p.body[1]->num != TK_newline)
- p.head->left_recursive = 1;
if (tk.num != TK_newline && tk.num != TK_eof) {
err = "stray tokens at end of line";
goto abort;
}
token_close(state);
+ if (g->terminals_declared) {
+ struct symbol *s;
+ int errs = 0;
+ for (s = g->syms; s; s = s->next) {
+ if (s->type != Unknown)
+ continue;
+ errs += 1;
+ fprintf(stderr, "Token %.*s not declared\n",
+ s->name.len, s->name.txt);
+ }
+ if (errs) {
+ free(g);
+ g = NULL;
+ }
+ }
return g;
abort:
fprintf(stderr, "Error at line %d: %s\n",
the first "0".
###### declarations
- static inline unsigned short item_num(int production, int index)
+ static inline unsigned short item_num(int production, int dot)
{
- return production | ((31-index) << 11);
+ return production | ((31-dot) << 11);
}
static inline int item_prod(unsigned short item)
{
return item & 0x7ff;
}
- static inline int item_index(unsigned short item)
+ static inline int item_dot(unsigned short item)
{
return (31-(item >> 11)) & 0x1f;
}
for (i = 0;
i < a.cnt && i < b.cnt &&
- item_index(a.syms[i]) > 0 &&
- item_index(b.syms[i]) > 0;
+ item_dot(a.syms[i]) > 0 &&
+ item_dot(b.syms[i]) > 0;
i++) {
int diff = a.syms[i] - b.syms[i];
if (diff)
return diff;
}
}
- if (i == a.cnt || item_index(a.syms[i]) == 0)
+ if (i == a.cnt || item_dot(a.syms[i]) == 0)
av = -1;
else
av = a.syms[i];
- if (i == b.cnt || item_index(b.syms[i]) == 0)
+ if (i == b.cnt || item_dot(b.syms[i]) == 0)
bv = -1;
else
bv = b.syms[i];
###### complete itemset
for (i = 0; i < is->items.cnt; i++) {
int p = item_prod(is->items.syms[i]);
- int bs = item_index(is->items.syms[i]);
+ int bs = item_dot(is->items.syms[i]);
struct production *pr = g->productions[p];
int p2;
struct symbol *s;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
- int bp = item_index(itm);
+ int bp = item_dot(itm);
struct production *pr = g->productions[p];
unsigned short la = 0;
int pos;
if (g->follow)
report_follow(g);
report_itemsets(g);
- return report_conflicts(g, type) + report_left_recursive(g);
+ return report_conflicts(g, type);
}
Firstly we have the complete list of symbols, together with the
static void report_item(struct grammar *g, int itm)
{
int p = item_prod(itm);
- int dot = item_index(itm);
+ int dot = item_dot(itm);
struct production *pr = g->productions[p];
int i;
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
- int bp = item_index(itm);
+ int bp = item_dot(itm);
struct production *pr = g->productions[p];
if (bp == pr->body_size) {
for (j = 0; j < is->items.cnt; j++) {
unsigned short itm = is->items.syms[j];
int p = item_prod(itm);
- int bp = item_index(itm);
+ int bp = item_dot(itm);
struct production *pr = g->productions[p];
struct symbol *s;
for (j = 0; j < is->items.cnt; j++) {
unsigned short itm = is->items.syms[j];
int p = item_prod(itm);
- int bp = item_index(itm);
+ int bp = item_dot(itm);
struct production *pr = g->productions[p];
if (bp < pr->body_size)
}
-### Reporting non-final left-recursive symbols.
-
-Left recursive symbols are a problem for parses that honour indentation
-when they appear other than at the end of the production. So we need to
-report these when asked.
-
-###### functions
-
- static int report_left_recursive(struct grammar *g)
- {
- int p;
- int bad_left_recursive = 0;
-
- for (p = 0; p < g->production_count; p++) {
- struct production *pr = g->productions[p];
- int sn;
-
- for (sn = 0; sn < pr->body_size - 1; sn++) {
- struct symbol *s = pr->body[sn];
-
- if (s->left_recursive == 1 &&
- s != pr->head) {
- if (!bad_left_recursive) {
- bad_left_recursive = 1;
- printf("Misplaced left recursive symbols.\n");
- }
- printf(" ");
- prtxt(s->name);
- printf(" in production [%d]\n", p);
- s->left_recursive = 2;
- }
- }
- }
- return bad_left_recursive;
- }
-
## Generating the parser
The exported part of the parser is the `parse_XX` function, where the name
for (j = 0; j < is->items.cnt; j++) {
int itm = is->items.syms[j];
int p = item_prod(itm);
- int bp = item_index(itm);
+ int bp = item_dot(itm);
struct production *pr = g->productions[p];
if (bp < pr->body_size)
to the appropriate type for each access. All this is handled in
`gen_code`.
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed. This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'. This is particularly useful for references (or pointers), but
+can be used with structures too. The `<` implies that the value it
+being moved out, so the object will not be automatically freed. It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number. A number by itself (other than zero) selects a
+symbol from the body of the production. A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order. If a number follows the letters, then a later occurrence of
+that symbol is chosen. So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`. If there is no unique shortest system, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
###### functions
+ static int textchr(struct text t, char c, int s)
+ {
+ int i;
+ for (i = s; i < t.len; i++)
+ if (t.txt[i] == c)
+ return i;
+ return -1;
+ }
+
+ static int subseq_match(char *seq, int slen, struct text name)
+ {
+ int st = 0;
+ while (slen > 0) {
+ st = textchr(name, *seq, st);
+ if (st < 0)
+ return 0;
+ slen -= 1;
+ seq += 1;
+ st += 1;
+ }
+ return 1;
+ }
+
+ static int choose_sym(char **namep, int len, struct production *p)
+ {
+ char *name = *namep;
+ char *nam = name;
+ int namlen;
+ int n = 0;
+ int i, s, slen;
+ char c;
+
+ c = *name;
+ while (len > 0 &&
+ ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))) {
+ name += 1;
+ len -= 1;
+ c = *name;
+ }
+ namlen = name-nam;
+ while (len > 0 && (c >= '0' && c <= '9')) {
+ name += 1;
+ len -= 1;
+ n = n * 10 + (c - '0');
+ c = *name;
+ }
+ if (namlen == 0) {
+ if (name == *namep)
+ return -1;
+ *namep = name;
+ return n;
+ }
+ slen = 0; s = -1;
+ for (i = 0; i < p->body_size; i++) {
+ if (!subseq_match(nam, namlen, p->body[i]->name))
+ continue;
+ if (slen == 0 || p->body[i]->name.len < slen)
+ s = i;
+ if (s >= 0 && p->body[i] != p->body[s] &&
+ p->body[i]->name.len == p->body[s]->name.len)
+ /* not unique, so s cannot be used */
+ s = -1;
+ }
+ if (s < 0)
+ return -1;
+ if (n == 0);
+ n = 1;
+ for (i = 0; i < p->body_size; i++)
+ if (p->body[i] == p->body[s]) {
+ n -= 1;
+ if (n == 0)
+ break;
+ }
+ if (n > 1)
+ return -1;
+ *namep = name;
+ return i + 1;
+ }
+
static void gen_code(struct production *p, FILE *f, struct grammar *g)
{
char *c;
use = 1;
c++;
}
- if (*c < '0' || *c > '9') {
+ n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+ if (n < 0) {
+ fputc('$', f);
if (use)
fputc('<', f);
fputc(*c, f);
continue;
}
- n = *c - '0';
- while (c[1] >= '0' && c[1] <= '9') {
- c += 1;
- n = n * 10 + *c - '0';
- }
if (n == 0)
fprintf(f, "(*(struct %.*s*%s)ret)",
p->head->struct_name.len,
p->head->struct_name.txt,
p->head->isref ? "*":"");
- else if (n > p->body_size)
- fprintf(f, "$%d", n);
else if (p->body[n-1]->type == Terminal)
fprintf(f, "(*(struct token *)body[%d])",
n-1);
p->body[n-1]->isref ? "*":"", n-1);
used[n-1] = use;
}
+ c -= 1;
}
fputs("\n", f);
for (i = 0; i < p->body_size; i++) {
void *ret = NULL;
shift(&p, TK_eof, 0, 1, NULL, states);
- while (!accepted) {
+ while (!accepted && p.tos > 0) {
struct token *err_tk;
struct frame *tos = &p.stack[p.tos-1];
if (!tk)