X-Git-Url: https://ocean-lang.org/code/?p=ocean;a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=742996e16222854cecedd235f9449987cc6c5e08;hp=679fab6996de66a9fe267d914c654812f74771f3;hb=d7f2c9af259a43cbdb8def0ebe8040deed480848;hpb=f24b54a97b9955aa5761b22aa64d5418e23e80f2

diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index 679fab6..742996e 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -151,11 +151,17 @@ those which don't.  There are also "virtual" symbols used for precedence
 marking discussed later, and sometimes we won't know what type a symbol
 is yet.
 
+To help with code safety it is possible to declare the terminal symbols.
+If this is done, then any symbol used in a production that does not
+appear in a head and is not declared is treated as an error.
+
 ###### forward declarations
 	enum symtype { Unknown, Virtual, Terminal, Nonterminal };
 	char *symtypes = "UVTN";
 ###### symbol fields
 	enum symtype type;
+###### grammar fields
+	int terminals_declared;
 
 Symbols can be either `TK_ident` or `TK_mark`.  They are saved in a
 table of known symbols and the resulting parser will report them as
@@ -241,9 +247,10 @@ symbol, but its type might be `Unknown`.
 
 ### Data types and precedence.
 
-Data type specification and precedence specification are both
-introduced by a dollar sign at the start of the line.  If the next
-word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+Data type specification, precedence specification, and declaration of
+terminals are all introduced by a dollar sign at the start of the line.
+If the next word is `LEFT`, `RIGHT` or `NON`, then the line specifies a
+precedence, if it is `TERM` then the line declares terminals without
 precedence, otherwise it specifies a data type.
 
 The data type name is simply stored and applied to the head of all
@@ -296,6 +303,7 @@ Subsequent lines introduce symbols with higher precedence.
 		struct token t = token_next(ts);
 		char *err;
 		enum assoc assoc;
+		int term = 0;
 		int found;
 
 		if (t.num != TK_ident) {
@@ -308,7 +316,10 @@ Subsequent lines introduce symbols with higher precedence.
 			assoc = Right;
 		else if (text_is(t.txt, "NON"))
 			assoc = Non;
-		else {
+		else if (text_is(t.txt, "TERM")) {
+			term = 1;
+			g->terminals_declared = 1;
+		} else {
 			g->current_type = t.txt;
 			g->type_isref = isref;
 			if (text_is(t.txt, "void"))
@@ -326,7 +337,7 @@ Subsequent lines introduce symbols with higher precedence.
 			goto abort;
 		}
 
-		// This is a precedence line, need some symbols.
+		// This is a precedence or TERM line, need some symbols.
 		found = 0;
 		g->prec_levels += 1;
 		t = token_next(ts);
@@ -340,6 +351,10 @@ Subsequent lines introduce symbols with higher precedence.
 					err = "$$ must be followed by a word";
 					goto abort;
 				}
+				if (term) {
+					err = "Virtual symbols not permitted on $TERM line";
+					goto abort;
+				}
 			} else if (t.num != TK_ident &&
 			           t.num != TK_mark) {
 				err = "Illegal token in precedence line";
@@ -347,17 +362,19 @@ Subsequent lines introduce symbols with higher precedence.
 			}
 			s = sym_find(g, t.txt);
 			if (s->type != Unknown) {
-				err = "Symbols in precedence line must not already be known.";
+				err = "Symbols in precedence/TERM line must not already be known.";
 				goto abort;
 			}
 			s->type = type;
-			s->precedence = g->prec_levels;
-			s->assoc = assoc;
+			if (!term) {
+				s->precedence = g->prec_levels;
+				s->assoc = assoc;
+			}
 			found += 1;
 			t = token_next(ts);
 		}
 		if (found == 0)
-			err = "No symbols given on precedence line";
+			err = "No symbols given on precedence/TERM line";
 			goto abort;
 		return NULL;
 	abort:
@@ -391,14 +408,30 @@ be in one `code_node` of the literate code.  The `}$` must be
 at the end of a line.
 
 Text in the code fragment will undergo substitutions where `$N` or
-`$<N`,for some numeric `N`, will be replaced with a variable holding
-the parse information for the particular symbol in the production.
-`$0` is the head of the production, `$1` is the first symbol of the
-body, etc.  The type of `$N` for a terminal symbol is `struct token`.
-For a non-terminal, it is whatever has been declared for that symbol.
-The `<` may be included for symbols declared as storing a reference
-(not a structure) and means that the reference is being moved out, so
-it will not automatically be freed.
+`$<N`, for some numeric `N` (or non-numeric indicator as described
+later), will be replaced with a variable holding the parse information
+for the particular symbol in the production.  `$0` is the head of the
+production, `$1` is the first symbol of the body, etc.  The type of `$N`
+for a terminal symbol is `struct token`.  For a non-terminal, it is
+whatever has been declared for that symbol.  The `<` may be included and
+means that the value (usually a reference) is being moved out, so it
+will not automatically be freed.  The effect of using '<' is that the
+variable is cleared to all-zeros.
+
+Symbols that are left-recursive are a little special.  These are symbols
+that are both the head of a production and the first body symbol of the
+same production.  They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure.  In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended.  When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended.  So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
 
 While building productions we will need to add to an array which needs to
 grow dynamically.
@@ -460,6 +493,7 @@ Now we have all the bits we need to parse a full production.
 
 ###### symbol fields
 	int first_production;
+	int left_recursive;
 
 ###### functions
 	static char *parse_production(struct grammar *g,
@@ -476,8 +510,10 @@ Now we have all the bits we need to parse a full production.
 		tk = token_next(state);
 		while (tk.num == TK_ident || tk.num == TK_mark) {
 			struct symbol *bs = sym_find(g, tk.txt);
-			if (bs->type == Unknown)
-				bs->type = Terminal;
+			if (bs->type == Unknown) {
+				if (!g->terminals_declared)
+					bs->type = Terminal;
+			}
 			if (bs->type == Virtual) {
 				err = "Virtual symbol not permitted in production";
 				goto abort;
@@ -499,6 +535,8 @@ Now we have all the bits we need to parse a full production.
 			vs = sym_find(g, tk.txt);
 			if (vs->num == TK_newline)
 				p.line_like = 1;
+			else if (vs->num == TK_out)
+				p.line_like = 2;
 			else if (vs->precedence == 0) {
 				err = "symbol after $$ must have precedence";
 				goto abort;
@@ -517,6 +555,11 @@ Now we have all the bits we need to parse a full production.
 			}
 			tk = token_next(state);
 		}
+		if (p.body_size >= 2 &&
+		    p.body[0] == p.head &&
+		    p.body[1]->num != TK_newline)
+			p.head->left_recursive = 1;
+
 		if (tk.num != TK_newline && tk.num != TK_eof) {
 			err = "stray tokens at end of line";
 			goto abort;
@@ -634,6 +677,11 @@ to produce errors that the parser is better positioned to handle.
 			} else if (tk.num == TK_mark
 			           && text_is(tk.txt, "$*")) {
 				err = dollar_line(state, g, 1);
+			} else if (tk.num == TK_mark
+				   && text_is(tk.txt, "//")) {
+				while (tk.num != TK_newline &&
+				       tk.num != TK_eof)
+					tk = token_next(state);
 			} else {
 				err = "Unrecognised token at start of line.";
 			}
@@ -641,6 +689,21 @@ to produce errors that the parser is better positioned to handle.
 				goto abort;
 		}
 		token_close(state);
+		if (g->terminals_declared) {
+			struct symbol *s;
+			int errs = 0;
+			for (s = g->syms; s; s = s->next) {
+				if (s->type != Unknown)
+					continue;
+				errs += 1;
+				fprintf(stderr, "Token %.*s not declared\n",
+					s->name.len, s->name.txt);
+			}
+			if (errs) {
+				free(g);
+				g = NULL;
+			}
+		}
 		return g;
 	abort:
 		fprintf(stderr, "Error at line %d: %s\n",
@@ -1332,11 +1395,11 @@ into the go to set, so the item is ineffective.
 			continue;
 		if (symset_find(&done, s->num) < 0) {
 			symset_add(&done, s->num, 0);
-			if (s->line_like)
-				is->starts_line = 1;
 		}
 		if (s->type != Nonterminal)
 			continue;
+		if (s->line_like)
+			is->starts_line = 1;
 		again = 1;
 		if (type >= LALR) {
 			// Need the LA set.
@@ -1517,6 +1580,11 @@ changeover point in `first_nonterm`.
 				snum++;
 			}
 		g->first_nonterm = snum;
+		for (s = g->syms; s; s = s->next)
+			if (s->num < 0 && s->type != Virtual) {
+				s->num = snum;
+				snum++;
+			}
 		for (s = g->syms; s; s = s->next)
 			if (s->num < 0) {
 				s->num = snum;
@@ -1555,7 +1623,7 @@ all the tables that have been generated, plus a description of any conflicts.
 		if (g->follow)
 			report_follow(g);
 		report_itemsets(g);
-		return report_conflicts(g, type);
+		return report_conflicts(g, type) + report_left_recursive(g);
 	}
 
 Firstly we have the complete list of symbols, together with the
@@ -1651,8 +1719,10 @@ it up a bit.  First the items, with production number and associativity.
 			printf(" [%d%s]", s->precedence,
 			       assoc_names[s->assoc]);
 		}
-		if (pr->line_like)
+		if (pr->line_like == 1)
 			printf(" $$NEWLINE");
+		else if (pr->line_like)
+			printf(" $$OUT");
 		printf("\n");
 	}
 
@@ -1874,6 +1944,43 @@ but handled internally.
 		return cnt;
 	}
 
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parsers that honour indentation
+when they appear other than at the end of the production.  So we need to
+report these when asked.
+
+###### functions
+
+	/* List each left-recursive symbol that occurs in a non-final position
+	 * of a production with a different head.  Returns 1 if any, else 0. */
+	static int report_left_recursive(struct grammar *g)
+	{
+		int found = 0;
+		int pnum, pos;
+
+		for (pnum = 0; pnum < g->production_count; pnum++) {
+			struct production *prod = g->productions[pnum];
+
+			for (pos = 0; pos < prod->body_size - 1; pos++) {
+				struct symbol *sym = prod->body[pos];
+
+				if (sym == prod->head || sym->left_recursive != 1)
+					continue;
+				if (!found)
+					printf("Misplaced left recursive symbols.\n");
+				found = 1;
+				printf("  ");
+				prtxt(sym->name);
+				printf(" in production [%d]\n", pnum);
+				/* 2 marks the symbol as reported so it is printed once */
+				sym->left_recursive = 2;
+			}
+		}
+		return found;
+	}
+
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
@@ -1907,7 +2014,6 @@ pieces of code provided in the grammar file, so they are generated first.
 		fprintf(f, "\tstruct token_state *tokens;\n");
 		fprintf(f, "\tconfig->words_marks = known;\n");
 		fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-		fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
 		fprintf(f, "\ttokens = token_open(code, config);\n");
 		fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
 		fprintf(f, "\ttoken_close(tokens);\n");
@@ -1945,7 +2051,7 @@ numbers right.
 		for (i = TK_reserved;
 		     i < g->num_syms;
 		     i++)
-			if (g->symtab[i]->type != Terminal)
+			if (g->symtab[i]->type == Nonterminal)
 				fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
 					g->symtab[i]->name.txt);
 		fprintf(f, "};\n\n");
@@ -2064,13 +2170,105 @@ structure returned by a previous reduction.  These pointers need to be cast
 to the appropriate type for each access.  All this is handled in
 `gen_code`.
 
-`gen_code` also allows symbol references to contain a '`<`' as in '`$<2`'.
-This applied only to symbols with references (or pointers), not those with structures.
-The `<` implies that the reference it being moved out, so the object will not be
-automatically freed.  This is equivalent to assigning `NULL` to the pointer.
+`gen_code` also allows symbol references to contain a '`<`' as in
+'`$<2`'.  This is particularly useful for references (or pointers), but
+can be used with structures too.  The `<` implies that the value is
+being moved out, so the object will not be automatically freed.  It is
+equivalent to assigning `NULL` to the pointer or filling a structure
+with zeros.
+
+Instead of a number `N`, the `$` or `$<` can be followed by some letters
+and possibly a number.  A number by itself (other than zero) selects a
+symbol from the body of the production.  A sequence of letters selects
+the shortest symbol in the body which contains those letters in the given
+order.  If a number follows the letters, then a later occurrence of
+that symbol is chosen.  So "`$AB2`" will refer to the structure attached
+to the second occurrence of the shortest symbol which contains an `A`
+followed by a `B`.  If there is no unique shortest symbol, or if the
+number given is too large, then the symbol reference is not transformed,
+and will cause an error when the code is compiled.
 
 ###### functions
 
+	/* Index of the first `c` in `t` at or after offset `s`, or -1 if none. */
+	static int textchr(struct text t, char c, int s)
+	{
+		int pos = s;
+		while (pos < t.len && t.txt[pos] != c)
+			pos += 1;
+		return pos < t.len ? pos : -1;
+	}
+
+	/* Return 1 if the `slen` characters at `seq` occur, in order but not
+	 * necessarily adjacent, within `name`; otherwise return 0. */
+	static int subseq_match(char *seq, int slen, struct text name)
+	{
+		int k, from = 0;
+
+		for (k = 0; k < slen; k++) {
+			from = textchr(name, seq[k], from) + 1;
+			if (from == 0)
+				return 0;
+		}
+		return 1;
+	}
+
+	/* Resolve the symbol reference at *namep (reading at most `len` chars):
+	 * optional letters, then optional digits.  Returns 0 for the head, a
+	 * 1-based body index, or -1 (leaving *namep alone) if nothing usable
+	 * is found.  On success *namep is moved past the reference. */
+	static int choose_sym(char **namep, int len, struct production *p)
+	{
+		char *name = *namep;
+		char *nam = name;
+		int namlen, n = 0;
+		int i, s, slen;
+
+		while (len > 0 &&
+		       ((*name >= 'A' && *name <= 'Z') ||
+		        (*name >= 'a' && *name <= 'z'))) {
+			name += 1;
+			len -= 1;
+		}
+		namlen = name-nam;
+		while (len > 0 && (*name >= '0' && *name <= '9')) {
+			n = n * 10 + (*name - '0');
+			name += 1;
+			len -= 1;
+		}
+		if (namlen == 0) {
+			/* Digits only: fail on no digits or an index beyond the body. */
+			if (name == *namep || n > p->body_size)
+				return -1;
+			*namep = name;
+			return n;
+		}
+		slen = 0; s = -1;
+		for (i = 0; i < p->body_size; i++) {
+			if (!subseq_match(nam, namlen, p->body[i]->name))
+				continue;
+			if (slen == 0 || p->body[i]->name.len < slen) {
+				s = i;
+				slen = p->body[i]->name.len;
+			} else if (s >= 0 && p->body[i] != p->body[s] &&
+			           p->body[i]->name.len == slen)
+				/* not unique, so s cannot be used */
+				s = -1;
+		}
+		if (s < 0)
+			return -1;
+		if (n == 0)
+			n = 1;
+		/* Find the n'th occurrence; n stays positive if there are too few. */
+		for (i = 0; i < p->body_size; i++)
+			if (p->body[i] == p->body[s] && --n == 0)
+				break;
+		if (n > 0)
+			return -1;
+		*namep = name;
+		return i + 1;
+	}
+
 	static void gen_code(struct production *p, FILE *f, struct grammar *g)
 	{
 		char *c;
@@ -2092,24 +2290,19 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
 				use = 1;
 				c++;
 			}
-			if (*c < '0' || *c > '9') {
+			n = choose_sym(&c, p->code.txt + p->code.len - c, p);
+			if (n < 0) {
+				fputc('$', f);
 				if (use)
 					fputc('<', f);
 				fputc(*c, f);
 				continue;
 			}
-			n = *c - '0';
-			while (c[1] >= '0' && c[1] <= '9') {
-				c += 1;
-				n = n * 10 + *c - '0';
-			}
 			if (n == 0)
 				fprintf(f, "(*(struct %.*s*%s)ret)",
 					p->head->struct_name.len,
 					p->head->struct_name.txt,
 					p->head->isref ? "*":"");
-			else if (n > p->body_size)
-				fprintf(f, "$%d", n);
 			else if (p->body[n-1]->type == Terminal)
 				fprintf(f, "(*(struct token *)body[%d])",
 					n-1);
@@ -2122,6 +2315,7 @@ automatically freed.  This is equivalent to assigning `NULL` to the pointer.
 					p->body[n-1]->isref ? "*":"", n-1);
 				used[n-1] = use;
 			}
+			c -= 1;
 		}
 		fputs("\n", f);
 		for (i = 0; i < p->body_size; i++) {
@@ -2708,8 +2902,12 @@ accepted the input and can finish.
 We return whatever `asn` was returned by reducing production zero.
 
 If we can neither shift nor reduce we have an error to handle.  We pop
-single entries off the stack until we can shift the `TK_error` symbol, then
-drop input tokens until we find one we can shift into the new error state.
+single entries off the stack until we can shift the `TK_error` symbol,
+then drop input tokens until we find one we can shift into the new error
+state.  We need to ensure that something is dropped or shifted after an
+error, or we could get into an infinite loop, only shifting in
+`TK_error`, then reducing.  So we track if there has been a shift since
+the last error, and if not the next error always discards one token.
 
 When we find `TK_in` and `TK_out` tokens which report indents we need
 to handle them directly as the grammar cannot express what we want to
@@ -2753,11 +2951,11 @@ in the thing that preceed:
 Here the NEWLINE will be shifted because nothing can be reduced until
 the `if` is seen.
 
-When, during error handling, we discard token read in, we want to keep
+When during error handling we discard tokens read in, we want to keep
 discarding until we see one that is recognised.  If we had a full set
-of LR(1) grammar states, this will mean looking in the look-ahead set,
+of LR(1) grammar states, this would mean looking in the look-ahead set,
 but we don't keep a full look-ahead set.  We only record the subset
-that leads to SHIFT.  We can, however, deduce the look-ahead set but
+that leads to SHIFT.  We can, however, deduce the look-ahead set by
 looking at the SHIFT subsets for all states that we can get to by
 reducing zero or more times.  So we need a little function which
 checks if a given token is in any of these look-ahead sets.
@@ -2789,6 +2987,7 @@ checks if a given token is in any of these look-ahead sets.
 		struct parser p = { 0 };
 		struct token *tk = NULL;
 		int accepted = 0;
+		int shift_since_err = 1;
 		void *ret = NULL;
 
 		shift(&p, TK_eof, 0, 1, NULL, states);
@@ -2860,6 +3059,7 @@ checks if a given token is in any of these look-ahead sets.
 					goto force_reduce;
 			}
 			if (shift(&p, tk->num, 0, tk->num == TK_newline, tk, states)) {
+				shift_since_err = 1;
 				tk = NULL;
 				parser_trace_action(trace, "Shift");
 				continue;
@@ -2867,9 +3067,13 @@ checks if a given token is in any of these look-ahead sets.
 		force_reduce:
 			if (states[tos->state].reduce_prod >= 0 &&
 			    states[tos->state].newline_only &&
-			    tk->num != TK_newline && tk->num != TK_eof && tk->num != TK_out) {
-				/* Anything other than newline in an error as this
-				 * production must end at EOL
+			    !(tk->num == TK_newline ||
+			      tk->num == TK_eof ||
+			      tk->num == TK_out ||
+			      (tos->indents == 0 && tos->since_newline == 0))) {
+				/* Anything other than newline or out or eof
+				 * in an error unless we are already at start
+				 * of line, as this production must end at EOL.
 				 */
 			} else if (states[tos->state].reduce_prod >= 0) {
 				void **body;
@@ -2920,11 +3124,22 @@ checks if a given token is in any of these look-ahead sets.
 				// no state accepted TK_error
 				break;
 			}
+			if (!shift_since_err) {
+				/* must discard at least one token to avoid
+				 * infinite loop.
+				 */
+				if (tk->num == TK_eof)
+					break;
+				free(tk);
+				tk = tok_copy(token_next(tokens));
+			}
+			shift_since_err = 0;
 			tos = &p.stack[p.tos-1];
 			while (!in_lookahead(tk, states, tos->state) &&
 			       tk->num != TK_eof) {
 				free(tk);
 				tk = tok_copy(token_next(tokens));
+				shift_since_err = 1;
 				if (tk->num == TK_in)
 					indents += 1;
 				if (tk->num == TK_out) {
@@ -3021,7 +3236,7 @@ end inside square brackets.
 			fputs(reserved_words[tk->num], trace);
 		else
 			text_dump(trace, tk->txt, 20);
-		fputs("]", trace);
+		fprintf(trace, ":%d:%d]", tk->line, tk->col);
 	}
 
 	void parser_trace_action(FILE *trace, char *action)
@@ -3052,7 +3267,7 @@ an error.
 
 # calc: header
 
-	#include "number.h"
+	#include "parse_number.h"
 	// what do we use for a demo-grammar?  A calculator of course.
 	struct number {
 		mpq_t val;
@@ -3072,7 +3287,6 @@ an error.
 	#include <string.h>
 	#include "mdcode.h"
 	#include "scanner.h"
-	#include "number.h"
 	#include "parser.h"
 
 	#include "calc.h"
@@ -3098,7 +3312,6 @@ an error.
 		struct section *s;
 		struct token_config config = {
 			.ignored = (1 << TK_line_comment)
-			         | (1 << TK_block_comment)
 			         | (1 << TK_in)
 			         | (1 << TK_out),
 			.number_chars = ".,_+-",
@@ -3120,7 +3333,7 @@ an error.
 # calc: grammar
 
 	$LEFT + -
-	$LEFT * /
+	$LEFT * / //
 
 	Session -> Session Line
 		| Line
@@ -3148,6 +3361,16 @@ an error.
 		| Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
 		| Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
 		| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+		| Expression // Expression ${ {
+			mpz_t z0, z1, z2;
+			mpq_init($0.val);
+			mpz_init(z0); mpz_init(z1); mpz_init(z2);
+			mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+			mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+			mpz_tdiv_q(z0, z1, z2);
+			mpq_set_z($0.val, z0);
+			mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+		} }$
 		| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
 		| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3160,4 +3383,6 @@ an error.
 	10 * 9 / 2
 	1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+	355//113
+
 	error