X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;ds=sidebyside;f=csrc%2Fparsergen.mdc;h=13b946fb0e992580c8c7b2d410a003415bb3eea3;hb=2594d150e123c29949493c37c8b9c14aa6abe5a4;hp=82d09b3ed1b497362b77ff7a264dff837499c39a;hpb=1bfa4fc8cfc5fafa5b2b3ae6bdd9b77a4242e74a;p=ocean

diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index 82d09b3..13b946f 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -400,6 +400,21 @@ The `<` may be included for symbols declared as storing a reference
 (not a structure) and means that the reference is being moved out, so
 it will not automatically be freed.
 
+Symbols that are left-recursive are a little special.  These are symbols
+that both the head of a production and the first body symbol of the same
+production.  They are problematic when they appear in other productions
+elsewhere than at the end, and when indenting is used to describe
+structure.  In this case there is no way for a smaller indent to ensure
+the left-recursive symbol cannot be extended.  When it appears at the
+end of a production, that production can be reduced to ensure the symbol
+isn't extended.  So we record left-recursive symbols while reading the
+grammar, and produce a warning when reporting the grammar if they are
+found in an unsuitable place.
+
+A symbol that is only left recursive in a production where it is
+followed by newline does not cause these problems because the newline
+will effectively terminate it.
+
 While building productions we will need to add to an array which needs to
 grow dynamically.
 
@@ -460,6 +475,7 @@ Now we have all the bits we need to parse a full production.
 
 ###### symbol fields
 	int first_production;
+	int left_recursive;
 
 ###### functions
 	static char *parse_production(struct grammar *g,
@@ -499,8 +515,10 @@ Now we have all the bits we need to parse a full production.
 			vs = sym_find(g, tk.txt);
 			if (vs->num == TK_newline)
 				p.line_like = 1;
-			else if (vs->type != Virtual) {
-				err = "symbol after $$ must be virtual";
+			else if (vs->num == TK_out)
+				p.line_like = 2;
+			else if (vs->precedence == 0) {
+				err = "symbol after $$ must have precedence";
 				goto abort;
 			} else {
 				p.precedence = vs->precedence;
@@ -517,6 +535,11 @@ Now we have all the bits we need to parse a full production.
 			}
 			tk = token_next(state);
 		}
+		if (p.body_size >= 2 &&
+		    p.body[0] == p.head &&
+		    p.body[1]->num != TK_newline)
+			p.head->left_recursive = 1;
+
 		if (tk.num != TK_newline && tk.num != TK_eof) {
 			err = "stray tokens at end of line";
 			goto abort;
@@ -634,6 +657,11 @@ to produce errors that the parser is better positioned to handle.
 			} else if (tk.num == TK_mark
 			           && text_is(tk.txt, "$*")) {
 				err = dollar_line(state, g, 1);
+			} else if (tk.num == TK_mark
+				   && text_is(tk.txt, "//")) {
+				while (tk.num != TK_newline &&
+				       tk.num != TK_eof)
+					tk = token_next(state);
 			} else {
 				err = "Unrecognised token at start of line.";
 			}
@@ -1309,6 +1337,8 @@ into the go to set, so the item is ineffective.
 		struct symbol *s;
 		struct symset LA = INIT_SYMSET;
 		unsigned short sn = 0;
+		struct symset LAnl = INIT_SYMSET;
+		unsigned short snnl = 0;
 
 		if (is->min_prefix == 0 ||
 		    (bs > 0 && bs < is->min_prefix))
@@ -1330,26 +1360,26 @@ into the go to set, so the item is ineffective.
 			continue;
 		if (symset_find(&done, s->num) < 0) {
 			symset_add(&done, s->num, 0);
-			if (s->line_like)
-				is->starts_line = 1;
 		}
 		if (s->type != Nonterminal)
 			continue;
+		if (s->line_like)
+			is->starts_line = 1;
 		again = 1;
 		if (type >= LALR) {
 			// Need the LA set.
 			int to_end;
 			add_first(pr, bs+1, &LA, g, &to_end);
 			if (to_end) {
-				if (pr->line_like)
-					symset_add(&LA, TK_newline, 0);
-				else {
-					struct symset ss = set_find(g, is->items.data[i]);
-					symset_union(&LA, &ss);
-				}
+				struct symset ss = set_find(g, is->items.data[i]);
+				symset_union(&LA, &ss);
 			}
 			sn = save_set(g, LA);
 			LA = set_find(g, sn);
+			if (symset_find(&LA, TK_newline))
+				symset_add(&LAnl, TK_newline, 0);
+			snnl = save_set(g, LAnl);
+			LAnl = set_find(g, snnl);
 		}
 
 		/* Add productions for this symbol */
@@ -1360,19 +1390,25 @@ into the go to set, so the item is ineffective.
 			int itm = item_num(p2, 0);
 			int pos = symset_find(&is->items, itm);
 			if (pos < 0) {
-				symset_add(&is->items, itm, sn);
+				if (g->productions[p2]->line_like)
+					symset_add(&is->items, itm, snnl);
+				else
+					symset_add(&is->items, itm, sn);
 				/* Will have re-ordered, so start
 				 * from beginning again */
 				i = -1;
 			} else if (type >= LALR) {
 				struct symset ss = set_find(g, is->items.data[pos]);
 				struct symset tmp = INIT_SYMSET;
+				struct symset *la = &LA;
 
+				if (g->productions[p2]->line_like)
+					la = &LAnl;
 				symset_union(&tmp, &ss);
-				if (symset_union(&tmp, &LA)) {
+				if (symset_union(&tmp, la)) {
 					is->items.data[pos] = save_set(g, tmp);
 					i = -1;
-				}else
+				} else
 					symset_free(tmp);
 			}
 		}
@@ -1509,6 +1545,11 @@ changeover point in `first_nonterm`.
 				snum++;
 			}
 		g->first_nonterm = snum;
+		for (s = g->syms; s; s = s->next)
+			if (s->num < 0 && s->type != Virtual) {
+				s->num = snum;
+				snum++;
+			}
 		for (s = g->syms; s; s = s->next)
 			if (s->num < 0) {
 				s->num = snum;
@@ -1547,7 +1588,7 @@ all the tables that have been generated, plus a description of any conflicts.
 		if (g->follow)
 			report_follow(g);
 		report_itemsets(g);
-		return report_conflicts(g, type);
+		return report_conflicts(g, type) + report_left_recursive(g);
 	}
 
 Firstly we have the complete list of symbols, together with the
@@ -1643,8 +1684,10 @@ it up a bit.  First the items, with production number and associativity.
 			printf(" [%d%s]", s->precedence,
 			       assoc_names[s->assoc]);
 		}
-		if (pr->line_like)
+		if (pr->line_like == 1)
 			printf(" $$NEWLINE");
+		else if (pr->line_like)
+			printf(" $$OUT");
 		printf("\n");
 	}
 
@@ -1783,14 +1826,10 @@ terminals to items where that terminal could be shifted and another
 which maps terminals to items that could be reduced when the terminal
 is in look-ahead.  We report when we get conflicts between the two.
 
-As a special case, if we find a SHIFT/REDUCE conflict, where a
-terminal that could be shifted is in the lookahead set of some
-reducable item, then set check if the reducable item also have
-`TK_newline` in its lookahead set.  If it does, then a newline will
-force the reduction, but anything else can reasonably be shifted, so
-that isn't really a conflict.  Such apparent conflicts do not get
-counted, and are reported as non-critical.  This will not affect a
-"traditional" grammar that does not include newlines as token.
+As a special case, if we find a SHIFT/REDUCE conflict, on the NEWLINE
+terminal, we ignore it.  NEWLINES are handled specially with its own
+rules for when to shift and when to reduce.  Conflicts are expected,
+but handled internally.
 
 	static int conflicts_slr(struct grammar *g, enum grammar_type type)
 	{
@@ -1844,12 +1883,9 @@ counted, and are reported as non-critical.  This will not affect a
 				for (k = 0; k < la.cnt; k++) {
 					int pos = symset_find(&shifts, la.syms[k]);
 					if (pos >= 0 && la.syms[k] != TK_newline) {
-						if (symset_find(&la, TK_newline) < 0) {
-							printf("  State %d has SHIFT/REDUCE conflict on ", i);
-							cnt++;
-						} else
-							printf("  State %d has non-critical SHIFT/REDUCE conflict on ", i);
-						prtxt(g->symtab[la.syms[k]]->name);
+						printf("  State %d has SHIFT/REDUCE conflict on ", i);
+						cnt++;
+							prtxt(g->symtab[la.syms[k]]->name);
 						printf(":\n");
 						report_item(g, shifts.data[pos]);
 						report_item(g, itm);
@@ -1873,6 +1909,43 @@ counted, and are reported as non-critical.  This will not affect a
 		return cnt;
 	}
 
+
+### Reporting non-final left-recursive symbols.
+
+Left recursive symbols are a problem for parses that honour indentation
+when they appear other than at the end of the production.  So we need to
+report these when asked.
+
+###### functions
+
+	static int report_left_recursive(struct grammar *g)
+	{
+		int p;
+		int bad_left_recursive = 0;
+
+		for (p = 0; p < g->production_count; p++) {
+			struct production *pr = g->productions[p];
+			int sn;
+
+			for (sn = 0; sn < pr->body_size - 1; sn++) {
+				struct symbol *s = pr->body[sn];
+
+				if (s->left_recursive == 1 &&
+				    s != pr->head) {
+					if (!bad_left_recursive) {
+						bad_left_recursive = 1;
+						printf("Misplaced left recursive symbols.\n");
+					}
+					printf("  ");
+					prtxt(s->name);
+					printf(" in production [%d]\n", p);
+					s->left_recursive = 2;
+				}
+			}
+		}
+		return bad_left_recursive;
+	}
+
 ## Generating the parser
 
 The exported part of the parser is the `parse_XX` function, where the name
@@ -1906,7 +1979,6 @@ pieces of code provided in the grammar file, so they are generated first.
 		fprintf(f, "\tstruct token_state *tokens;\n");
 		fprintf(f, "\tconfig->words_marks = known;\n");
 		fprintf(f, "\tconfig->known_count = sizeof(known)/sizeof(known[0]);\n");
-		fprintf(f, "\tconfig->ignored |= (1 << TK_line_comment) | (1 << TK_block_comment);\n");
 		fprintf(f, "\ttokens = token_open(code, config);\n");
 		fprintf(f, "\tvoid *rv = parser_run(tokens, states, do_reduce, do_free, trace, non_term, config);\n");
 		fprintf(f, "\ttoken_close(tokens);\n");
@@ -1944,7 +2016,7 @@ numbers right.
 		for (i = TK_reserved;
 		     i < g->num_syms;
 		     i++)
-			if (g->symtab[i]->type != Terminal)
+			if (g->symtab[i]->type == Nonterminal)
 				fprintf(f, "\t\"%.*s\",\n", g->symtab[i]->name.len,
 					g->symtab[i]->name.txt);
 		fprintf(f, "};\n\n");
@@ -2866,9 +2938,13 @@ checks if a given token is in any of these look-ahead sets.
 		force_reduce:
 			if (states[tos->state].reduce_prod >= 0 &&
 			    states[tos->state].newline_only &&
-			    tk->num != TK_newline && tk->num != TK_eof && tk->num != TK_out) {
-				/* Anything other than newline in an error as this
-				 * production must end at EOL
+			    !(tk->num == TK_newline ||
+			      tk->num == TK_eof ||
+			      tk->num == TK_out ||
+			      (tos->indents == 0 && tos->since_newline == 0))) {
+				/* Anything other than newline or out or eof
+				 * in an error unless we are already at start
+				 * of line, as this production must end at EOL.
 				 */
 			} else if (states[tos->state].reduce_prod >= 0) {
 				void **body;
@@ -3020,7 +3096,7 @@ end inside square brackets.
 			fputs(reserved_words[tk->num], trace);
 		else
 			text_dump(trace, tk->txt, 20);
-		fputs("]", trace);
+		fprintf(trace, ":%d:%d]", tk->line, tk->col);
 	}
 
 	void parser_trace_action(FILE *trace, char *action)
@@ -3051,7 +3127,7 @@ an error.
 
 # calc: header
 
-	#include "number.h"
+	#include "parse_number.h"
 	// what do we use for a demo-grammar?  A calculator of course.
 	struct number {
 		mpq_t val;
@@ -3071,7 +3147,6 @@ an error.
 	#include <string.h>
 	#include "mdcode.h"
 	#include "scanner.h"
-	#include "number.h"
 	#include "parser.h"
 
 	#include "calc.h"
@@ -3097,7 +3172,6 @@ an error.
 		struct section *s;
 		struct token_config config = {
 			.ignored = (1 << TK_line_comment)
-			         | (1 << TK_block_comment)
 			         | (1 << TK_in)
 			         | (1 << TK_out),
 			.number_chars = ".,_+-",
@@ -3119,7 +3193,7 @@ an error.
 # calc: grammar
 
 	$LEFT + -
-	$LEFT * /
+	$LEFT * / //
 
 	Session -> Session Line
 		| Line
@@ -3147,6 +3221,16 @@ an error.
 		| Expression - Expression ${ mpq_init($0.val); mpq_sub($0.val, $1.val, $3.val); }$
 		| Expression * Expression ${ mpq_init($0.val); mpq_mul($0.val, $1.val, $3.val); }$
 		| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
+		| Expression // Expression ${ {
+			mpz_t z0, z1, z2;
+			mpq_init($0.val);
+			mpz_init(z0); mpz_init(z1); mpz_init(z2);
+			mpz_tdiv_q(z1, mpq_numref($1.val), mpq_denref($1.val));
+			mpz_tdiv_q(z2, mpq_numref($3.val), mpq_denref($3.val));
+			mpz_tdiv_q(z0, z1, z2);
+			mpq_set_z($0.val, z0);
+			mpz_clear(z0); mpz_clear(z1); mpz_clear(z2);
+		} }$
 		| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
 		| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
 
@@ -3159,4 +3243,6 @@ an error.
 	10 * 9 / 2
 	1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
 
+	355//113
+
 	error