X-Git-Url: https://ocean-lang.org/code/?a=blobdiff_plain;f=csrc%2Fparsergen.mdc;h=78ff5435b8086fcd720674cfcb72c13b3a99d919;hb=97b1522ec76e072e927ef3e5f1f917f2a92236db;hp=c25a87e1f621fd3ffa95d4a809c2276a235b2476;hpb=d28c54dc30f0b4322ffb2ecbbfcee9a566052d4b;p=ocean

diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index c25a87e..78ff543 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -877,7 +877,7 @@ changes happen.
 		}
 	}
 
-### Setting `can_eol` and `line_like`
+### Setting `line_like`
 
 In order to be able to ignore newline tokens when not relevant, but
 still include them in the parse when needed, we will need to know
@@ -885,30 +885,26 @@ which states can start a "line-like" section of code.  We ignore
 newlines when there is an indent since the most recent start of a
 line-like symbol.
 
-To know which symbols are line-like, we first need to know which
-symbols start with a NEWLINE token.  Any symbol which is followed by a
-NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
-Certainly when trying to parse one of these we must take note of NEWLINEs.
+A "line_like" symbol is simply any symbol that can derive a NEWLINE.
+If a symbol cannot derive a NEWLINE, then it is only part of a line -
+so is word-like.  If it can derive a NEWLINE, then we consider it to
+be like a line.
 
-Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
-which is the head of a production that contains a starts-with-NEWLINE
-symbol preceeded only by nullable symbols is also a
-starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
-this attribute of symbols, and compute it in a repetitive manner
-similar to `set_nullable`.
 
-Once we have that, we can determine which symbols are `line_like` by
-seeing which are followed by a `can_eol` symbol in any production.
+Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
+is the head of a production that contains a line_like symbol is also a
+line-like symbol.  We use a new field `line_like` to record this
+attribute of symbols, and compute it in a repetitive manner similar to
+`set_nullable`.
 
 ###### symbol fields
-	int can_eol;
 	int line_like;
 
 ###### functions
-	static void set_can_eol(struct grammar *g)
+	static void set_line_like(struct grammar *g)
 	{
 		int check_again = 1;
-		g->symtab[TK_newline]->can_eol = 1;
+		g->symtab[TK_newline]->line_like = 1;
 		while (check_again) {
 			int p;
 			check_again = 0;
@@ -916,35 +912,20 @@ seeing which are followed by a `can_eol` symbol in any production.
 				struct production *pr = g->productions[p];
 				int s;
 
-				if (pr->head->can_eol)
+				if (pr->head->line_like)
 					continue;
 
 				for (s = 0 ; s < pr->body_size; s++) {
-					if (pr->body[s]->can_eol) {
-						pr->head->can_eol = 1;
+					if (pr->body[s]->line_like) {
+						pr->head->line_like = 1;
 						check_again = 1;
 						break;
 					}
-					if (!pr->body[s]->nullable)
-						break;
 				}
 			}
 		}
 	}
 
-	static void set_line_like(struct grammar *g)
-	{
-		int p;
-		for (p = 0; p < g->production_count; p++) {
-			struct production *pr = g->productions[p];
-			int s;
-
-			for (s = 1; s < pr->body_size; s++)
-				if (pr->body[s]->can_eol)
-					pr->body[s-1]->line_like = 1;
-		}
-	}
-
 ### Building the `first` sets
 
 When calculating what can follow a particular non-terminal, we will need to
@@ -1180,9 +1161,10 @@ need to be consider for completion again.  So  a `completed` flag is needed.
 
 For correct handling of `TK_newline` when parsing, we will need to
 know which states (itemsets) can occur at the start of a line, so we
-will record a `starts_line` flag too.
+will record a `starts_line` flag too whenever DOT is at the start of a
+`line_like` symbol.
 
-Finally, for handling `TK_out` we need to know where production in the
+Finally, for handling `TK_out` we need to know whether productions in the
 current state started *before* the most recent indent.  A state
 doesn't usually keep details of individual productions, so we need to
 add one extra detail. `min_prefix` is the smallest non-zero number of
@@ -1301,7 +1283,7 @@ be supplemented by the LA set for the item which produce the new item.
 
 We also collect a set of all symbols which follow "DOT" (in `done`) as this
 is used in the next stage.
-If any of these symbols are flagged as starting a line, then this
+If any of these symbols are flagged as `line_like`, then this
 state must be a `starts_line` state so now is a good time to record that.
 
 When itemsets are created we assign a precedence to the itemset from
@@ -1532,7 +1514,6 @@ changeover point in `first_nonterm`.
 			g->symtab[s->num] = s;
 
 		set_nullable(g);
-		set_can_eol(g);
 		set_line_like(g);
 		if (type >= SLR)
 			build_first(g);
@@ -1583,9 +1564,8 @@ show if it can end in a newline (`>`), if it is considered to be
 			if (!s)
 				continue;
 
-			printf(" %c%c%c%3d%c: ",
+			printf(" %c%c%3d%c: ",
 			       s->nullable ? '.':' ',
-			       s->can_eol ? '>':' ',
 			       s->line_like ? '<':' ',
 			       s->num, symtypes[s->type]);
 			prtxt(s->name);
@@ -1796,6 +1776,15 @@ terminals to items where that terminal could be shifted and another
 which maps terminals to items that could be reduced when the terminal
 is in look-ahead.  We report when we get conflicts between the two.
 
+As a special case, if we find a SHIFT/REDUCE conflict, where a
+terminal that could be shifted is in the lookahead set of some
+reducable item, then set check if the reducable item also have
+`TK_newline` in its lookahead set.  If it does, then a newline will
+force and reduction, but anything else can reasonably be shifts, so
+that isn't really a conflict.  Such apparent conflicts do not get
+reported.  This will not affect a "tradtional" grammar that does not
+include newlines as token.
+
 	static int conflicts_slr(struct grammar *g, enum grammar_type type)
 	{
 		int i;
@@ -1823,7 +1812,7 @@ is in look-ahead.  We report when we get conflicts between the two.
 						symset_add(&shifts, sym, itm);
 				}
 			}
-			/* Now look for reduction and conflicts */
+			/* Now look for reductions and conflicts */
 			for (j = 0; j < is->items.cnt; j++) {
 				unsigned short itm = is->items.syms[j];
 				int p = item_prod(itm);
@@ -1841,7 +1830,7 @@ is in look-ahead.  We report when we get conflicts between the two.
 				int k;
 				for (k = 0; k < la.cnt; k++) {
 					int pos = symset_find(&shifts, la.syms[k]);
-					if (pos >= 0) {
+					if (pos >= 0 && symset_find(&la, TK_newline) < 0) {
 						printf("  State %d has SHIFT/REDUCE conflict on ", i);
 						prtxt(g->symtab[la.syms[k]]->name);
 						printf(":\n");
@@ -2713,9 +2702,32 @@ the most recent start of line.  This is how a newline forcible
 terminates any line-like structure - we try to reduce down to at most
 one symbol for each line where newlines are allowed.
 
+When, during error handling, we discard token read in, we want to keep
+discarding until we see one that is recognised.  If we had a full set
+of LR(1) grammar states, this will mean looking in the look-ahead set,
+but we don't keep a full look-ahead set.  We only record the subset
+that leads to SHIFT.  We can, however, deduce the look-ahead set but
+looking at the SHIFT subsets for all states that we can get to by
+reducing zero or more times.  So we need a little function which
+checks if a given token is in any of these look-ahead sets.
+
 ###### parser includes
 	#include "parser.h"
+
 ###### parser_run
+
+	static int in_lookahead(struct token *tk, const struct state *states, int state)
+	{
+		while (state >= 0) {
+			if (search(&states[state], tk->num) >= 0)
+				return 1;
+			if (states[state].reduce_prod < 0)
+				return 0;
+			state = search(&states[state], states[state].reduce_sym);
+		}
+		return 0;
+	}
+
 	void *parser_run(struct token_state *tokens,
 	                 const struct state states[],
 	                 int (*do_reduce)(int, void**, struct token_config*, void*),
@@ -2852,7 +2864,7 @@ one symbol for each line where newlines are allowed.
 				break;
 			}
 			tos = &p.stack[p.tos-1];
-			while (search(&states[tos->state], tk->num) < 0 &&
+			while (!in_lookahead(tk, states, tos->state) &&
 			       tk->num != TK_eof) {
 				free(tk);
 				tk = tok_copy(token_next(tokens));
@@ -2865,11 +2877,7 @@ one symbol for each line where newlines are allowed.
 					// FIXME update since_indent here
 				}
 			}
-			if (p.tos == 0 && tk->num == TK_eof)
-				break;
-			tos = &p.stack[p.tos-1];
 			tos->indents += indents;
-			exit(1);
 		}
 		free(tk);
 		pop(&p, p.tos, NULL, do_free);
@@ -2981,6 +2989,9 @@ an error.
 		./parsergen --tag calc -o calc parsergen.mdc
 	calc : calc.o libparser.o libscanner.o libmdcode.o libnumber.o
 		$(CC) $(CFLAGS) -o calc calc.o libparser.o libscanner.o libmdcode.o libnumber.o -licuuc -lgmp
+	calctest : calc
+		./calc parsergen.mdc
+	tests :: calctest
 
 # calc: header
 
@@ -3001,6 +3012,7 @@ an error.
 	#include <stdio.h>
 	#include <malloc.h>
 	#include <gmp.h>
+	#include <string.h>
 	#include "mdcode.h"
 	#include "scanner.h"
 	#include "number.h"
@@ -3014,12 +3026,19 @@ an error.
 		free(n);
 	}
 
+	static int text_is(struct text t, char *s)
+	{
+		return (strlen(s) == t.len &&
+			strncmp(s, t.txt, t.len) == 0);
+	}
+
 	int main(int argc, char *argv[])
 	{
 		int fd = open(argv[1], O_RDONLY);
 		int len = lseek(fd, 0, 2);
 		char *file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
-		struct section *s = code_extract(file, file+len, NULL);
+		struct section *table = code_extract(file, file+len, NULL);
+		struct section *s;
 		struct token_config config = {
 			.ignored = (1 << TK_line_comment)
 			         | (1 << TK_block_comment)
@@ -3029,12 +3048,14 @@ an error.
 			.word_start = "",
 			.word_cont = "",
 		};
-		parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
-		while (s) {
-			struct section *t = s->next;
-			code_free(s->code);
-			free(s);
-			s = t;
+		for (s = table; s; s = s->next)
+			if (text_is(s->section, "example: input"))
+				parse_calc(s->code, &config, argc > 2 ? stderr : NULL);
+		while (table) {
+			struct section *t = table->next;
+			code_free(table->code);
+			free(table);
+			table = t;
 		}
 		exit(0);
 	}
@@ -3072,3 +3093,14 @@ an error.
 		| Expression / Expression ${ mpq_init($0.val); mpq_div($0.val, $1.val, $3.val); }$
 		| NUMBER ${ if (number_parse($0.val, $0.tail, $1.txt) == 0) mpq_init($0.val); }$
 		| ( Expression ) ${ mpq_init($0.val); mpq_set($0.val, $2.val); }$
+
+# example: input
+
+	355/113
+	3.1415926535 - 355/113
+	2 + 4 * 5
+	1 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9
+	10 * 9 / 2
+	1 * 1000 + 2 * 100 + 3 * 10 + 4 * 1
+
+	error