From 97b1522ec76e072e927ef3e5f1f917f2a92236db Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Sun, 26 May 2019 15:04:43 +1000
Subject: [PATCH] parsergen - fix newline parsing (again)

Add a test-case to oceani-tests.mdc which fails, but shouldn't.
It fails because expressions are treated as line-like, so newlines
aren't ignored.

I now realize that defining linelike symbols as those that are followed
by a newline really doesn't work.
So go back to the original idea that "linelike symbols are those which
contain a newline".

Then a state starts a line if it is at the start of a linelike symbol.

This simplifies the code, seems to work correctly for existing tests,
and allows the new test to pass.

Signed-off-by: NeilBrown <neil@brown.name>
---
 csrc/oceani-tests.mdc |  8 ++++++
 csrc/parsergen.mdc    | 60 +++++++++++++++----------------------------
 2 files changed, 28 insertions(+), 40 deletions(-)

diff --git a/csrc/oceani-tests.mdc b/csrc/oceani-tests.mdc
index dfb753d..04b9458 100644
--- a/csrc/oceani-tests.mdc
+++ b/csrc/oceani-tests.mdc
@@ -115,6 +115,13 @@ calculations on them.
 
 		aconst :: string = "unchanging"
 
+		// Check wrapping
+		print
+		  a + b
+		  + (a*2)
+		  + b1
+		  + b
+
 ###### output: valvar
 
 	23 12 35 11 276 1.91667 11
@@ -122,6 +129,7 @@ calculations on them.
 	23 12 12 -23 -12 12
 	False True True False False False
 	This is a string  field theory This is a string field theory
+	81
 
 Next we change the value of variables
 
diff --git a/csrc/parsergen.mdc b/csrc/parsergen.mdc
index 1d9d611..78ff543 100644
--- a/csrc/parsergen.mdc
+++ b/csrc/parsergen.mdc
@@ -877,7 +877,7 @@ changes happen.
 		}
 	}
 
-### Setting `can_eol` and `line_like`
+### Setting `line_like`
 
 In order to be able to ignore newline tokens when not relevant, but
 still include them in the parse when needed, we will need to know
@@ -885,30 +885,26 @@ which states can start a "line-like" section of code.  We ignore
 newlines when there is an indent since the most recent start of a
 line-like symbol.
 
-To know which symbols are line-like, we first need to know which
-symbols start with a NEWLINE token.  Any symbol which is followed by a
-NEWLINE, or anything that starts with a NEWLINE, is deemed to be a line-like symbol.
-Certainly when trying to parse one of these we must take note of NEWLINEs.
+A "line_like" symbol is simply any symbol that can derive a NEWLINE.
+If a symbol cannot derive a NEWLINE, then it is only part of a line -
+so is word-like.  If it can derive a NEWLINE, then we consider it to
+be like a line.
 
-Clearly the `TK_newline` token can start with a NEWLINE.  Any symbol
-which is the head of a production that contains a starts-with-NEWLINE
-symbol preceeded only by nullable symbols is also a
-starts-with-NEWLINE symbol.  We use a new field `can_eol` to record
-this attribute of symbols, and compute it in a repetitive manner
-similar to `set_nullable`.
 
-Once we have that, we can determine which symbols are `line_like` by
-seeing which are followed by a `can_eol` symbol in any production.
+Clearly the `TK_newline` token can derive a NEWLINE.  Any symbol which
+is the head of a production that contains a line-like symbol is also a
+line-like symbol.  We use a new field `line_like` to record this
+attribute of symbols, and compute it in an iterative manner similar to
+`set_nullable`.
 
 ###### symbol fields
-	int can_eol;
 	int line_like;
 
 ###### functions
-	static void set_can_eol(struct grammar *g)
+	static void set_line_like(struct grammar *g)
 	{
 		int check_again = 1;
-		g->symtab[TK_newline]->can_eol = 1;
+		g->symtab[TK_newline]->line_like = 1;
 		while (check_again) {
 			int p;
 			check_again = 0;
@@ -916,35 +912,20 @@ seeing which are followed by a `can_eol` symbol in any production.
 				struct production *pr = g->productions[p];
 				int s;
 
-				if (pr->head->can_eol)
+				if (pr->head->line_like)
 					continue;
 
 				for (s = 0 ; s < pr->body_size; s++) {
-					if (pr->body[s]->can_eol) {
-						pr->head->can_eol = 1;
+					if (pr->body[s]->line_like) {
+						pr->head->line_like = 1;
 						check_again = 1;
 						break;
 					}
-					if (!pr->body[s]->nullable)
-						break;
 				}
 			}
 		}
 	}
 
-	static void set_line_like(struct grammar *g)
-	{
-		int p;
-		for (p = 0; p < g->production_count; p++) {
-			struct production *pr = g->productions[p];
-			int s;
-
-			for (s = 1; s < pr->body_size; s++)
-				if (pr->body[s]->can_eol)
-					pr->body[s-1]->line_like = 1;
-		}
-	}
-
 ### Building the `first` sets
 
 When calculating what can follow a particular non-terminal, we will need to
@@ -1180,9 +1161,10 @@ need to be consider for completion again.  So  a `completed` flag is needed.
 
 For correct handling of `TK_newline` when parsing, we will need to
 know which states (itemsets) can occur at the start of a line, so we
-will record a `starts_line` flag too.
+will record a `starts_line` flag too.  It is set whenever DOT is at
+the start of a `line_like` symbol.
 
-Finally, for handling `TK_out` we need to know where production in the
+Finally, for handling `TK_out` we need to know whether productions in the
 current state started *before* the most recent indent.  A state
 doesn't usually keep details of individual productions, so we need to
 add one extra detail. `min_prefix` is the smallest non-zero number of
@@ -1301,7 +1283,7 @@ be supplemented by the LA set for the item which produce the new item.
 
 We also collect a set of all symbols which follow "DOT" (in `done`) as this
 is used in the next stage.
-If any of these symbols are flagged as starting a line, then this
+If any of these symbols are flagged as `line_like`, then this
 state must be a `starts_line` state so now is a good time to record that.
 
 When itemsets are created we assign a precedence to the itemset from
@@ -1532,7 +1514,6 @@ changeover point in `first_nonterm`.
 			g->symtab[s->num] = s;
 
 		set_nullable(g);
-		set_can_eol(g);
 		set_line_like(g);
 		if (type >= SLR)
 			build_first(g);
@@ -1583,9 +1564,8 @@ show if it can end in a newline (`>`), if it is considered to be
 			if (!s)
 				continue;
 
-			printf(" %c%c%c%3d%c: ",
+			printf(" %c%c%3d%c: ",
 			       s->nullable ? '.':' ',
-			       s->can_eol ? '>':' ',
 			       s->line_like ? '<':' ',
 			       s->num, symtypes[s->type]);
 			prtxt(s->name);
-- 
2.43.0