From fcd83db1af72cf3da045ca85e89e4d9b119be651 Mon Sep 17 00:00:00 2001
From: NeilBrown <neil@brown.name>
Date: Sun, 9 Jun 2019 09:42:05 +1000
Subject: [PATCH] scanner: improve transition from node to node.

When we are at the end of a node, it is wrong to use do_strip() as
that looks beyond the end of the node.
It is better, once we have determined to accept the newline at the
end of a node (i.e. once no unget is possible), to move to the
start of the next node, and assess column position and indents from
that perspective.
Doing this removes some tests on at_son/at_eon, and makes some code a
bit more transparent - for example the flag that says whether an "out"
is next now depends on whether a newline was recently seen, which makes
more sense than whether we were at the start of a node (out and newline
alternate in some contexts).

Also: add the test which found this problem.   This requires a
new set of tests - tests which can scan tokens from multiple nodes.

Now that we are testing node transitions, the coverage has jumped
to over 92%.

Signed-off-by: NeilBrown <neil@brown.name>
---
 csrc/scanner-tests.mdc | 65 +++++++++++++++++++++++++++++++++++++++++-
 csrc/scanner.mdc       | 49 +++++++++++++++++++------------
 2 files changed, 95 insertions(+), 19 deletions(-)

diff --git a/csrc/scanner-tests.mdc b/csrc/scanner-tests.mdc
index 14a7057..7858e1d 100644
--- a/csrc/scanner-tests.mdc
+++ b/csrc/scanner-tests.mdc
@@ -32,11 +32,13 @@ about each test.
 		    echo "PASSED"; \
 		done
 
+		## other tests
+
 		@gcov -o coverage scanner.c libscanner.c > /dev/null 2> /dev/null
 		@mv *.gcov coverage; [ -f .gcov ] && mv .gcov coverage || true
 		@awk '/NOTEST/ { next } /^ *[1-9]/ {ran+=1} /^ *###/ {skip+=1} \
 		    END {printf "coverage: %6.2f%%\n", ran * 100 / (ran + skip); \
-		         if (ran < (ran + skip) *0.90) exit(1) }' \
+		         if (ran < (ran + skip) *0.92) exit(1) }' \
 		        coverage/scanner.mdc.gcov
 		@rm -f .tmp*
 
@@ -715,6 +717,67 @@ sometimes aren't.
 	15:0 newline()
 	15:0 eof()
 
+## Nested tests.
+
+We need to test various aspects of tokenizing code that is stored
+in multiple nodes.  For example, comments and multi-line strings mustn't
+cross a node boundary.
+
+For this we tell `scanner` to extract sections directly from this file.
+As the file changes, line numbers might change as well, so we need to factor
+that out when testing.  A simple awk script can normalise the first line number
+to one.
+
+###### other tests
+	@for T in $(scanner_section_tests); do \
+	   echo -n "Test $$T ... "; \
+	   i="$IFS"; IFS=,; set $$T; IFS="$$i"; section="$$1"; shift; \
+	    ./md2c scanner-tests.mdc "output: $$T" | grep -v '^#' > .tmp.want; \
+	   ./coverage_scanner --file scanner-tests.mdc --section "test: $$section" \
+	     $${1+"$$@"} | awk -F: ' BEGIN {OFS=":"} $$1 ~ /^[0-9]/ {if (!first) first = $$1 - 1; \
+	          $$1 = $$1 - first} { print } '> .tmp.have; \
+	    if ! cmp -s .tmp.want .tmp.have; then \
+	        echo "FAILED"; diff -u .tmp.want .tmp.have; exit 1; fi ; \
+	    echo "PASSED"; \
+	done
+
+###### test list
+	scanner_section_tests += section1
+
+###### test: section1
+
+	foreach s in sections:
+		## section2
+		print done
+
+###### section2
+
+		This is another
+	section
+
+###### output: section1
+	Tokenizing: test: section1
+	1:8 ident(foreach)
+	1:16 ident(s)
+	1:18 ident(in)
+	1:21 ident(sections)
+	1:29 mark(:)
+	7:16 in()
+	7:16 ident(This)
+	7:21 ident(is)
+	7:24 ident(another)
+	8:8 newline()
+	8:8 out()
+	8:8 in()
+	8:8 ident(section)
+	3:16 newline()
+	3:16 ident(print)
+	3:22 ident(done)
+	4:0 newline()
+	4:0 out()
+	4:0 newline()
+	4:0 eof()
+
 ## Ad-hoc test
 
 These tests test bugs that were found in practice, and so prevent them recuring.
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc
index 9a040e1..7776884 100644
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -807,9 +807,11 @@ information and return one token.
 	}
 
 ###### white space
+	if (is_newline(ch))
+		state_check_node(state);
 	if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 		int newlines = 0;
-		int was_son = at_son(state);
+		int was_nl = is_newline(ch);
 		if (ignored & (1<<TK_in)) {
 			if (!is_newline(ch))
 				continue;
@@ -820,17 +822,17 @@ information and return one token.
 			return tk;
 		}
 		// Indents are needed, so check all white space.
-		while (ch <= ' ' && !at_eon(state)) {
+		while (ch <= ' ' && ch != WEOF) {
 			if (is_newline(ch))
 				newlines += 1;
 			ch = get_char(state);
+			if (is_newline(ch))
+				state_check_node(state);
 		}
-		if (at_eon(state)) {
-			newlines += 1;
-		} else
+		if (ch != WEOF)
 			unget_char(state);
 		state->delayed_lines = newlines;
-		state->out_next = was_son;
+		state->out_next = !was_nl;
 		state->check_indent = 1;
 		continue;
 	}
@@ -962,26 +964,37 @@ a flag that tells us whether or not we need to strip.
 		}
 	}
 
+	static void state_check_node(struct token_state *state)
+	{	// If the current node is fully consumed, step to the next node that has text.
+		if (!state->node)	// no current node: nothing to advance
+			return;
+		if (state->node->code.len > state->offset)	// unread text remains in this node
+			return;
+
+		do	// move forward, skipping nodes whose code.txt is NULL
+			state->node = state->node->next;
+		while (state->node && state->node->code.txt == NULL);
+		state->offset = 0;	// position bookkeeping restarts for the new node
+		state->prev_offset = 0;
+		state->strip_offset = 0;
+		state->col = 0;
+		if (state->node == NULL)	// ran off the end of the node list
+			return;
+		state->line = state->node->line_no;
+		do_strip(state);	// NOTE(review): presumably skips the node's leading indent, advancing offset - confirm
+		state->col = state->node->needs_strip;	// column starts at the node's needs_strip value
+		state->strip_offset = state->offset;	// remember the offset reached after do_strip()
+	}
+
 	static wint_t get_char(struct token_state *state)
 	{
 		wchar_t next;
 		size_t n;
 		mbstate_t mbstate;
 
+		state_check_node(state);
 		if (state->node == NULL)
 			return WEOF;
-		if (state->node->code.len <= state->offset) {
-			do
-				state->node = state->node->next;
-			while (state->node && state->node->code.txt == NULL);
-			state->offset = 0;
-			if (state->node == NULL)
-				return WEOF;
-			state->line = state->node->line_no;
-			do_strip(state);
-			state->col = state->node->needs_strip;
-			state->strip_offset = state->offset;
-		}
 
 		## before get_char
 
-- 
2.43.0