echo "PASSED"; \
done
+ ## other tests
+
@gcov -o coverage scanner.c libscanner.c > /dev/null 2> /dev/null
@mv *.gcov coverage; [ -f .gcov ] && mv .gcov coverage || true
@awk '/NOTEST/ { next } /^ *[1-9]/ {ran+=1} /^ *###/ {skip+=1} \
END {printf "coverage: %6.2f%%\n", ran * 100 / (ran + skip); \
- if (ran < (ran + skip) *0.90) exit(1) }' \
+ if (ran < (ran + skip) *0.92) exit(1) }' \
coverage/scanner.mdc.gcov
@rm -f .tmp*
15:0 newline()
15:0 eof()
+## Nested tests.
+
+We need to test various aspects of tokenizing code that is stored
+in multiple nodes. For example, comments and multi-line strings mustn't
+cross a node boundary.
+
+For this we tell `scanner` to extract sections directly from this file.
+As the file changes, line numbers might change as well, so we need to factor
+that out when testing. A simple awk script can normalise the first line number
+to one.
+
+###### other tests
+	@# Run each section-extraction test; awk normalises the first reported
+	@# line number to 1 so the expected output is stable as this file grows.
+	@for T in $(scanner_section_tests); do \
+	    echo -n "Test $$T ... "; \
+	    i="$$IFS"; IFS=,; set $$T; IFS="$$i"; section="$$1"; shift; \
+	    ./md2c scanner-tests.mdc "output: $$T" | grep -v '^#' > .tmp.want; \
+	    ./coverage_scanner --file scanner-tests.mdc --section "test: $$section" \
+	         $${1+"$$@"} | awk -F: ' BEGIN {OFS=":"} $$1 ~ /^[0-9]/ {if (!first) first = $$1 - 1; \
+	              $$1 = $$1 - first} { print } '> .tmp.have; \
+	    if ! cmp -s .tmp.want .tmp.have; then \
+	        echo "FAILED"; diff -u .tmp.want .tmp.have; exit 1; fi ; \
+	    echo "PASSED"; \
+	done
+
+###### test list
+ scanner_section_tests += section1
+
+###### test: section1
+
+ foreach s in sections:
+ ## section2
+ print done
+
+###### section2
+
+ This is another
+ section
+
+###### output: section1
+ Tokenizing: test: section1
+ 1:8 ident(foreach)
+ 1:16 ident(s)
+ 1:18 ident(in)
+ 1:21 ident(sections)
+ 1:29 mark(:)
+ 7:16 in()
+ 7:16 ident(This)
+ 7:21 ident(is)
+ 7:24 ident(another)
+ 8:8 newline()
+ 8:8 out()
+ 8:8 in()
+ 8:8 ident(section)
+ 3:16 newline()
+ 3:16 ident(print)
+ 3:22 ident(done)
+ 4:0 newline()
+ 4:0 out()
+ 4:0 newline()
+ 4:0 eof()
+
## Ad-hoc test
These tests cover bugs that were found in practice, and so prevent them from recurring.
}
###### white space
+ if (is_newline(ch))
+ state_check_node(state);
if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
int newlines = 0;
- int was_son = at_son(state);
+ int was_nl = is_newline(ch);
if (ignored & (1<<TK_in)) {
if (!is_newline(ch))
continue;
return tk;
}
// Indents are needed, so check all white space.
- while (ch <= ' ' && !at_eon(state)) {
+ while (ch <= ' ' && ch != WEOF) {
if (is_newline(ch))
newlines += 1;
ch = get_char(state);
+ if (is_newline(ch))
+ state_check_node(state);
}
- if (at_eon(state)) {
- newlines += 1;
- } else
+ if (ch != WEOF)
unget_char(state);
state->delayed_lines = newlines;
- state->out_next = was_son;
+ state->out_next = !was_nl;
state->check_indent = 1;
continue;
}
}
}
+ /* If the current node's code has been fully consumed (offset has reached
+  * code.len), advance state->node to the next node that actually carries
+  * code text, and reset the per-node position fields.  A NULL node after
+  * advancing means end-of-input (get_char reports WEOF in that case).
+  */
+ static void state_check_node(struct token_state *state)
+ {
+	if (!state->node)
+		return;
+	if (state->node->code.len > state->offset)
+		return;
+
+	/* Skip nodes with no code text at all. */
+	do
+		state->node = state->node->next;
+	while (state->node && state->node->code.txt == NULL);
+	state->offset = 0;
+	state->prev_offset = 0;
+	state->strip_offset = 0;
+	state->col = 0;
+	if (state->node == NULL)
+		return;
+	/* New node: restart line tracking from the node's own line number,
+	 * then strip its leading indentation.  needs_strip presumably holds
+	 * the column width that do_strip removed — NOTE(review): confirm
+	 * against do_strip's definition (not visible here). */
+	state->line = state->node->line_no;
+	do_strip(state);
+	state->col = state->node->needs_strip;
+	state->strip_offset = state->offset;
+ }
+
static wint_t get_char(struct token_state *state)
{
wchar_t next;
size_t n;
mbstate_t mbstate;
+ state_check_node(state);
if (state->node == NULL)
return WEOF;
- if (state->node->code.len <= state->offset) {
- do
- state->node = state->node->next;
- while (state->node && state->node->code.txt == NULL);
- state->offset = 0;
- if (state->node == NULL)
- return WEOF;
- state->line = state->node->line_no;
- do_strip(state);
- state->col = state->node->needs_strip;
- state->strip_offset = state->offset;
- }
## before get_char