ocean-lang.org Git - ocean/commitdiff
scanner: improve transition from node to node.
author: NeilBrown <neil@brown.name>
Sat, 8 Jun 2019 23:42:05 +0000 (09:42 +1000)
committer: NeilBrown <neil@brown.name>
Sat, 8 Jun 2019 23:56:51 +0000 (09:56 +1000)
When we are at the end of a node, it is wrong to use do_strip() as
that looks beyond the end of the node.
It is better, once we have determined to accept the newline at the
end of a node (i.e. once no unget is possible), to move to the
start of the next node, and assess column position and indents from
that perspective.
Doing this removes some tests on at_son/at_eon, and makes some code a
bit more transparent - for example the flag that says whether an "out"
is next now depends on where a newline was recently seen, which makes
more sense than whether we were at the start of a node (out and newline
alternate in some contexts).

Also: add the test which found this problem.   This requires a
new set of tests - tests which can scan tokens from multiple nodes.

Now that we are testing node transitions, the coverage has jumped
to over 92%.

Signed-off-by: NeilBrown <neil@brown.name>
csrc/scanner-tests.mdc
csrc/scanner.mdc

index 14a70572a6646886c485423fc770ffd452049b0e..7858e1d51f9c43048dd35d2b2c5ef08e187f6656 100644 (file)
@@ -32,11 +32,13 @@ about each test.
                    echo "PASSED"; \
                done
 
                    echo "PASSED"; \
                done
 
+               ## other tests
+
                @gcov -o coverage scanner.c libscanner.c > /dev/null 2> /dev/null
                @mv *.gcov coverage; [ -f .gcov ] && mv .gcov coverage || true
                @awk '/NOTEST/ { next } /^ *[1-9]/ {ran+=1} /^ *###/ {skip+=1} \
                    END {printf "coverage: %6.2f%%\n", ran * 100 / (ran + skip); \
                @gcov -o coverage scanner.c libscanner.c > /dev/null 2> /dev/null
                @mv *.gcov coverage; [ -f .gcov ] && mv .gcov coverage || true
                @awk '/NOTEST/ { next } /^ *[1-9]/ {ran+=1} /^ *###/ {skip+=1} \
                    END {printf "coverage: %6.2f%%\n", ran * 100 / (ran + skip); \
-                        if (ran < (ran + skip) *0.90) exit(1) }' \
+                        if (ran < (ran + skip) *0.92) exit(1) }' \
                        coverage/scanner.mdc.gcov
                @rm -f .tmp*
 
                        coverage/scanner.mdc.gcov
                @rm -f .tmp*
 
@@ -715,6 +717,67 @@ sometimes aren't.
        15:0 newline()
        15:0 eof()
 
        15:0 newline()
        15:0 eof()
 
+## Nested tests.
+
+We need to test various aspects of tokenizing code that is stored
+in multiple nodes.  For example, comments and multi-line strings mustn't
+cross a node boundary.
+
+For this we tell `scanner` to extract sections directly from this file.
+As the file changes, line numbers might change as well, so we need to factor
+that out when testing.  A simple awk script can normalise the first line number
+to one.
+
+###### other tests
+       @for T in $(scanner_section_tests); do \
+          echo -n "Test $$T ... "; \
+          i="$IFS"; IFS=,; set $$T; IFS="$$i"; section="$$1"; shift; \
+           ./md2c scanner-tests.mdc "output: $$T" | grep -v '^#' > .tmp.want; \
+          ./coverage_scanner --file scanner-tests.mdc --section "test: $$section" \
+            $${1+"$$@"} | awk -F: ' BEGIN {OFS=":"} $$1 ~ /^[0-9]/ {if (!first) first = $$1 - 1; \
+                 $$1 = $$1 - first} { print } '> .tmp.have; \
+           if ! cmp -s .tmp.want .tmp.have; then \
+               echo "FAILED"; diff -u .tmp.want .tmp.have; exit 1; fi ; \
+           echo "PASSED"; \
+       done
+
+###### test list
+       scanner_section_tests += section1
+
+###### test: section1
+
+       foreach s in sections:
+               ## section2
+               print done
+
+###### section2
+
+               This is another
+       section
+
+###### output: section1
+       Tokenizing: test: section1
+       1:8 ident(foreach)
+       1:16 ident(s)
+       1:18 ident(in)
+       1:21 ident(sections)
+       1:29 mark(:)
+       7:16 in()
+       7:16 ident(This)
+       7:21 ident(is)
+       7:24 ident(another)
+       8:8 newline()
+       8:8 out()
+       8:8 in()
+       8:8 ident(section)
+       3:16 newline()
+       3:16 ident(print)
+       3:22 ident(done)
+       4:0 newline()
+       4:0 out()
+       4:0 newline()
+       4:0 eof()
+
 ## Ad-hoc test
 
 These tests test bugs that were found in practice, and so prevent them recuring.
 ## Ad-hoc test
 
 These tests test bugs that were found in practice, and so prevent them recuring.
index 9a040e1c39e97e1205dcccb5208919eb6b4203ad..777688482566796691d97f6df6cd3e8e5cf9e863 100644 (file)
@@ -807,9 +807,11 @@ information and return one token.
        }
 
 ###### white space
        }
 
 ###### white space
+       if (is_newline(ch))
+               state_check_node(state);
        if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                int newlines = 0;
        if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                int newlines = 0;
-               int was_son = at_son(state);
+               int was_nl = is_newline(ch);
                if (ignored & (1<<TK_in)) {
                        if (!is_newline(ch))
                                continue;
                if (ignored & (1<<TK_in)) {
                        if (!is_newline(ch))
                                continue;
@@ -820,17 +822,17 @@ information and return one token.
                        return tk;
                }
                // Indents are needed, so check all white space.
                        return tk;
                }
                // Indents are needed, so check all white space.
-               while (ch <= ' ' && !at_eon(state)) {
+               while (ch <= ' ' && ch != WEOF) {
                        if (is_newline(ch))
                                newlines += 1;
                        ch = get_char(state);
                        if (is_newline(ch))
                                newlines += 1;
                        ch = get_char(state);
+                       if (is_newline(ch))
+                               state_check_node(state);
                }
                }
-               if (at_eon(state)) {
-                       newlines += 1;
-               } else
+               if (ch != WEOF)
                        unget_char(state);
                state->delayed_lines = newlines;
                        unget_char(state);
                state->delayed_lines = newlines;
-               state->out_next = was_son;
+               state->out_next = !was_nl;
                state->check_indent = 1;
                continue;
        }
                state->check_indent = 1;
                continue;
        }
@@ -962,26 +964,37 @@ a flag that tells us whether or not we need to strip.
                }
        }
 
                }
        }
 
+       static void state_check_node(struct token_state *state)
+       {
+               if (!state->node)
+                       return;
+               if (state->node->code.len > state->offset)
+                       return;
+
+               do
+                       state->node = state->node->next;
+               while (state->node && state->node->code.txt == NULL);
+               state->offset = 0;
+               state->prev_offset = 0;
+               state->strip_offset = 0;
+               state->col = 0;
+               if (state->node == NULL)
+                       return;
+               state->line = state->node->line_no;
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
+       }
+
        static wint_t get_char(struct token_state *state)
        {
                wchar_t next;
                size_t n;
                mbstate_t mbstate;
 
        static wint_t get_char(struct token_state *state)
        {
                wchar_t next;
                size_t n;
                mbstate_t mbstate;
 
+               state_check_node(state);
                if (state->node == NULL)
                        return WEOF;
                if (state->node == NULL)
                        return WEOF;
-               if (state->node->code.len <= state->offset) {
-                       do
-                               state->node = state->node->next;
-                       while (state->node && state->node->code.txt == NULL);
-                       state->offset = 0;
-                       if (state->node == NULL)
-                               return WEOF;
-                       state->line = state->node->line_no;
-                       do_strip(state);
-                       state->col = state->node->needs_strip;
-                       state->strip_offset = state->offset;
-               }
 
                ## before get_char
 
 
                ## before get_char