scanner: don't allow an unknown mark to run into a string or comment

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index 58ccd6b9c6c8017214de5c4316892e053230f797..7e33d0cbc1a89126706d2f72ce1f2351e0d395ee 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -318,6 +318,12 @@ in a known mark, it will return that first known mark.
  
  If no known mark is found we will test against strings and comments
  below before giving up and assuming an unknown mark.
+
+If an unknown mark contains a quote character or a comment marker, and
+that token is not being ignored, then we terminate the unknown mark
+before that quote or comment.  This ensures that an unknown mark
+immediately before a string is handled correctly.
+
  If `TK_mark` is ignored, then unknown marks as returned as an error.
  
  ###### token types
@@ -329,6 +335,7 @@ Known marks are included in the same list as the list of known words.
         tk.num = TK_error;
         while (is_mark(ch, state->conf)) {
                 int n;
+               wchar_t prev;
                 close_token(state, &tk);
                 n = find_known(state->conf, tk.txt);
                 if (n >= 0)
@@ -339,7 +346,22 @@ Known marks are included in the same list as the list of known words.
                         close_token(state, &tk);
                         return tk;
                 }
+               prev = ch;
+               if (prev == '/')
+                       save_unget_state(state);
                 ch = get_char(state);
+               if (!(ignored & (1<<TK_string)) && is_quote(ch))
+                       break;  /* strings are wanted: stop the mark before the quote */
+               if (!(ignored & (1<<TK_line_comment)) &&
+                   prev == '/' && ch == '/') {
+                       restore_unget_state(state);  /* rewind to the state saved before this get_char() */
+                       break;
+               }
+               if (!(ignored & (1<<TK_block_comment)) &&
+                   prev == '/' && ch == '*') {
+                       restore_unget_state(state);  /* rewind to the state saved before this get_char() */
+                       break;
+               }
         }
         unget_char(state);
         if (tk.num != TK_error)
@@ -722,6 +744,7 @@ information and return one token.
                         if (ignored & (1<<TK_newline))
                                 continue;
                         tk.num = TK_newline;
+                       close_token(state, &tk);
                         return tk;
                 }
                 // Indents are needed, so check all white space.
@@ -843,7 +866,7 @@ a flag that tells us whether or not we need to strip.
                                 state->offset += 1;
                                 n -= 1;
                         }
-                       while (n == 4 && state->node->code.txt[0] == '\t') {
+                       while (n == 4 && state->node->code.txt[state->offset] == '\t') {
                                 state->offset += 1;
                                 n -= 4;
                         }
@@ -1106,7 +1129,9 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
+               state->col = code->indent;
                 state->conf = conf;
+               do_strip(state);
                 return state;
         }
         void token_close(struct token_state *state)