Scanner: parsing of comments and strings must recognise end-of-node

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index 113c9d2ea199480e4f9d057a76f9c3895c5b6deb..37b336f2233c152b3cf398f385748cf647849c70 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -347,25 +347,34 @@ Known marks are included in the same list as the list of known words.
                         return tk;
                 }
                 prev = ch;
-               if (prev == '/')
-                       save_unget_state(state);
+               save_unget_state(state);
                 ch = get_char(state);
                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
                         break;
-               if (!(ignored && (1<<TK_line_comment)) &&
-                   prev == '/' && ch == '/') {
+               if (prev == '#')
+                       break;
+               if (prev == '/' && ch == '/' && tk.txt.len > 1) {
                         restore_unget_state(state);
                         break;
                 }
-               if (!(ignored && (1<<TK_block_comment)) &&
-                   prev == '/' && ch == '*') {
+               if (prev == '/' && ch == '*' && tk.txt.len > 1) {
                         restore_unget_state(state);
                         break;
                 }
         }
         unget_char(state);
-       if (tk.num != TK_error)
+       if (tk.num != TK_error) {
+               close_token(state, &tk);
                 return tk;
+       }
+
+If we don't find a known mark, we will check for strings and comments
+before assuming that we have an unknown mark.
+
+###### parse mark
+       ## parse string
+       ## parse comment
+       ## unknown mark
  
  ###### unknown mark
         if (tk.txt.len) {
@@ -441,7 +450,8 @@ followed by the start of a new string.
                          * unget so the newline is seen,
                          * but return rest of string as an error.
                          */
-                       unget_char(state);
+                       if (is_newline(ch))
+                               unget_char(state);
                         close_token(state, &tk);
                         tk.num = TK_error;
                         return tk;
@@ -475,14 +485,18 @@ If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
             !(ignored & (1<<TK_string))) {
                 wchar_t first = tk.txt.txt[0];
                 reset_token(state, &tk);
-               get_char(state);
-               do
+               ch = get_char(state);
+               tk.num = TK_error;
+               while (!at_eon(state) && !is_newline(ch)) {
                         ch = get_char(state);
-               while (ch != first && !is_newline(ch));
-               tk.num = TK_string;
-               if (is_newline(ch)) {
-                       unget_char(state);
-                       tk.num = TK_error;
+                       if (ch == first) {
+                               tk.num = TK_string;
+                               break;
+                       }
+                       if (is_newline(ch)) {
+                               unget_char(state);
+                               break;
+                       }
                 }
                 close_token(state, &tk);
                 return tk;
@@ -506,7 +520,7 @@ it would not suffer from this rule.
  
  These two comment types are reported as two separate token types, and
  consequently can be ignored separately.  When ignored a comment is
-parsed and discarded.
+still parsed, but is discarded.
  
  ###### token types
         TK_line_comment,
@@ -528,14 +542,16 @@ parsed and discarded.
  
  #### Single line comments
  
-A single-line comment continues up to, but not including the newline.
+A single-line comment continues up to, but not including, the newline
+or end of node.
  
  ###### parse comment
  
         if (is_line_comment(tk.txt)) {
-               while (!is_newline(ch))
+               while (!is_newline(ch) && !at_eon(state))
                         ch = get_char(state);
-               unget_char(state);
+               if (is_newline(ch))
+                       unget_char(state);
                 close_token(state, &tk);
                 tk.num = TK_line_comment;
                 if (ignored & (1 << TK_line_comment))
@@ -1109,9 +1125,6 @@ loop.
         ## parse number
         ## parse word
         ## parse mark
-       ## parse string
-       ## parse comment
-       ## unknown mark
  
  ### Start and stop