parsergen.mdc: add precedence handling

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index cbb8d6f4311b9cf479ac9b789bdd467b4bbd1eaf..d15b44b88d200984552805d856d50e3ff8730e85 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -90,7 +90,7 @@ The different tokens are numbers, words, marks, strings, comments,
  newlines, EOF, and indents, each of which is examined in detail below.
  
  There are various cases where no token can be found in part of the
-input.  All of these will be reported as an `TK_error` token.
+input.  All of these will be reported as a `TK_error` token.
  
  It is possible to declare a number of strings which form distinct
  tokens (rather than being grouped as e.g. 'word').  These are given
@@ -260,6 +260,9 @@ and the length of the list must be given (`known_count`).
  Tokens matching these known words are reported as the index of the
  list added to `TK_reserved`.
  
+If identifiers are ignored, then any word which is not listed as a
+known word results in an error.
+
  ###### token config parameters
         const char **words_marks;
         int known_count;
@@ -368,6 +371,14 @@ Known marks are included in the same list as the list of known words.
                 return tk;
         }
  
+If we don't find a known mark, we will check for strings and comments
+before assuming that we have an unknown mark.
+
+###### parse mark
+       ## parse string
+       ## parse comment
+       ## unknown mark
+
  ###### unknown mark
         if (tk.txt.len) {
                 if (ignored & (1<<TK_mark))
@@ -442,7 +453,8 @@ followed by the start of a new string.
                          * unget so the newline is seen,
                          * but return rest of string as an error.
                          */
-                       unget_char(state);
+                       if (is_newline(ch))
+                               unget_char(state);
                         close_token(state, &tk);
                         tk.num = TK_error;
                         return tk;
@@ -476,14 +488,18 @@ If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
             !(ignored & (1<<TK_string))) {
                 wchar_t first = tk.txt.txt[0];
                 reset_token(state, &tk);
-               get_char(state);
-               do
+               ch = get_char(state);
+               tk.num = TK_error;
+               while (!at_eon(state) && !is_newline(ch)) {
                         ch = get_char(state);
-               while (ch != first && !is_newline(ch));
-               tk.num = TK_string;
-               if (is_newline(ch)) {
-                       unget_char(state);
-                       tk.num = TK_error;
+                       if (ch == first) {
+                               tk.num = TK_string;
+                               break;
+                       }
+                       if (is_newline(ch)) {
+                               unget_char(state);
+                               break;
+                       }
                 }
                 close_token(state, &tk);
                 return tk;
@@ -529,14 +545,16 @@ still parsed, but is discarded.
  
  #### Single line comments
  
-A single-line comment continues up to, but not including the newline.
+A single-line comment continues up to, but not including, the newline
+or end of node.
  
  ###### parse comment
  
         if (is_line_comment(tk.txt)) {
-               while (!is_newline(ch))
+               while (!is_newline(ch) && !at_eon(state))
                         ch = get_char(state);
-               unget_char(state);
+               if (is_newline(ch))
+                       unget_char(state);
                 close_token(state, &tk);
                 tk.num = TK_line_comment;
                 if (ignored & (1 << TK_line_comment))
@@ -815,6 +833,11 @@ tokens will continue to return the same end-of-file token.
  
  ###### white space
         if (ch == WEOF) {
+               if (state->col) {
+                       state->col = 0;
+                       state->check_indent = 1;
+                       continue;
+               }
                 tk.num = TK_eof;
                 return tk;
         }
@@ -1110,9 +1133,6 @@ loop.
         ## parse number
         ## parse word
         ## parse mark
-       ## parse string
-       ## parse comment
-       ## unknown mark
  
  ### Start and stop