parsergen: improve symbol-discard in error handling.

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index d15b44b88d200984552805d856d50e3ff8730e85..a5eeb1f128b5163b1715b263a033a3a64265c73a 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -106,7 +106,7 @@ token numbers from `TK_reserved` upwards.
  ### Numbers
  
  Numbers are the messiest tokens to parse, primarily because they can
-contain characters that also have meaning outside of number and,
+contain characters that also have meaning outside of numbers and,
  particularly, immediately after numbers.
  
  The obvious example is the '`-`' sign.  It can come inside a number for
@@ -260,7 +260,7 @@ and the length of the list must be given (`known_count`).
  Tokens matching these known words are reported as the index of the
  list added to `TK_reserved`.
  
-If identifiers are ignored, then any work which is not listed as a
+If identifiers are ignored, then any word which is not listed as a
  known word results in an error.
  
  ###### token config parameters
@@ -324,10 +324,17 @@ below before giving up and assuming an unknown mark.
  
  If an unknown mark contains a quote character or a comment marker, and
  that token is not being ignored, then we terminate the unknown mark
-before that quote or comment.  This ensure that an unknown mark
+before that quote or comment.  This ensures that an unknown mark
  immediately before a string is handled correctly.
  
-If `TK_mark` is ignored, then unknown marks as returned as an error.
+If the first character of a comment marker (i.e. '/') is a known mark,
+the above rules would suggest that the start of a comment would be
+parsed as that mark, which is not what is wanted.  So the introductory
+sequences for a comment ("//" and "/*") are treated as
+partially-known.  They prevent the leading "/" from being a mark by
+itself, but do not actually constitute a stand-alone mark.
+
+If `TK_mark` is ignored, then unknown marks are returned as errors.
  
  ###### token types
         TK_mark,
@@ -344,7 +351,16 @@ Known marks are included in the same list as the list of known words.
                 if (n >= 0)
                         tk.num = TK_reserved + n;
                 else if (tk.num != TK_error) {
-                       /* found a longest-known-mark */
+                       /* found a longest-known-mark, still need to
+                        * check for comments
+                        */
+                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
+                           (ch == '/' || ch == '*')) {
+                               /* Yes, this is a comment, not a '/' */
+                               restore_unget_state(state);
+                               tk.num = TK_error;
+                               break;
+                       }
                         unget_char(state);
                         close_token(state, &tk);
                         return tk;
@@ -354,13 +370,16 @@ Known marks are included in the same list as the list of known words.
                 ch = get_char(state);
                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
                         break;
-               if (prev == '#')
+               if (prev == '#' && n < 0)
+                       /* '#' is not a known mark, so assume it is a comment */
                         break;
-               if (prev == '/' && ch == '/' && tk.txt.len > 1) {
+               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
+                       close_token(state, &tk);
                         restore_unget_state(state);
                         break;
                 }
-               if (prev == '/' && ch == '*' && tk.txt.len > 1) {
+               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
+                       close_token(state, &tk);
                         restore_unget_state(state);
                         break;
                 }
@@ -395,7 +414,7 @@ and continue until a matching character on the same line.  Any of
  these characters can be included in the list of known marks and then
  they will not be used for identifying strings.
  
-Immediately following the close quote one or two ASCII letters may
+Immediately following the close quote, one or two ASCII letters may
  appear.  These are somewhat like the arbitrary letters allowed in
  "Numbers" above.  They can be used by the language in various ways.
  
@@ -882,19 +901,23 @@ a flag that tells us whether or not we need to strip.
  
  ###### internal functions
  
-       static void do_strip(struct token_state *state)
+       static int do_strip(struct token_state *state) /* skips the leading indent of a line when the node needs stripping; returns the column that leaves us at */
         {
+               int indent = 0; /* width of the leading whitespace consumed below */
                 if (state->node->needs_strip) {
                         int n = 4;
                         while (n && state->node->code.txt[state->offset] == ' ') {
+                               indent += 1; /* each stripped space advances one column */
                                 state->offset += 1;
                                 n -= 1;
                         }
                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+                               indent = indent_tab(indent); /* a stripped tab advances the column via indent_tab() */
                                 state->offset += 1;
                                 n -= 4;
                         }
                 }
+               return indent; /* stays 0 when needs_strip is unset or nothing was stripped */
         }
  
         static wint_t get_char(struct token_state *state)
@@ -912,9 +935,8 @@ a flag that tells us whether or not we need to strip.
                         state->offset = 0;
                         if (state->node == NULL)
                                 return WEOF;
-                       do_strip(state);
                         state->line = state->node->line_no;
-                       state->col = state->node->indent;
+                       state->col = do_strip(state);
                 }
  
                 ## before get_char
@@ -939,8 +961,7 @@ a flag that tells us whether or not we need to strip.
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
-                       state->col = state->node->indent;
-                       do_strip(state);
+                       state->col = do_strip(state);
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
@@ -1150,9 +1171,8 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
-               state->col = code->indent;
+               state->col = do_strip(state);
                 state->conf = conf;
-               do_strip(state);
                 return state;
         }
         void token_close(struct token_state *state)
@@ -1762,7 +1782,7 @@ required indent is found.
                 if (c == ' ')
                         skipped += 1;
                 else if (c == '\t')
-                       skipped = indent_tab(c);
+                       skipped = indent_tab(skipped);
                 else
                         break;
                 i+= 1;
@@ -2010,4 +2030,3 @@ the tokens one per line.
                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
         scanner.o : scanner.c
                 $(CC) $(CFLAGS) -c scanner.c
-