scanner: change the meaning of ignoring comment tokens.

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index fa27a89524a85d4eeb9a16019558e61a993afb5f..6b706411f5010e3da61b6214742f3db39e91de77 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
  There are a fixed set of token types, though particular tokens within
  those types can be distinguished via configuration.
  
  There are a fixed set of token types, though particular tokens within
  those types can be distinguished via configuration.
  
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
  
  ###### public types
         struct token_config {
                 int ignored;    // bit set of ignored tokens.
  
  ###### public types
         struct token_config {
                 int ignored;    // bit set of ignored tokens.
+               int return_comments;
                 ## token config parameters
         };
  
                 ## token config parameters
         };
  
@@ -354,10 +356,10 @@ immediately before a string is handled correctly.
  
  If the first character of a comment marker (i.e. '/') is a known mark,
  the above rules would suggest that the start of a comment would be
  
  If the first character of a comment marker (i.e. '/') is a known mark,
  the above rules would suggest that the start of a comment would be
-parsed as that mark, which is not what is wanted.  So the introductory
-sequences for a comment ("//" and "/*") are treated as
-partially-known.  They prevent the leading "/" from being a mark by
-itself, but do not actually constitute a stand-alone mark.
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
  
  If `TK_mark` is ignored, then unknown marks are returned as errors.
  
  
  If `TK_mark` is ignored, then unknown marks are returned as errors.
  
@@ -379,8 +381,7 @@ Known marks are included in the same list as the list of known words.
                         /* found a longest-known-mark, still need to
                          * check for comments
                          */
                         /* found a longest-known-mark, still need to
                          * check for comments
                          */
-                       if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
-                           (ch == '/' || ch == '*')) {
+                       if (is_comment(ignored, tk.txt)) {
                                 /* Yes, this is a comment, not a '/' */
                                 restore_unget_state(state);
                                 tk.num = TK_error;
                                 /* Yes, this is a comment, not a '/' */
                                 restore_unget_state(state);
                                 tk.num = TK_error;
@@ -393,22 +394,21 @@ Known marks are included in the same list as the list of known words.
                 prev = ch;
                 save_unget_state(state);
                 ch = get_char(state);
                 prev = ch;
                 save_unget_state(state);
                 ch = get_char(state);
+               if (n >= 0)
+                       /* No need to worry about other token types */
+                       continue;
                 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                         /* If strings are allowed, a quote (Which isn't a known mark)
                          * mustn't be treated as part of an unknown mark.  It can be
                 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
                         /* If strings are allowed, a quote (Which isn't a known mark)
                          * mustn't be treated as part of an unknown mark.  It can be
-                        * part of a multi-line srtings though.
+                        * part of a multi-line string though.
                          */
                         break;
                          */
                         break;
-               if (prev == '#' && n < 0)
-                       /* '#' is not a known mark, so assume it is a comment */
-                       break;
-               if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
-                       restore_unget_state(state);
-                       break;
-               }
-               if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
-                       close_token(state, &tk);
+
+               close_token(state, &tk);
+               if (is_comment(ignored, tk.txt)) {
+                       /* looks like a permitted comment, and not a known mark,
+                        * so assume it is a comment.
+                        */
                         restore_unget_state(state);
                         break;
                 }
                         restore_unget_state(state);
                         break;
                 }
@@ -466,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
  followed by the start of a new string.
  
  ###### parse string
  followed by the start of a new string.
  
  ###### parse string
-       if (tk.txt.len == 3 &&
+       if (tk.txt.len >= 3 &&
             !(ignored & (1 << TK_multi_string)) &&
             is_quote(tk.txt.txt[0]) &&
             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
             !(ignored & (1 << TK_multi_string)) &&
             is_quote(tk.txt.txt[0]) &&
             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -573,19 +573,29 @@ still parsed, but is discarded.
         TK_block_comment,
  
  ###### internal functions
         TK_block_comment,
  
  ###### internal functions
-       static int is_line_comment(struct text txt)
+       static int is_line_comment(int ignored, struct text txt)
         {
         {
+               if (ignored & (1 << TK_line_comment))
+                       return 0;
                 return (txt.len >= 1 && txt.txt[0] == '#') ||
                        (txt.len >= 2 && txt.txt[0] == '/' &&
                                         txt.txt[1] == '/');
         }
  
                 return (txt.len >= 1 && txt.txt[0] == '#') ||
                        (txt.len >= 2 && txt.txt[0] == '/' &&
                                         txt.txt[1] == '/');
         }
  
-       static int is_block_comment(struct text txt)
+       static int is_block_comment(int ignored, struct text txt)
         {
         {
+               if (ignored & (1 << TK_block_comment))
+                       return 0;
                 return txt.len >= 2 && txt.txt[0] == '/' &&
                        txt.txt[1] == '*';
         }
  
                 return txt.len >= 2 && txt.txt[0] == '/' &&
                        txt.txt[1] == '*';
         }
  
+       static int is_comment(int ignored, struct text txt)
+       {
+               return is_line_comment(ignored, txt) ||
+                      is_block_comment(ignored, txt);
+       }
+
  #### Single line comments
  
  A single-line comment continues up to, but not including the newline
  #### Single line comments
  
  A single-line comment continues up to, but not including the newline
@@ -593,14 +603,14 @@ or end of node.
  
  ###### parse comment
  
  
  ###### parse comment
  
-       if (is_line_comment(tk.txt)) {
+       if (is_line_comment(ignored, tk.txt)) {
                 while (!is_newline(ch) && !at_eon(state))
                         ch = get_char(state);
                 if (is_newline(ch))
                         unget_char(state);
                 close_token(state, &tk);
                 tk.num = TK_line_comment;
                 while (!is_newline(ch) && !at_eon(state))
                         ch = get_char(state);
                 if (is_newline(ch))
                         unget_char(state);
                 close_token(state, &tk);
                 tk.num = TK_line_comment;
-               if (ignored & (1 << TK_line_comment))
+               if (!state->conf->return_comments)
                         continue;
                 return tk;
         }
                         continue;
                 return tk;
         }
@@ -617,7 +627,7 @@ the unget state (explained later).
  
  ###### parse comment
  
  
  ###### parse comment
  
-       if (is_block_comment(tk.txt)) {
+       if (is_block_comment(ignored, tk.txt)) {
                 wchar_t prev;
                 int newlines = 0;
                 reset_token(state, &tk);
                 wchar_t prev;
                 int newlines = 0;
                 reset_token(state, &tk);
@@ -655,8 +665,7 @@ the unget state (explained later).
                         if (!is_newline(ch))
                                 tk.num = TK_error;
                 }
                         if (!is_newline(ch))
                                 tk.num = TK_error;
                 }
-               if (tk.num == TK_error ||
-                   !(ignored & (1 << TK_block_comment)))
+               if (tk.num == TK_error || state->conf->return_comments)
                         return tk;
                 continue;
         }
                         return tk;
                 continue;
         }
@@ -716,6 +725,8 @@ ignored.
         int     indent_level;
         int     indent_sizes[20];
  
         int     indent_level;
         int     indent_sizes[20];
  
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
  #### Newlines
  
  Newlines can optionally be reported.  Newlines within a block comment
  #### Newlines
  
  Newlines can optionally be reported.  Newlines within a block comment
@@ -796,10 +807,20 @@ Separately we need, on each call to `token_next`, to check if
  there are some delayed tokens and if so we need to advance the state
  information and return one token.
  
  there are some delayed tokens and if so we need to advance the state
  information and return one token.
  
+###### internal functions
+       static int state_indent(struct token_state *state)
+       {
+               if (state->node == NULL)
+                       return state->col;
+               return state->node->indent - state->node->needs_strip + state->col;
+       }
+
  ###### white space
  ###### white space
+       if (is_newline(ch))
+               state_check_node(state);
         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                 int newlines = 0;
         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                 int newlines = 0;
-               int was_son = at_son(state);
+               int was_nl = is_newline(ch);
                 if (ignored & (1<<TK_in)) {
                         if (!is_newline(ch))
                                 continue;
                 if (ignored & (1<<TK_in)) {
                         if (!is_newline(ch))
                                 continue;
@@ -810,22 +831,17 @@ information and return one token.
                         return tk;
                 }
                 // Indents are needed, so check all white space.
                         return tk;
                 }
                 // Indents are needed, so check all white space.
-               while (ch <= ' ' && !at_eon(state)) {
+               while (ch <= ' ' && ch != WEOF) {
                         if (is_newline(ch))
                                 newlines += 1;
                         ch = get_char(state);
                         if (is_newline(ch))
                                 newlines += 1;
                         ch = get_char(state);
+                       if (is_newline(ch))
+                               state_check_node(state);
                 }
                 }
-               if (at_eon(state)) {
-                       newlines += 1;
-                       if (state->node->next &&
-                           state->node->next->indent > state->node->indent)
-                               state->col = state->node->next->indent;
-                       else
-                               state->col = state->node->indent;
-               } else
+               if (ch != WEOF)
                         unget_char(state);
                 state->delayed_lines = newlines;
                         unget_char(state);
                 state->delayed_lines = newlines;
-               state->out_next = was_son;
+               state->out_next = !was_nl;
                 state->check_indent = 1;
                 continue;
         }
                 state->check_indent = 1;
                 continue;
         }
@@ -833,7 +849,7 @@ information and return one token.
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
-               if (state->col < state->indent_sizes[state->indent_level]) {
+               if (state_indent(state) < state->indent_sizes[state->indent_level]) {
                         if (!state->out_next &&
                             !(ignored & (1<<TK_newline))) {
                                 state->out_next = 1;
                         if (!state->out_next &&
                             !(ignored & (1<<TK_newline))) {
                                 state->out_next = 1;
@@ -845,11 +861,12 @@ information and return one token.
                         tk.num = TK_out;
                         return tk;
                 }
                         tk.num = TK_out;
                         return tk;
                 }
-               if (state->col > state->indent_sizes[state->indent_level] &&
+               if (state_indent(state) > state->indent_sizes[state->indent_level] &&
                     state->indent_level < sizeof(state->indent_sizes)-1) {
                         state->indent_level += 1;
                     state->indent_level < sizeof(state->indent_sizes)-1) {
                         state->indent_level += 1;
-                       state->indent_sizes[state->indent_level] = state->col;
-                       state->delayed_lines -= 1;
+                       state->indent_sizes[state->indent_level] = state_indent(state);
+                       if (state->delayed_lines)
+                               state->delayed_lines -= 1;
                         tk.num = TK_in;
                         return tk;
                 }
                         tk.num = TK_in;
                         return tk;
                 }
@@ -874,11 +891,6 @@ tokens will continue to return the same end-of-file token.
  
  ###### white space
         if (ch == WEOF) {
  
  ###### white space
         if (ch == WEOF) {
-               if (state->col) {
-                       state->col = 0;
-                       state->check_indent = 1;
-                       continue;
-               }
                 tk.num = TK_eof;
                 return tk;
         }
                 tk.num = TK_eof;
                 return tk;
         }
@@ -938,7 +950,7 @@ a flag that tells us whether or not we need to strip.
  
  ###### internal functions
  
  
  ###### internal functions
  
-       static int do_strip(struct token_state *state)
+       static void do_strip(struct token_state *state)
         {
                 int indent = 0;
                 if (state->node->needs_strip) {
         {
                 int indent = 0;
                 if (state->node->needs_strip) {
@@ -954,7 +966,28 @@ a flag that tells us whether or not we need to strip.
                                 n -= 4;
                         }
                 }
                                 n -= 4;
                         }
                 }
-               return indent;
+       }
+
+       static void state_check_node(struct token_state *state)
+       {
+               if (!state->node)
+                       return;
+               if (state->node->code.len > state->offset)
+                       return;
+
+               do
+                       state->node = state->node->next;
+               while (state->node && state->node->code.txt == NULL);
+               state->offset = 0;
+               state->prev_offset = 0;
+               state->strip_offset = 0;
+               state->col = 0;
+               if (state->node == NULL)
+                       return;
+               state->line = state->node->line_no;
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
         }
  
         static wint_t get_char(struct token_state *state)
         }
  
         static wint_t get_char(struct token_state *state)
@@ -963,19 +996,9 @@ a flag that tells us whether or not we need to strip.
                 size_t n;
                 mbstate_t mbstate;
  
                 size_t n;
                 mbstate_t mbstate;
  
+               state_check_node(state);
                 if (state->node == NULL)
                         return WEOF;
                 if (state->node == NULL)
                         return WEOF;
-               if (state->node->code.len <= state->offset) {
-                       do
-                               state->node = state->node->next;
-                       while (state->node && state->node->code.txt == NULL);
-                       state->offset = 0;
-                       if (state->node == NULL)
-                               return WEOF;
-                       state->line = state->node->line_no;
-                       state->col = do_strip(state);
-                       state->strip_offset = state->offset;
-               }
  
                 ## before get_char
  
  
                 ## before get_char
  
@@ -986,12 +1009,12 @@ a flag that tells us whether or not we need to strip.
                             &mbstate);
                 if (n == -2 || n == 0) {
                         /* Not enough bytes - not really possible */
                             &mbstate);
                 if (n == -2 || n == 0) {
                         /* Not enough bytes - not really possible */
-                       next = '\n';
-                       state->offset = state->node->code.len;
+                       next = '\n';                            // NOTEST
+                       state->offset = state->node->code.len;  // NOTEST
                 } else if (n == -1) {
                         /* error */
                 } else if (n == -1) {
                         /* error */
-                       state->offset += 1;
-                       next = 0x7f; // an illegal character
+                       state->offset += 1;                     // NOTEST
+                       next = 0x7f; // an illegal character    // NOTEST
                 } else
                         state->offset += n;
  
                 } else
                         state->offset += n;
  
@@ -999,7 +1022,8 @@ a flag that tells us whether or not we need to strip.
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
-                       state->col = do_strip(state);
+                       do_strip(state);
+                       state->col = state->node->needs_strip;
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
@@ -1211,7 +1235,8 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
-               state->col = do_strip(state);
+               do_strip(state);
+               state->col = state->node->needs_strip;
                 state->strip_offset = state->offset;
                 state->conf = conf;
                 return state;
                 state->strip_offset = state->offset;
                 state->conf = conf;
                 return state;
@@ -1654,7 +1679,7 @@ Number parsing goes in `libnumber.c`
         ## number includes
         ## number functions
  
         ## number includes
         ## number functions
  
-###### File: number.h
+###### File: parse_number.h
         int number_parse(mpq_t num, char tail[3], struct text tok);
  
  ###### File: scanner.mk
         int number_parse(mpq_t num, char tail[3], struct text tok);
  
  ###### File: scanner.mk
@@ -1938,7 +1963,7 @@ String parsing goes in `libstring.c`
         ## string functions
         ## string main
  
         ## string functions
         ## string main
  
-###### File: string.h
+###### File: parse_string.h
         int string_parse(struct token *tok, char escape,
                          struct text *str, char tail[3]);
  
         int string_parse(struct token *tok, char escape,
                          struct text *str, char tail[3]);
  
@@ -1967,8 +1992,8 @@ the tokens one per line.
         #include <getopt.h>
         #include "mdcode.h"
         #include "scanner.h"
         #include <getopt.h>
         #include "mdcode.h"
         #include "scanner.h"
-       #include "number.h"
-       #include "string.h"
+       #include "parse_number.h"
+       #include "parse_string.h"
  
         static int errs;
         static void pr_err(char *msg)
  
         static int errs;
         static void pr_err(char *msg)
@@ -2021,13 +2046,17 @@ the tokens one per line.
                         { "ignore-newline",     0, NULL, 'l'},
                         { "ignore-block-comment", 0, NULL, 'C'},
                         { "ignore-indent",      0, NULL, 'i'},
                         { "ignore-newline",     0, NULL, 'l'},
                         { "ignore-block-comment", 0, NULL, 'C'},
                         { "ignore-indent",      0, NULL, 'i'},
+                       { "return-comments",    0, NULL, 'r'},
                         { "file",               1, NULL, 'f'},
                         { "file",               1, NULL, 'f'},
+                       { "section",            1, NULL, 's'},
                         { NULL,                 0, NULL, 0},
                 };
                         { NULL,                 0, NULL, 0},
                 };
-               static const char options[] = "W:w:n:NIMSzclCif:";
+               static const char options[] = "W:w:n:NIMSzclCirf:s:";
  
                 struct section *table, *s, *prev;
                 int opt;
  
                 struct section *table, *s, *prev;
                 int opt;
+               char *section_name = NULL;
+               int section_found = 0;
  
                 setlocale(LC_ALL,"");
                 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
  
                 setlocale(LC_ALL,"");
                 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
@@ -2045,7 +2074,9 @@ the tokens one per line.
                         case 'C': conf.ignored |= 1 << TK_block_comment; break;
                         case 'l': conf.ignored |= 1 << TK_newline; break;
                         case 'i': conf.ignored |= 1 << TK_in; break;
                         case 'C': conf.ignored |= 1 << TK_block_comment; break;
                         case 'l': conf.ignored |= 1 << TK_newline; break;
                         case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'r': conf.return_comments = 1; break;
                         case 'f': filename = optarg; break;
                         case 'f': filename = optarg; break;
+                       case 's': section_name = optarg; break;
                         default: fprintf(stderr, "scanner: unknown option '%c'.\n",
                                          opt);
                                 exit(1);
                         default: fprintf(stderr, "scanner: unknown option '%c'.\n",
                                          opt);
                                 exit(1);
@@ -2082,6 +2113,12 @@ the tokens one per line.
  
                 for (s = table; s;
                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
  
                 for (s = table; s;
                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
+                       if (section_name &&
+                           (s->section.len != strlen(section_name) ||
+                            strncmp(s->section.txt, section_name, s->section.len) != 0))
+                               continue;
+                       if (section_name)
+                               section_found = 1;
                         printf("Tokenizing: %.*s\n", s->section.len,
                                 s->section.txt);
                         state = token_open(s->code, &conf);
                         printf("Tokenizing: %.*s\n", s->section.len,
                                 s->section.txt);
                         state = token_open(s->code, &conf);
@@ -2124,6 +2161,10 @@ the tokens one per line.
                 }
                 if (conf.words_marks != known)
                         free(conf.words_marks);
                 }
                 if (conf.words_marks != known)
                         free(conf.words_marks);
+               if (section_name && !section_found) {
+                       fprintf(stderr, "scanner: section %s not found\n", section_name);
+                       errs = 1;
+               }
                 exit(!!errs);
         }
  ###### File: scanner.mk
                 exit(!!errs);
         }
  ###### File: scanner.mk