scanner: improve transition from node to node.

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index e54dac686bb8906734ba3bf712df8c4aa763edc6..777688482566796691d97f6df6cd3e8e5cf9e863 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -716,6 +716,8 @@ ignored.
         int     indent_level;
         int     indent_sizes[20];
  
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
  #### Newlines
  
  Newlines can optionally be reported.  Newlines within a block comment
@@ -750,7 +752,7 @@ blank lines that have been skipped.
  
  When a Newline leads to the next block of code there is a question of
  whether the various Newline and OUT/IN tokens should appear to
-pbelong to the earlier or later block.  This is addressed by processing
+belong to the earlier or later block.  This is addressed by processing
  the tokens in two stages based on the relative indent levels of the
  two blocks (each block has a base indent to which the actual indents
  are added).
@@ -787,19 +789,29 @@ For this we store one more than the number of blank lines as
         int delayed_lines;
         int out_next;
  
-Generating these tokens involve two separate pieces of code.
+Generating these tokens involves two separate pieces of code.
  
  Firstly we need to recognise white space and count the indents and
  newlines.  These are recorded in the above state fields.
  
-Separately we need, on each call to `token_next`, we need to check if
+Separately we need, on each call to `token_next`, to check if
  there are some delayed tokens and if so we need to advance the state
  information and return one token.
  
+###### internal functions
+       static int state_indent(struct token_state *state)
+       {
+               /* Column rebased onto the node's indent; plain column if no node. */
+               int base = state->node ? state->node->indent - state->node->needs_strip : 0;
+               return base + state->col;
+       }
+
  ###### white space
+       if (is_newline(ch))
+               state_check_node(state);
         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                 int newlines = 0;
-               int was_son = at_son(state);
+               int was_nl = is_newline(ch);
                 if (ignored & (1<<TK_in)) {
                         if (!is_newline(ch))
                                 continue;
@@ -810,22 +822,17 @@ information and return one token.
                         return tk;
                 }
                 // Indents are needed, so check all white space.
-               while (ch <= ' ' && !at_eon(state)) {
+               while (ch <= ' ' && ch != WEOF) {
                         if (is_newline(ch))
                                 newlines += 1;
                         ch = get_char(state);
+                       if (is_newline(ch))
+                               state_check_node(state);
                 }
-               if (at_eon(state)) {
-                       newlines += 1;
-                       if (state->node->next &&
-                           state->node->next->indent > state->node->indent)
-                               state->col = state->node->next->indent;
-                       else
-                               state->col = state->node->indent;
-               } else
+               if (ch != WEOF)
                         unget_char(state);
                 state->delayed_lines = newlines;
-               state->out_next = was_son;
+               state->out_next = !was_nl;
                 state->check_indent = 1;
                 continue;
         }
@@ -833,7 +840,7 @@ information and return one token.
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
-               if (state->col < state->indent_sizes[state->indent_level]) {
+               if (state_indent(state) < state->indent_sizes[state->indent_level]) {
                         if (!state->out_next &&
                             !(ignored & (1<<TK_newline))) {
                                 state->out_next = 1;
@@ -845,11 +852,12 @@ information and return one token.
                         tk.num = TK_out;
                         return tk;
                 }
-               if (state->col > state->indent_sizes[state->indent_level] &&
+               if (state_indent(state) > state->indent_sizes[state->indent_level] &&
                     state->indent_level < sizeof(state->indent_sizes)-1) {
                         state->indent_level += 1;
-                       state->indent_sizes[state->indent_level] = state->col;
-                       state->delayed_lines -= 1;
+                       state->indent_sizes[state->indent_level] = state_indent(state);
+                       if (state->delayed_lines)
+                               state->delayed_lines -= 1;
                         tk.num = TK_in;
                         return tk;
                 }
@@ -934,10 +942,11 @@ a flag that tells us whether or not we need to strip.
         int    offset;
         int    line;
         int    col;
+       int    strip_offset;
  
  ###### internal functions
  
-       static int do_strip(struct token_state *state)
+       static void do_strip(struct token_state *state)
         {
                 int indent = 0;
                 if (state->node->needs_strip) {
@@ -953,7 +962,28 @@ a flag that tells us whether or not we need to strip.
                                 n -= 4;
                         }
                 }
-               return indent;
+       }
+
+       static void state_check_node(struct token_state *state)
+       {
+               /* Move to the next code_node once the current one is used up. */
+               if (state->node == NULL || state->offset < state->node->code.len)
+                       return;
+               /* Skip nodes that carry no text; we may run off the end. */
+               do {
+                       state->node = state->node->next;
+               } while (state->node != NULL && state->node->code.txt == NULL);
+               /* Positions restart from zero, even if no node remains. */
+               state->col = 0;
+               state->strip_offset = 0;
+               state->prev_offset = 0;
+               state->offset = 0;
+               if (state->node != NULL) {
+                       state->line = state->node->line_no;
+                       do_strip(state);
+                       state->col = state->node->needs_strip;
+                       state->strip_offset = state->offset; /* sampled after do_strip() */
+               }
         }
  
         static wint_t get_char(struct token_state *state)
@@ -962,18 +992,9 @@ a flag that tells us whether or not we need to strip.
                 size_t n;
                 mbstate_t mbstate;
  
+               state_check_node(state);
                 if (state->node == NULL)
                         return WEOF;
-               if (state->node->code.len <= state->offset) {
-                       do
-                               state->node = state->node->next;
-                       while (state->node && state->node->code.txt == NULL);
-                       state->offset = 0;
-                       if (state->node == NULL)
-                               return WEOF;
-                       state->line = state->node->line_no;
-                       state->col = do_strip(state);
-               }
  
                 ## before get_char
  
@@ -997,7 +1018,8 @@ a flag that tells us whether or not we need to strip.
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
-                       state->col = do_strip(state);
+                       do_strip(state);
+                       state->col = state->node->needs_strip;
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
@@ -1116,7 +1138,7 @@ parsed too much already.  For that there is `reset_token`.
                 tok->txt.len = 0;
         }
  
-Tokens make not cross into the next `code_node`, and some tokens can
+As tokens may not cross into the next `code_node`, and some tokens can
  include the newline at the and of a `code_node`, we must be able to
  easily check if we have reached the end.  Equally we need to know if
  we are at the start of a node, as white space is treated a little
@@ -1126,7 +1148,7 @@ differently there.
  
         static int at_son(struct token_state *state)
         {
-               return state->offset == 0;
+               return state->prev_offset <= state->strip_offset;
         }
  
         static int at_eon(struct token_state *state)
@@ -1209,7 +1231,9 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
-               state->col = do_strip(state);
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
                 state->conf = conf;
                 return state;
         }
@@ -2019,12 +2043,15 @@ the tokens one per line.
                         { "ignore-block-comment", 0, NULL, 'C'},
                         { "ignore-indent",      0, NULL, 'i'},
                         { "file",               1, NULL, 'f'},
+                       { "section",            1, NULL, 's'},
                         { NULL,                 0, NULL, 0},
                 };
-               static const char options[] = "W:w:n:NIMSzclCif:";
+               static const char options[] = "W:w:n:NIMSzclCif:s:";
  
                 struct section *table, *s, *prev;
                 int opt;
+               char *section_name = NULL;
+               int section_found = 0;
  
                 setlocale(LC_ALL,"");
                 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
@@ -2043,6 +2070,7 @@ the tokens one per line.
                         case 'l': conf.ignored |= 1 << TK_newline; break;
                         case 'i': conf.ignored |= 1 << TK_in; break;
                         case 'f': filename = optarg; break;
+                       case 's': section_name = optarg; break;
                         default: fprintf(stderr, "scanner: unknown option '%c'.\n",
                                          opt);
                                 exit(1);
@@ -2079,6 +2107,12 @@ the tokens one per line.
  
                 for (s = table; s;
                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
+                       if (section_name &&
+                           (s->section.len != strlen(section_name) ||
+                            strncmp(s->section.txt, section_name, s->section.len) != 0))
+                               continue;
+                       if (section_name)
+                               section_found = 1;
                         printf("Tokenizing: %.*s\n", s->section.len,
                                 s->section.txt);
                         state = token_open(s->code, &conf);
@@ -2121,6 +2155,10 @@ the tokens one per line.
                 }
                 if (conf.words_marks != known)
                         free(conf.words_marks);
+               if (section_name && !section_found) {
+                       fprintf(stderr, "scanner: section %s not found\n", section_name);
+                       errs = 1;
+               }
                 exit(!!errs);
         }
  ###### File: scanner.mk