oceani: simplify test in var_block_close.

[ocean] / csrc / scanner.mdc
diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc

index 547a037f0c12ab07dbce59839e11ba3056e33705..6b706411f5010e3da61b6214742f3db39e91de77 100644 (file)
--- a/csrc/scanner.mdc
+++ b/csrc/scanner.mdc
@@ -19,7 +19,7 @@ The text is assumed to be UTF-8 though some matching assumes the
  ASCII subset.  If the text provided does not conform to UTF-8 an error
  will be reported and some number of bytes will be skipped.
  
-###### includes
+###### public types
         #include <wchar.h>
         #include <wctype.h>
         #include <unicode/uchar.h>
@@ -69,13 +69,15 @@ The scanner is not completely general, yet not completely specified.
  There are a fixed set of token types, though particular tokens within
  those types can be distinguish via configuration.
  
-Most token types may be explicitly ignored, as typically comments
-would be.  The exact consequence of ignoring each token type varies
-from token to token.
+Most token types may be explicitly ignored, so they aren't parsed.
+Comments are typically parsed but not returned, but an option is provided to
+return comments for further processing.  The exact consequence of
+ignoring each token type varies from token to token.
  
  ###### public types
         struct token_config {
                 int ignored;    // bit set of ignored tokens.
+               int return_comments;
                 ## token config parameters
         };
  
@@ -85,12 +87,11 @@ from token to token.
  ###### token_next init
         int ignored = state->conf->ignored;
  
-
  The different tokens are numbers, words, marks, strings, comments,
  newlines, EOF, and indents, each of which is examined in detail below.
  
  There are various cases where no token can be found in part of the
-input.  All of these will be reported as an `TK_error` token.
+input.  All of these will be reported as a `TK_error` token.
  
  It is possible to declare a number of strings which form distinct
  tokens (rather than being grouped as e.g. 'word').  These are given
@@ -106,7 +107,7 @@ token numbers from `TK_reserved` upwards.
  ### Numbers
  
  Numbers are the messiest tokens to parse, primarily because they can
-contain characters that also have meaning outside of number and,
+contain characters that also have meaning outside of numbers and,
  particularly, immediately after numbers.
  
  The obvious example is the '`-`' sign.  It can come inside a number for
@@ -120,7 +121,11 @@ To make matters worse, our language designer has decided to experiment
  with allowing commas to be used as the decimal indicator, and spaces
  to be used to separate groups of digits in large numbers.  Both of
  these can reasonably be restricted to appear between two digits, so we
-have to add that condition to our tests.
+have to add that condition to our tests.  For consistency we require
+every non-alpha-numeric to appear between two hex digits, with the
+exception that a sign can appear only after a 'p' or 'e', and a space
+can only appear between decimal digits.  Allowing a space before a
+letter easily leads to confusion, such as in `a < 3 and b < 4`.
  
  So we cannot just treat numbers as starting with a digit and being
  followed by some set of characters.  We need more structure than that.
@@ -128,13 +133,16 @@ followed by some set of characters.  We need more structure than that.
  So:
  
  - Numbers must start with a digit.
-- If the first digit is zero, the next character must be a base
-  signifier (one of `xob`) or a decimal marker (`.` or `,`).
-  In the first case the first `p` or `P` may be followed by a sign.
+- If the first digit is zero, the next character should be a base
+  signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
+  enforced at this stage).
+  In the first case only the first `p` or `P` may be followed by a sign.
  - If the number doesn't start with `0` followed by one of `xob`, the
    first `e` may be followed by a sign.
-- Any digit or hex digit may be followed by a space or underscore
-  providing that the subsequence character is also a (hex) digit.
+- A sign must always be followed by a digit.
+- Any digit may be followed by a space or underscore and any hex digit
+  may be followed by an underscore, providing that the subsequent character
+  is also a digit (for space) or hex digit (for underscore).
    This rule will require an extra level of 'unget' to be
    supported when handling characters.
  - Otherwise any digits or ASCII letters are allowed.  We do not at
@@ -164,7 +172,7 @@ are declared to be a start character for words.
  ###### parse number
  
         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
-               int prev_special = 0;
+               int prev = 0;
                 int expect_p = 0;
                 int decimal_mark = 0;
                 if (ch == '0') {
@@ -177,43 +185,62 @@ are declared to be a start character for words.
                         int sign_ok = 0;
                         switch(expect_p) {
                         case 0:
-                               if (ch == 'e')
+                               if (ch == 'e' || ch == 'E') {
                                         sign_ok = 1;
+                                       decimal_mark = 1;
+                               }
                                 break;
                         case 1:
-                               if (ch == 'p')
+                               if (ch == 'p' || ch == 'P') {
                                         sign_ok = 1;
+                                       decimal_mark = 1;
+                               }
                                 break;
                         }
                         save_unget_state(state);
+                       prev = ch;
                         ch = get_char(state);
-                       if (iswalnum(ch)) {
-                               prev_special = 0;
+
+                       if (!iswalnum(prev)) {
+                               /* special characters, like separators and decimal marks
+                                * and signs, must be followed by a hexdigit, and the
+                                * space and signs must be followed by a decimal digit.
+                                */
+                               if (!iswxdigit(ch) ||
+                                  ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
+                                  (prev == ' ' && !iswdigit(ch))) {
+                                       /* don't want the new char or the special */
+                                       restore_unget_state(state);
+                                       break;
+                               }
+                       }
+                       if (iswalnum(ch))
                                 continue;
+
+                       if (!strchr(state->conf->number_chars, ch)) {
+                               /* non-number char */
+                               break;
                         }
                         if (ch == '+' || ch == '-') {
+                               /* previous must be 'e' or 'p' in appropriate context */
                                 if (!sign_ok)
                                         break;
                                 expect_p = -1;
+                       } else if (ch == ' ') {
+                               /* previous must be a digit */
+                               if (!iswdigit(prev))
+                                       break;
+                       } else {
+                               /* previous must be a hex digit */
+                               if (!iswxdigit(prev))
+                                       break;
                         }
                         if (ch == '.' || ch == ',') {
+                               /* only one of these permitted */
                                 if (decimal_mark)
                                         break;
                                 decimal_mark = 1;
                         }
-                       if (prev_special) {
-                               /* Don't allow that special char,
-                                * need two 'ungets'
-                                */
-                               restore_unget_state(state);
-                               break;
-                       }
-                       if (strchr(state->conf->number_chars, ch)) {
-                               prev_special = 1;
-                               continue;
-                       }
-                       /* non-number char */
-                       break;
                 }
                 /* We seem to have a "number" token */
                 unget_char(state);
@@ -260,8 +287,11 @@ and the length of the list must be given (`known_count`).
  Tokens matching these known words are reported as the index of the
  list added to `TK_reserved`.
  
+If identifiers are ignored, then any word which is not listed as a
+known word results in an error.
+
  ###### token config parameters
-       char **words_marks;
+       const char **words_marks;
         int known_count;
  
  ###### parse word
@@ -318,7 +348,20 @@ in a known mark, it will return that first known mark.
  
  If no known mark is found we will test against strings and comments
  below before giving up and assuming an unknown mark.
-If `TK_mark` is ignored, then unknown marks as returned as an error.
+
+If an unknown mark contains a quote character or a comment marker, and
+that token is not being ignored, then we terminate the unknown mark
+before that quote or comment.  This ensures that an unknown mark
+immediately before a string is handled correctly.
+
+If the first character of a comment marker (i.e. '/') is a known mark,
+the above rules would suggest that the start of a comment would be
+parsed as that mark, which is not what is wanted.  So when comments are
+not ignored, the introductory sequences for a comment ("//" and "/*")
+are treated as partially-known.  They prevent the leading "/" from being
+a mark by itself, but do not actually constitute a stand-alone mark.
+
+If `TK_mark` is ignored, then unknown marks are returned as errors.
  
  ###### token types
         TK_mark,
@@ -329,31 +372,61 @@ Known marks are included in the same list as the list of known words.
         tk.num = TK_error;
         while (is_mark(ch, state->conf)) {
                 int n;
+               wchar_t prev;
                 close_token(state, &tk);
                 n = find_known(state->conf, tk.txt);
                 if (n >= 0)
                         tk.num = TK_reserved + n;
                 else if (tk.num != TK_error) {
-                       /* found a longest-known-mark */
+                       /* found a longest-known-mark, still need to
+                        * check for comments
+                        */
+                       if (is_comment(ignored, tk.txt)) {
+                               /* Yes, this is a comment, not a '/' */
+                               restore_unget_state(state);
+                               tk.num = TK_error;
+                               break;
+                       }
                         unget_char(state);
                         close_token(state, &tk);
                         return tk;
                 }
+               prev = ch;
+               save_unget_state(state);
                 ch = get_char(state);
+               if (n >= 0)
+                       /* No need to worry about other token types */
+                       continue;
+               if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
+                       /* If strings are allowed, a quote (which isn't a known mark)
+                        * mustn't be treated as part of an unknown mark.  It can be
+                        * part of a multi-line string though.
+                        */
+                       break;
+
+               close_token(state, &tk);
+               if (is_comment(ignored, tk.txt)) {
+                       /* looks like a permitted comment, and not a known mark,
+                        * so assume it is a comment.
+                        */
+                       restore_unget_state(state);
+                       break;
+               }
         }
         unget_char(state);
-       if (tk.num != TK_error)
-               return tk;
-
-###### unknown mark
-       if (tk.txt.len) {
-               if (ignored & (1<<TK_mark))
-                       tk.num = TK_error;
-               else
-                       tk.num = TK_mark;
+       if (tk.num != TK_error) {
+               close_token(state, &tk);
                 return tk;
         }
  
+If we don't find a known mark, we will check for strings and comments
+before assuming that we have an unknown mark.
+
+###### parse mark
+       ## parse string
+       ## parse comment
+       ## unknown mark
+
  ### Strings
  
  Strings start with one of single quote, double quote, or back quote
@@ -361,7 +434,7 @@ and continue until a matching character on the same line.  Any of
  these characters can be included in the list of known marks and then
  they will not be used for identifying strings.
  
-Immediately following the close quote one or two ASCII letters may
+Immediately following the close quote, one or two ASCII letters may
  appear.  These are somewhat like the arbitrary letters allowed in
  "Numbers" above.  They can be used by the language in various ways.
  
@@ -383,7 +456,7 @@ token types.
  ###### internal functions
         static int is_quote(wchar_t ch)
         {
-               return ch == '\'' || ch == '"' || ch == '`';
+               return ch == '\'' || ch == '"' || ch == '`'; // "
         }
  
  #### Multi-line strings
@@ -393,7 +466,7 @@ ignored, we fall through and treat a triple quote as an empty string
  followed by the start of a new string.
  
  ###### parse string
-       if (tk.txt.len == 3 &&
+       if (tk.txt.len >= 3 &&
             !(ignored & (1 << TK_multi_string)) &&
             is_quote(tk.txt.txt[0]) &&
             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
@@ -419,7 +492,8 @@ followed by the start of a new string.
                          * unget so the newline is seen,
                          * but return rest of string as an error.
                          */
-                       unget_char(state);
+                       if (is_newline(ch))
+                               unget_char(state);
                         close_token(state, &tk);
                         tk.num = TK_error;
                         return tk;
@@ -453,15 +527,23 @@ If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
             !(ignored & (1<<TK_string))) {
                 wchar_t first = tk.txt.txt[0];
                 reset_token(state, &tk);
-               get_char(state);
-               do
+               ch = get_char(state);
+               tk.num = TK_error;
+               while (!at_eon(state) && !is_newline(ch)) {
                         ch = get_char(state);
-               while (ch != first && !is_newline(ch));
-               tk.num = TK_string;
-               if (is_newline(ch)) {
-                       unget_char(state);
-                       tk.num = TK_error;
+                       if (ch == first) {
+                               tk.num = TK_string;
+                               break;
+                       }
+                       if (is_newline(ch)) {
+                               unget_char(state);
+                               break;
+                       }
                 }
+               while (!at_eon(state) && (ch = get_char(state)) &&
+                                         iswalpha(ch))
+                       ;
+               unget_char(state);
                 close_token(state, &tk);
                 return tk;
         }
@@ -484,39 +566,51 @@ it would not suffer from this rule.
  
  These two comment types are reported as two separate token types, and
  consequently can be ignored separately.  When ignored a comment is
-parsed and discarded.
+still parsed, but is discarded.
  
  ###### token types
         TK_line_comment,
         TK_block_comment,
  
  ###### internal functions
-       static int is_line_comment(struct text txt)
+       static int is_line_comment(int ignored, struct text txt)
         {
+               if (ignored & (1 << TK_line_comment))
+                       return 0;
                 return (txt.len >= 1 && txt.txt[0] == '#') ||
                        (txt.len >= 2 && txt.txt[0] == '/' &&
                                         txt.txt[1] == '/');
         }
  
-       static int is_block_comment(struct text txt)
+       static int is_block_comment(int ignored, struct text txt)
         {
+               if (ignored & (1 << TK_block_comment))
+                       return 0;
                 return txt.len >= 2 && txt.txt[0] == '/' &&
                        txt.txt[1] == '*';
         }
  
+       static int is_comment(int ignored, struct text txt) /* non-zero if txt begins a comment of a type not masked out by 'ignored' */
+       {
+               return is_line_comment(ignored, txt) || /* '#' or '//' unless TK_line_comment is ignored */
+                      is_block_comment(ignored, txt); /* slash-star opener unless TK_block_comment is ignored */
+       }
+
  #### Single line comments
  
-A single-line comment continues up to, but not including the newline.
+A single-line comment continues up to, but not including the newline
+or end of node.
  
  ###### parse comment
  
-       if (is_line_comment(tk.txt)) {
-               while (!is_newline(ch))
+       if (is_line_comment(ignored, tk.txt)) {
+               while (!is_newline(ch) && !at_eon(state))
                         ch = get_char(state);
-               unget_char(state);
+               if (is_newline(ch))
+                       unget_char(state);
                 close_token(state, &tk);
                 tk.num = TK_line_comment;
-               if (ignored & (1 << TK_line_comment))
+               if (!state->conf->return_comments)
                         continue;
                 return tk;
         }
@@ -533,7 +627,7 @@ the unget state (explained later).
  
  ###### parse comment
  
-       if (is_block_comment(tk.txt)) {
+       if (is_block_comment(ignored, tk.txt)) {
                 wchar_t prev;
                 int newlines = 0;
                 reset_token(state, &tk);
@@ -571,8 +665,7 @@ the unget state (explained later).
                         if (!is_newline(ch))
                                 tk.num = TK_error;
                 }
-               if (tk.num == TK_error ||
-                   !(ignored & (1 << TK_block_comment)))
+               if (tk.num == TK_error || state->conf->return_comments)
                         return tk;
                 continue;
         }
@@ -596,22 +689,22 @@ node (detected by `at_son()`);
  
  If a line starts with more white-space than the previous non-blank
  line - or if the first non-blank line in the document starts with any
-white-space - then an Indent is reported at the start of the line.
+white-space - then an "IN" is reported at the start of the line.
  
  Before the next non-blank line which starts with less white space, or
-at the latest at the end of the document, a matching Undent token
-is reported.  There will always be an exact match between Indent and
-Undent tokens.
+at the latest at the end of the document, a matching "OUT" token
+is reported.  There will always be an exact match between "IN" and
+"OUT" tokens.
  
-It is possible for Undent to be followed (almost) immediately by an
-Indent.  This happens if, for example, the indent of three consecutive
+It is possible for "OUT" to be followed (almost) immediately by an
+"IN".  This happens if, for example, the indent of three consecutive
  lines are 0, 8, 4 spaces.  Before the second line we report an
-Indent.  Before the third line we must report an Undent, as 4 is less
+"IN".  Before the third line we must report an "OUT", as 4 is less
  than 8, then also an Ident as 4 is greater than 0.
  
  ###### token types
-       TK_indent,
-       TK_undent,
+       TK_in,
+       TK_out,
  
  For the purpose of measuring the length of white space, a tab adds at
  least one space, and rounds up to a multiple of 8.
@@ -632,6 +725,8 @@ ignored.
         int     indent_level;
         int     indent_sizes[20];
  
+`indent_sizes[0]` will always be zero - this simplifies some code.
+
  #### Newlines
  
  Newlines can optionally be reported.  Newlines within a block comment
@@ -639,9 +734,9 @@ or a multi-line string are not reported separately, but each of these
  must be followed immediately by a newline so these constructs cannot
  hide the fact that a newline was present.
  
-When Indents are being reported, the Newline which would normally be
-reported immediately before the Indent is delayed until after the
-matching undent.  This makes an indented section act like a
+When indents are being reported, the Newline which would normally be
+reported immediately before the "IN" is delayed until after the
+matching "OUT".  This makes an indented section act like a
  continuation of the previous line to some extent.
  
  A blank line would normally be reported simply as two consecutive Newline
@@ -650,7 +745,7 @@ reported) then the right thing to do is less obvious as Newlines should be
  delayed - but how many Newlines?
  
  The approach we will take is to report the extra Newlines immediately after
-the Indent token, so the blank line is treated as though it were an indented
+the IN token, so the blank line is treated as though it were an indented
  blank line.
  
  ###### token types
@@ -659,114 +754,120 @@ blank line.
  If we find a newline or white space at the start of a block, we keep
  collecting spaces, tabs, and newlines until we find some real text.
  Then depending on the indent we generate some number of tokens.  These
-will be a sequence of "Newline Undent" pairs representing a decrease
-in indent, then either a Newline or an Indent depending on whether the
+will be a sequence of "Newline OUT" pairs representing a decrease
+in indent, then either a Newline or an IN depending on whether the
  next line is indented, then zero or more Newlines representing all the
  blank lines that have been skipped.
  
  When a Newline leads to the next block of code there is a question of
-whether the various Newline and Undent/Indent tokens should appear to
-pbelong to the earlier or later block.  This is addressed by processing
+whether the various Newline and OUT/IN tokens should appear to
+belong to the earlier or later block.  This is addressed by processing
  the tokens in two stages based on the relative indent levels of the
  two blocks (each block has a base indent to which the actual indents
  are added).
  
-Any "Newline Undent" pairs needed to reduce the current indent to the
+Any "Newline OUT" pairs needed to reduce the current indent to the
  maximum of the base indents of the old and new blocks are generated
  against the old block.  Then if the next block does not have an
  increased indent, one more "Newline" is generated.
  
-If further "Newline Undent" pairs are needed to get to the indent
+If further "Newline OUT" pairs are needed to get to the indent
  level of the 'next' block, they are generated against that block,
  though the first Newline is suppressed (it having already been
  generated).
  
-Finally the Newline or Indent for the first line of the new block is
+Finally the Newline or IN for the first line of the new block is
  generated, unless the Newline needs to be suppressed because it
  appeared at the end of the previous block.
  
-This means that a block may start with an Undent or an Indent, but
+This means that a block may start with an OUT or an IN, but
  will only start with a Newline if it actually starts with a blank
  line.
  
  We will need to represent in the `token_state` where in this sequence
  of delayed tokens we are.  As `state.col` records the target indent we
-don't need to record how many undents or indents are needed.  We do
+don't need to record how many OUTs or INs are needed.  We do
  need to record the number of blank lines, and which of Newline and
-Undent is needed next in the initial sequence of pairs.
+OUT is needed next in the initial sequence of pairs.
  
  For this we store one more than the number of blank lines as
-`delayed_lines` and a flag for `undent_next`.
+`delayed_lines` and a flag for `out_next`.
  
  ###### state fields
         int check_indent;
         int delayed_lines;
-       int undent_next;
+       int out_next;
  
-Generating these tokens involve two separate pieces of code.
+Generating these tokens involves two separate pieces of code.
  
  Firstly we need to recognise white space and count the indents and
  newlines.  These are recorded in the above state fields.
  
-Separately we need, on each call to `token_next`, we need to check if
+Separately we need, on each call to `token_next`, to check if
  there are some delayed tokens and if so we need to advance the state
  information and return one token.
  
+###### internal functions
+       static int state_indent(struct token_state *state) /* indent value that the delayed-token code compares against indent_sizes[] */
+       {
+               if (state->node == NULL) /* no current node: only the column is available */
+                       return state->col;
+               return state->node->indent - state->node->needs_strip + state->col; /* node->indent minus node->needs_strip plus the current column */
+       }
+
  ###### white space
+       if (is_newline(ch))
+               state_check_node(state);
         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
                 int newlines = 0;
-               int was_son = at_son(state);
-               if (ignored & (1<<TK_indent)) {
+               int was_nl = is_newline(ch);
+               if (ignored & (1<<TK_in)) {
                         if (!is_newline(ch))
                                 continue;
                         if (ignored & (1<<TK_newline))
                                 continue;
                         tk.num = TK_newline;
+                       close_token(state, &tk);
                         return tk;
                 }
                 // Indents are needed, so check all white space.
-               while (ch <= ' ' && !at_eon(state)) {
+               while (ch <= ' ' && ch != WEOF) {
                         if (is_newline(ch))
                                 newlines += 1;
                         ch = get_char(state);
+                       if (is_newline(ch))
+                               state_check_node(state);
                 }
-               if (at_eon(state)) {
-                       newlines += 1;
-                       if (state->node->next &&
-                           state->node->next->indent > state->node->indent)
-                               state->col = state->node->next->indent;
-                       else
-                               state->col = state->node->indent;
-               } else
+               if (ch != WEOF)
                         unget_char(state);
                 state->delayed_lines = newlines;
-               state->undent_next = was_son;
+               state->out_next = !was_nl;
                 state->check_indent = 1;
                 continue;
         }
  
-
  ###### delayed tokens
  
         if (state->check_indent || state->delayed_lines) {
-               if (state->col < state->indent_sizes[state->indent_level]) {
-                       if (!state->undent_next &&
+               if (state_indent(state) < state->indent_sizes[state->indent_level]) {
+                       if (!state->out_next &&
                             !(ignored & (1<<TK_newline))) {
-                               state->undent_next = 1;
+                               state->out_next = 1;
                                 tk.num = TK_newline;
                                 return tk;
                         }
                         state->indent_level -= 1;
-                       state->undent_next = 0;
-                       tk.num = TK_undent;
+                       state->out_next = 0;
+                       tk.num = TK_out;
                         return tk;
                 }
-               if (state->col > state->indent_sizes[state->indent_level] &&
+               if (state_indent(state) > state->indent_sizes[state->indent_level] &&
                     state->indent_level < sizeof(state->indent_sizes)-1) {
                         state->indent_level += 1;
-                       state->indent_sizes[state->indent_level] = state->col;
-                       state->delayed_lines -= 1;
-                       tk.num = TK_indent;
+                       state->indent_sizes[state->indent_level] = state_indent(state);
+                       if (state->delayed_lines)
+                               state->delayed_lines -= 1;
+                       tk.num = TK_in;
                         return tk;
                 }
                 state->check_indent = 0;
@@ -788,7 +889,6 @@ tokens will continue to return the same end-of-file token.
  ###### token types
         TK_eof,
  
-
  ###### white space
         if (ch == WEOF) {
                 tk.num = TK_eof;
@@ -802,7 +902,21 @@ If the token we have is not empty and `TK_mark` is allowed,
  we have an unknown mark, otherwise this must be an error.
  
  ###### unknown mark
-       /* one unknown character */
+
+       /* one unknown mark character */
+       if (tk.txt.len) {
+               close_token(state, &tk);
+               if (ignored & (1<<TK_mark))
+                       tk.num = TK_error;
+               else
+                       tk.num = TK_mark;
+               return tk;
+       }
+       /* Completely unrecognised character is next, possibly
+        * a digit and we are ignoring numbers.
+        * Whatever it is, make it an error.
+        */
+       get_char(state);
         close_token(state, &tk);
         tk.num = TK_error;
         return tk;
@@ -832,43 +946,59 @@ a flag that tells us whether or not we need to strip.
         int    offset;
         int    line;
         int    col;
+       int    strip_offset;
  
  ###### internal functions
  
         static void do_strip(struct token_state *state)
         {
+               int indent = 0;
                 if (state->node->needs_strip) {
                         int n = 4;
                         while (n && state->node->code.txt[state->offset] == ' ') {
+                               indent += 1;
                                 state->offset += 1;
                                 n -= 1;
                         }
-                       while (n == 4 && state->node->code.txt[0] == '\t') {
+                       while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+                               indent = indent_tab(indent);
                                 state->offset += 1;
                                 n -= 4;
                         }
                 }
         }
  
+       static void state_check_node(struct token_state *state) /* if the current node is exhausted, move to the next node that has code text and reset position fields */
+       {
+               if (!state->node) /* no current node: nothing to advance */
+                       return;
+               if (state->node->code.len > state->offset) /* offset is still inside this node's text */
+                       return;
+
+               do /* step forward, skipping nodes whose code.txt is NULL */
+                       state->node = state->node->next;
+               while (state->node && state->node->code.txt == NULL);
+               state->offset = 0; /* position fields restart for the new node */
+               state->prev_offset = 0;
+               state->strip_offset = 0;
+               state->col = 0;
+               if (state->node == NULL) /* ran off the end of the node list; fields stay zeroed */
+                       return;
+               state->line = state->node->line_no;
+               do_strip(state); /* may advance state->offset past leading indent when needs_strip is set */
+               state->col = state->node->needs_strip; /* column restarts at the node's needs_strip value */
+               state->strip_offset = state->offset; /* remember the offset reached after stripping */
+       }
+
         static wint_t get_char(struct token_state *state)
         {
                 wchar_t next;
                 size_t n;
                 mbstate_t mbstate;
  
+               state_check_node(state);
                 if (state->node == NULL)
                         return WEOF;
-               if (state->node->code.len <= state->offset) {
-                       do
-                               state->node = state->node->next;
-                       while (state->node && state->node->code.txt == NULL);
-                       state->offset = 0;
-                       if (state->node == NULL)
-                               return WEOF;
-                       do_strip(state);
-                       state->line = state->node->line_no;
-                       state->col = state->node->indent;
-               }
  
                 ## before get_char
  
@@ -879,12 +1009,12 @@ a flag that tells us whether or not we need to strip.
                             &mbstate);
                 if (n == -2 || n == 0) {
                         /* Not enough bytes - not really possible */
-                       next = '\n';
-                       state->offset = state->node->code.len;
+                       next = '\n';                            // NOTEST
+                       state->offset = state->node->code.len;  // NOTEST
                 } else if (n == -1) {
                         /* error */
-                       state->offset += 1;
-                       next = 0x7f; // an illegal character
+                       state->offset += 1;                     // NOTEST
+                       next = 0x7f; // an illegal character    // NOTEST
                 } else
                         state->offset += n;
  
@@ -892,8 +1022,8 @@ a flag that tells us whether or not we need to strip.
                         state->col += 1;
                 } else if (is_newline(next)) {
                         state->line += 1;
-                       state->col = state->node->indent;
                         do_strip(state);
+                       state->col = state->node->needs_strip;
                 } else if (next == '\t') {
                         state->col = indent_tab(state->col);
                 }
@@ -996,8 +1126,11 @@ parsed too much already.  For that there is `reset_token`.
         static void close_token(struct token_state *state,
                                 struct token *tk)
         {
-               tk->txt.len = (state->node->code.txt + state->offset)
-                             - tk->txt.txt;
+               if (state->node != tk->node)	// scanner is no longer on the node the token started in
+                       tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);	// token runs to the end of its own node
+               else	// same node: token ends at the current scan offset
+                       tk->txt.len = (state->node->code.txt + state->offset)
+                                     - tk->txt.txt;
         }
  
         static void reset_token(struct token_state *state, struct token *tok)
@@ -1009,8 +1142,7 @@ parsed too much already.  For that there is `reset_token`.
                 tok->txt.len = 0;
         }
  
-
-Tokens make not cross into the next `code_node`, and some tokens can
+Tokens may not cross into the next `code_node`, and some tokens can
  include the newline at the end of a `code_node`, we must be able to
  easily check if we have reached the end.  Equally we need to know if
  we are at the start of a node, as white space is treated a little
@@ -1020,7 +1152,7 @@ differently there.
  
         static int at_son(struct token_state *state)
         {
-               return state->offset == 0;
+               return state->prev_offset <= state->strip_offset;	// non-zero while prev_offset has not passed the offset recorded after do_strip(); NOTE(review): prev_offset is updated outside this view - confirm
         }
  
         static int at_eon(struct token_state *state)
@@ -1067,7 +1199,7 @@ searching for.
  Now we have all the bits there is just one section missing:  combining
  all the token parsing code into one block.
  
-The handling of delayed tokens (newlines, indents, undents) must come
+The handling of delayed tokens (Newlines, INs, OUTs) must come
  first before we try getting another character.
  
  Then we parse all the text, making sure that we check for known marks
@@ -1086,9 +1218,6 @@ loop.
         ## parse number
         ## parse word
         ## parse mark
-       ## parse string
-       ## parse comment
-       ## unknown mark
  
  ### Start and stop
  
@@ -1106,6 +1235,9 @@ As well as getting tokens, we need to be able to create the
                 memset(state, 0, sizeof(*state));
                 state->node = code;
                 state->line = code->line_no;
+               do_strip(state);
+               state->col = state->node->needs_strip;
+               state->strip_offset = state->offset;
                 state->conf = conf;
                 return state;
         }
@@ -1169,8 +1301,8 @@ so that it can be used to tracing processed strings too.
                         [TK_multi_string] = "mstring",
                         [TK_line_comment] = "lcomment",
                         [TK_block_comment] = "bcomment",
-                       [TK_indent] = "indent",
-                       [TK_undent] = "undent",
+                       [TK_in] = "in",
+                       [TK_out] = "out",
                         [TK_newline] = "newline",
                         [TK_eof] = "eof",
                         [TK_error] = "ERROR",
@@ -1180,8 +1312,8 @@ so that it can be used to tracing processed strings too.
                 default: /* known word or mark */
                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
                         break;
-               case TK_indent:
-               case TK_undent:
+               case TK_in:
+               case TK_out:
                 case TK_newline:
                 case TK_eof:
                         /* No token text included */
@@ -1232,7 +1364,7 @@ tokens.  Now we just need C files to store them, and a mk file to make them.
  
  Converting a `TK_number` token to a numerical value is a slightly
  higher level task than lexical analysis, and slightly lower than
-grammar parsing, so put it here - as an index if you like.
+grammar parsing, so put it here - as an appendix if you like.
  
  Importantly it will be used by the same testing rig that is used for
  testing the token scanner.
@@ -1257,10 +1389,10 @@ had never been initialised.
                                 int *placesp)
         {
                 /* Accept digits up to 'base', ignore '_' and
-                * ' ' if they appear between two legal digits,
-                * and if `placesp` is not NULL, allow a single
-                * '.' or ',' and report the number of digits
-                * beyond there.
+                * (for base 10) ' ' if they appear between two
+                * legal digits, and if `placesp` is not NULL,
+                * allow a single '.' or ',' and report the number
+                * of digits beyond there.
                  * Return number of characters processed (p),
                  * or 0 if something illegal was found.
                  */
@@ -1273,7 +1405,7 @@ had never been initialised.
                         int dig;
                         char c = tok.txt[p];
  
-                       if (c == '_' || c == ' ') {
+                       if (c == '_' || (c == ' ' && base == 10)) {
                                 if (prev != Digit)
                                         goto bad;
                                 prev = Space;
@@ -1393,7 +1525,7 @@ we need to record the number of places.  We won't impose the number of
  places until we have the exponent as well.
  
  ###### number vars
-       int places =0;
+       int places = 0;
         mpz_t mant;
         int d;
  
@@ -1443,7 +1575,6 @@ character `expc`.
         tok.txt += d;
         tok.len -= d;
  
-
  Now that we have the mantissa and the exponent we can multiply them
  together, also allowing for the number of digits after the decimal
  mark.
@@ -1485,7 +1616,7 @@ Multiplication.
                 mpq_set_ui(tens, 10, 1);
                 while (1) {
                         if (lexp & 1) {
-                               if (esign > 1)
+                               if (esign > 0)
                                         mpq_mul(num, num, tens);
                                 else
                                         mpq_div(num, num, tens);
@@ -1507,7 +1638,6 @@ Now we are ready to parse a number: the base, mantissa, and exponent.
  If all goes well we check for the possible trailing letters and
  return.  Return value is 1 for success and 0 for failure.
  
-
  ###### number functions
         int number_parse(mpq_t num, char tail[3], struct text tok)
         {
@@ -1549,7 +1679,7 @@ Number parsing goes in `libnumber.c`
         ## number includes
         ## number functions
  
-###### File: number.h
+###### File: parse_number.h
         int number_parse(mpq_t num, char tail[3], struct text tok);
  
  ###### File: scanner.mk
@@ -1716,7 +1846,7 @@ required indent is found.
                 if (c == ' ')
                         skipped += 1;
                 else if (c == '\t')
-                       skipped = indent_tab(c);
+                       skipped = indent_tab(skipped);
                 else
                         break;
                 i+= 1;
@@ -1833,7 +1963,7 @@ String parsing goes in `libstring.c`
         ## string functions
         ## string main
  
-###### File: string.h
+###### File: parse_string.h
         int string_parse(struct token *tok, char escape,
                          struct text *str, char tail[3]);
  
@@ -1842,7 +1972,6 @@ String parsing goes in `libstring.c`
         libstring.o : libstring.c
                 $(CC) $(CFLAGS) -c libstring.c
  
-
  ## Testing
  
  As "untested code is buggy code" we need a program to easily test
@@ -1860,10 +1989,11 @@ the tokens one per line.
         #include <stdio.h>
         #include <gmp.h>
         #include <locale.h>
+       #include <getopt.h>
         #include "mdcode.h"
         #include "scanner.h"
-       #include "number.h"
-       #include "string.h"
+       #include "parse_number.h"
+       #include "parse_string.h"
  
         static int errs;
         static void pr_err(char *msg)
@@ -1872,13 +2002,21 @@ the tokens one per line.
                 fprintf(stderr, "%s\n", msg);
         }
  
+       static int kcmp(const void *ap, const void *bp)	// qsort() comparator for an array of C-string pointers
+       {
+               char * const *a = ap;	// each element handed over by qsort() is itself a char pointer
+               char * const *b = bp;
+               return strcmp(*a, *b);	// order by byte-wise string comparison
+       }
+
         int main(int argc, char *argv[])
         {
                 int fd;
                 int len;
                 char *file;
+               char *filename = NULL;
                 struct token_state *state;
-               char *known[] = {
+               const char *known[] = {
                         "==",
                         "else",
                         "if",
@@ -1893,27 +2031,94 @@ the tokens one per line.
                         .words_marks = known,
                         .number_chars = "., _+-",
                         .known_count = sizeof(known)/sizeof(known[0]),
-                       .ignored = (0 << TK_line_comment)
-                                 |(0 << TK_block_comment),
+                       .ignored = 0,
+               };
+               static const struct option long_options[] = {
+                       { "word-start",         1, NULL, 'W'},
+                       { "word-cont",          1, NULL, 'w'},
+                       { "number-chars",       1, NULL, 'n'},
+                       { "ignore-numbers",     0, NULL, 'N'},
+                       { "ignore-ident",       0, NULL, 'I'},
+                       { "ignore-marks",       0, NULL, 'M'},
+                       { "ignore-strings",     0, NULL, 'S'},
+                       { "ignore-multi-strings",0, NULL, 'z'},
+                       { "ignore-line-comment",0, NULL, 'c'},
+                       { "ignore-newline",     0, NULL, 'l'},
+                       { "ignore-block-comment", 0, NULL, 'C'},
+                       { "ignore-indent",      0, NULL, 'i'},
+                       { "return-comments",    0, NULL, 'r'},
+                       { "file",               1, NULL, 'f'},
+                       { "section",            1, NULL, 's'},
+                       { NULL,                 0, NULL, 0},
                 };
+               static const char options[] = "W:w:n:NIMSzclCirf:s:";
+
                 struct section *table, *s, *prev;
+               int opt;
+               char *section_name = NULL;
+               int section_found = 0;
+
                 setlocale(LC_ALL,"");
-               if (argc != 2) {
-                       fprintf(stderr, "Usage: scanner file\n");
-                       exit(2);
+               while ((opt = getopt_long(argc, argv, options, long_options, NULL))
+                      != -1) {
+                       switch(opt) {
+                       case 'W': conf.word_start = optarg; break;
+                       case 'w': conf.word_cont = optarg; break;
+                       case 'n': conf.number_chars = optarg; break;
+                       case 'N': conf.ignored |= 1 << TK_number; break;
+                       case 'I': conf.ignored |= 1 << TK_ident; break;
+                       case 'M': conf.ignored |= 1 << TK_mark; break;
+                       case 'S': conf.ignored |= 1 << TK_string; break;
+                       case 'z': conf.ignored |= 1 << TK_multi_string; break;
+                       case 'c': conf.ignored |= 1 << TK_line_comment; break;
+                       case 'C': conf.ignored |= 1 << TK_block_comment; break;
+                       case 'l': conf.ignored |= 1 << TK_newline; break;
+                       case 'i': conf.ignored |= 1 << TK_in; break;
+                       case 'r': conf.return_comments = 1; break;
+                       case 'f': filename = optarg; break;
+                       case 's': section_name = optarg; break;
+                       default: fprintf(stderr, "scanner: unknown option '%c'.\n",
+                                        opt);
+                               exit(1);
+                       }
+               }
+
+               if (optind < argc) {
+                       const char **wm = calloc(argc - optind, sizeof(char*));
+                       int i;
+                       for (i = optind; i < argc; i++)
+                               wm[i - optind] = argv[i];
+                       qsort(wm, argc-optind, sizeof(char*), kcmp);
+                       conf.words_marks = wm;
+                       conf.known_count = argc - optind;
                 }
-               fd = open(argv[1], O_RDONLY);
+
+               if (filename)
+                       fd = open(filename, O_RDONLY);
+               else
+                       fd = 0;
                 if (fd < 0) {
                         fprintf(stderr, "scanner: cannot open %s: %s\n",
-                               argv[1], strerror(errno));
+                               filename, strerror(errno));
                         exit(1);
                 }
                 len = lseek(fd, 0, 2);
+               if (len <= 0) {
+                       fprintf(stderr,"scanner: %s is empty or not seekable\n",
+                               filename ?: "stdin");
+                       exit(1);
+               }
                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
                 table = code_extract(file, file+len, pr_err);
  
                 for (s = table; s;
                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
+                       if (section_name &&
+                           (s->section.len != strlen(section_name) ||
+                            strncmp(s->section.txt, section_name, s->section.len) != 0))
+                               continue;
+                       if (section_name)
+                               section_found = 1;
                         printf("Tokenizing: %.*s\n", s->section.len,
                                 s->section.txt);
                         state = token_open(s->code, &conf);
@@ -1952,6 +2157,13 @@ the tokens one per line.
                                 if (tk.num == TK_eof)
                                         break;
                         }
+                       token_close(state);
+               }
+               if (conf.words_marks != known)
+                       free(conf.words_marks);
+               if (section_name && !section_found) {
+                       fprintf(stderr, "scanner: section %s not found\n", section_name);
+                       errs = 1;
                 }
                 exit(!!errs);
         }
@@ -1964,4 +2176,3 @@ the tokens one per line.
                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
         scanner.o : scanner.c
                 $(CC) $(CFLAGS) -c scanner.c
-