--- /dev/null
+# Lexical Scanner #
+
+## The Task at Hand ##
+
+The main task of the lexical scanner is to convert a stream of
+characters into a stream of tokens. The tokens are then typically
+used by a parser to extract the syntactic structure.
+
+The stream of characters is assumed to be in memory identified by a
+linked list of blocks, such as provided by the "[mdcode][]" literate
+program extractor. A single token may never cross a block boundary.
+
+[mdcode]: mdcode.html
+
+###### includes
+ #include "mdcode.h"
+
+The text is assumed to be UTF-8 though some matching assumes the
+ASCII subset. If the text provided does not conform to UTF-8 an error
+will be reported and some number of bytes will be skipped.
+
+###### includes
+ #include <wchar.h>
+ #include <wctype.h>
+ #include <unicode/uchar.h>
+
+Tokens are returned by successive calls to the main interface
+function: `token_next()` which has a `state` structure to keep track
+of where it is up to. Each token carries not just a numeric
+identifier but also the code block, the line and character within that
+block, and the actual start and length using the `struct text` from
+"mdcode".
+
+###### public types
+ struct token {
+ int num;
+ struct code_node *node;
+ struct text txt;
+ int line, col;
+ };
+ struct token_state;
+
+###### private types
+ struct token_state {
+ ## state fields
+ };
+
+###### exported functions
+ struct token token_next(struct token_state *state);
+
+###### main functions
+ struct token token_next(struct token_state *state)
+ {
+ ## token_next init
+ while (1) {
+ wint_t ch;
+ struct token tk;
+
+ ## one token
+ }
+ }
+
+The `line` and `col` offsets are useful for reporting errors.
+The `txt` provides the content when that is important.
+
+### Token types and configuration ###
+
+The scanner is not completely general, yet not completely specified.
+There are a fixed set of token types, though particular tokens within
+those types can be distinguished via configuration.
+
+Most token types may be explicitly ignored, as typically comments
+would be. The exact consequence of ignoring each token type varies
+from token to token.
+
+###### public types
+ struct token_config {
+ int ignored; // bit set of ignored tokens.
+ ## token config parameters
+ };
+
+###### state fields
+ struct token_config *conf;
+
+###### token_next init
+ int ignored = state->conf->ignored;
+
+
+The different tokens are numbers, words, marks, strings, comments,
+newlines, EOF, and indents, each of which is examined in detail below.
+
+There are various cases where no token can be found in part of the
+input. All of these will be reported as a `TK_error` token.
+
+It is possible to declare a number of strings which form distinct
+tokens (rather than being grouped as e.g. 'word'). These are given
+token numbers from `TK_reserved` upwards.
+
+###### public types
+ enum token_num {
+ TK_error,
+ ## token types
+ TK_reserved
+ };
+
+### Numbers
+
+Numbers are the messiest tokens to parse, primarily because they can
+contain characters that also have meaning outside of number and,
+particularly, immediately after numbers.
+
+The obvious example is the '`-`' sign. It can come inside a number for
+a negative exponent, or after a number as a subtraction operator. To
+be sure we have parsed as best as possible we need to only allow the
+'`-`' inside a number if it is after an exponent character. This can be
+`e` or `p` (for hex exponents), but `e` can also be a hexadecimal
+digit, so we don't allow '`-`' after just any `e`.
+
+To make matters worse, our language designer has decided to experiment
+with allowing commas to be used as the decimal indicator, and spaces
+to be used to separate groups of digits in large numbers. Both of
+these can reasonably be restricted to appear between two digits, so we
+have to add that condition to our tests.
+
+So we cannot just treat numbers as starting with a digit and being
+followed by some set of characters. We need more structure than that.
+
+So:
+
+- Numbers must start with a digit.
+- If the first digit is zero, the next character must be a base
+ signifier (one of `xob`) or a decimal marker (`.` or `,`).
+ In the first case the first `p` or `P` may be followed by a sign.
+- If the number doesn't start with `0` followed by one of `xob`, the
+ first `e` may be followed by a sign.
+- Any digit or hex digit may be followed by a space or underscore
+  providing that the subsequent character is also a (hex) digit.
+ This rule will require an extra level of 'unget' to be
+ supported when handling characters.
+- Otherwise any digits or ASCII letters are allowed. We do not at
+ this point check that the digits given are permitted by the base.
+ That will happen when the token is converted to a number.
+
+To allow easy configuration, the various non alphanumeric characters
+are only permitted if they are listed in a configuration parameter.
+
+###### token config parameters
+ char *number_chars;
+
+Note that numbers may not start with a period, so `.75` is not a
+number. This is not the norm, but is not unheard of. Excluding these
+numbers simplifies the rule at very little cost.
+
+###### token types
+ TK_number,
+
+If TK_number is ignored, digits will result in an error unless they
+are declared to be a start character for words.
+
+###### includes
+
+ #include <string.h>
+
+###### parse number
+
+ if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
+ int prev_special = 0;
+ int expect_p = 0;
+ int decimal_mark = 0;
+ if (ch == '0') {
+ wchar_t ch2 = get_char(state);
+ if (strchr("xobXOB", ch2) != NULL)
+ expect_p = 1;
+ unget_char(state);
+ }
+ while (1) {
+ int sign_ok = 0;
+ switch(expect_p) {
+ case 0:
+ if (ch == 'e')
+ sign_ok = 1;
+ break;
+ case 1:
+ if (ch == 'p')
+ sign_ok = 1;
+ break;
+ }
+ save_unget_state(state);
+ ch = get_char(state);
+ if (iswalnum(ch)) {
+ prev_special = 0;
+ continue;
+ }
+ if (ch == '+' || ch == '-') {
+ if (!sign_ok)
+ break;
+ expect_p = -1;
+ }
+ if (ch == '.' || ch == ',') {
+ if (decimal_mark)
+ break;
+ decimal_mark = 1;
+ }
+ if (prev_special) {
+ /* Don't allow that special char,
+ * need two 'ungets'
+ */
+ restore_unget_state(state);
+ break;
+ }
+ if (strchr(state->conf->number_chars, ch)) {
+ prev_special = 1;
+ continue;
+ }
+ /* non-number char */
+ break;
+ }
+ /* We seem to have a "number" token */
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_number;
+ return tk;
+ }
+
+### Words
+Words start with a "start" character followed by the longest
+sequence of "continue" characters. The Unicode ID_START and
+ID_CONTINUE sets are always permitted, but other ASCII characters
+can be added to these sets.
+
+###### token config parameters
+ char *word_start;
+ char *word_cont;
+
+###### internal functions
+ static int is_word_start(wchar_t ch, struct token_config *conf)
+ {
+ return iswalpha(ch) ||
+ strchr(conf->word_start, ch) != NULL ||
+ u_hasBinaryProperty(ch, UCHAR_ID_START);
+ }
+
+ static int is_word_continue(wchar_t ch, struct token_config *conf)
+ {
+ return iswalnum(ch) ||
+ strchr(conf->word_cont, ch) != NULL ||
+ u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
+ }
+
+Words can be either known or unknown. Known words are referred to as
+"reserved words" and get a unique token number. Unknown words are
+"identifiers" and are syntactically a single token.
+
+###### token types
+ TK_ident,
+
+A list of known words must be provided. This list is shared with the
+"marks" which are described next. The list must be lexically sorted
+and the length of the list must be given (`known_count`).
+Tokens matching these known words are reported as the index of the
+list added to `TK_reserved`.
+
+###### token config parameters
+ char **words_marks;
+ int known_count;
+
+###### parse word
+
+ if (is_word_start(ch, state->conf)) {
+ int n;
+ /* A word: identifier or reserved */
+ do
+ ch = get_char(state);
+ while (is_word_continue(ch, state->conf));
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_ident;
+ if (ignored & (1<<TK_ident))
+ tk.num = TK_error;
+ n = find_known(state->conf, tk.txt);
+ if (n >= 0)
+ tk.num = TK_reserved + n;
+ return tk;
+ }
+
+### Marks
+
+Marks are generally one or more punctuation marks joined together. It
+would be nice to use the term "symbol" for these, but that causes
+confusion in a subsequent discussion of the grammar, which has terminal
+symbols and non-terminal symbols which are conceptually quite
+different. So strings of punctuation characters will be marks.
+
+A "mark" consists of ASCII characters that are not white space, are not
+"start" characters for words, and are not digits.
+These will collectively be called mark characters.
+
+###### internal functions
+ static int is_mark(wchar_t ch, struct token_config *conf)
+ {
+ return ch > ' ' &&
+ ch < 0x7f &&
+ !iswalnum(ch) &&
+ strchr(conf->word_start, ch) == NULL;
+ }
+
+As with words, there can be known and unknown marks, though the rules
+are slightly different.
+
+Two marks do not need to be separated by a non-mark character. This
+is different from words which do need to be separated by at least one
+non-continue character.
+
+The scanner will normally prefer longer sequences of mark characters,
+but will more strongly prefer known marks over unknown marks. So if
+it finds a known mark where adding one more character does not result
+in a known mark, it will return that first known mark.
+
+If no known mark is found we will test against strings and comments
+below before giving up and assuming an unknown mark.
+If `TK_mark` is ignored, then unknown marks are returned as an error.
+
+###### token types
+ TK_mark,
+
+Known marks are included in the same list as the list of known words.
+
+###### parse mark
+ tk.num = TK_error;
+ while (is_mark(ch, state->conf)) {
+ int n;
+ close_token(state, &tk);
+ n = find_known(state->conf, tk.txt);
+ if (n >= 0)
+ tk.num = TK_reserved + n;
+ else if (tk.num != TK_error) {
+ /* found a longest-known-mark */
+ unget_char(state);
+ close_token(state, &tk);
+ return tk;
+ }
+ ch = get_char(state);
+ }
+ unget_char(state);
+ if (tk.num != TK_error)
+ return tk;
+
+###### unknown mark
+ if (tk.txt.len) {
+ if (ignored & (1<<TK_mark))
+ tk.num = TK_error;
+ else
+ tk.num = TK_mark;
+ return tk;
+ }
+
+### Strings
+
+Strings start with one of single quote, double quote, or back quote
+and continue until a matching character on the same line. Any of
+these characters can be included in the list of known marks and then
+they will not be used for identifying strings.
+
+Immediately following the close quote one or two ASCII letters may
+appear. These are somewhat like the arbitrary letters allowed in
+"Numbers" above. They can be used by the language in various ways.
+
+If 3 identical quote characters appear in a row and are
+followed by a newline, then this forms a multi-line string which
+continues until an identical triple quote appears on a line preceded
+only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
+
+Multi-line strings may not extend beyond the end of the `code_node` in
+which they start.
+
+Normal strings and multi-line strings are encoded as two different
+token types.
+
+###### token types
+ TK_string,
+ TK_multi_string,
+
+###### internal functions
+ static int is_quote(wchar_t ch)
+ {
+ return ch == '\'' || ch == '"' || ch == '`';
+ }
+
+#### Multi-line strings
+
+The multi-line string is checked for first. If they are being
+ignored, we fall through and treat a triple quote as an empty string
+followed by the start of a new string.
+
+###### parse string
+ if (tk.txt.len == 3 &&
+ !(ignored & (1 << TK_multi_string)) &&
+ is_quote(tk.txt.txt[0]) &&
+ memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
+ is_newline(tk.txt.txt[3])) {
+ // triple quote
+ wchar_t first = tk.txt.txt[0];
+ int qseen = 0;
+ int at_sol = 1;
+ while (!at_eon(state) && qseen < 3) {
+ ch = get_char(state);
+ if (is_newline(ch)) {
+ at_sol = 1;
+ qseen = 0;
+ } else if (at_sol && ch == first) {
+ qseen += 1;
+ } else if (ch != ' ' && ch != '\t') {
+ at_sol = 0;
+ qseen = 0;
+ }
+ }
+ if (qseen != 3) {
+ /* Hit end of node - error.
+ * unget so the newline is seen,
+ * but return rest of string as an error.
+ */
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_error;
+ return tk;
+ }
+ /* 2 letters are allowed */
+ ch = get_char(state);
+ if (iswalpha(ch))
+ ch = get_char(state);
+ if (iswalpha(ch))
+ ch = get_char(state);
+ /* Now we must have a newline, but we don't return it
+ * whatever it is.*/
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_multi_string;
+ if (!is_newline(ch))
+ tk.num = TK_error;
+ return tk;
+ }
+
+#### Single-line strings
+
+The sequence of marks collected may be more than a single-line
+string, so we reset to the start and collect characters until
+we find a close quote or a newline.
+
+If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
+
+###### parse string
+ if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
+ !(ignored & (1<<TK_string))) {
+ wchar_t first = tk.txt.txt[0];
+ reset_token(state, &tk);
+ get_char(state);
+ do
+ ch = get_char(state);
+ while (ch != first && !is_newline(ch));
+ tk.num = TK_string;
+ if (is_newline(ch)) {
+ unget_char(state);
+ tk.num = TK_error;
+ }
+ close_token(state, &tk);
+ return tk;
+ }
+
+### Comments
+
+Single line comments may start with '`//`' or '`#`' providing that these
+are not known marks. They continue to the end of the line.
+
+Block comments start with '`/*`' if this is not a known mark. They
+continue to the first occurrence of '`*/`' and may not contain any
+occurrence of '`/*`'.
+
+Block comments can be wholly within one line or can continue over
+multiple lines. The multi-line version should be followed immediately
+by a newline. The Linux kernel contains over 285000 multi-line
+comments and only 34 are followed by characters other than white space
+(which should be removed) or a backslash (only needed in macros). So
+it would not suffer from this rule.
+
+These two comment types are reported as two separate token types, and
+consequently can be ignored separately. When ignored a comment is
+parsed and discarded.
+
+###### token types
+ TK_line_comment,
+ TK_block_comment,
+
+###### internal functions
+ static int is_line_comment(struct text txt)
+ {
+ return (txt.len >= 1 && txt.txt[0] == '#') ||
+ (txt.len >= 2 && txt.txt[0] == '/' &&
+ txt.txt[1] == '/');
+ }
+
+ static int is_block_comment(struct text txt)
+ {
+ return txt.len >= 2 && txt.txt[0] == '/' &&
+ txt.txt[1] == '*';
+ }
+
+#### Single line comments
+
+A single-line comment continues up to, but not including the newline.
+
+###### parse comment
+
+ if (is_line_comment(tk.txt)) {
+ while (!is_newline(ch))
+ ch = get_char(state);
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_line_comment;
+ if (ignored & (1 << TK_line_comment))
+ continue;
+ return tk;
+ }
+
+#### Block comments
+
+The token text collected so far could exceed the comment, so we need
+to reset it first.
+
+If we find an embedded `/*` we reset to just before the '/' and report
+an error. That way the next thing to be parsed will be the rest of
+the comment. This requires a double unget, so we need to save/restore
+the unget state (explained later).
+
+###### parse comment
+
+ if (is_block_comment(tk.txt)) {
+ wchar_t prev;
+ int newlines = 0;
+ reset_token(state, &tk);
+ get_char(state);
+ get_char(state);
+ save_unget_state(state);
+ ch = get_char(state);
+ prev = 0;
+ while (!at_eon(state) &&
+ (prev != '/' || ch != '*') &&
+ (prev != '*' || ch != '/')) {
+ if (is_newline(ch))
+ newlines = 1;
+ prev = ch;
+ save_unget_state(state);
+ ch = get_char(state);
+ }
+ close_token(state, &tk);
+ if (at_eon(state)) {
+ tk.num = TK_error;
+ return tk;
+ }
+ if (prev == '/') {
+ /* embedded. Need to unget twice! */
+ restore_unget_state(state);
+ unget_char(state);
+ tk.num = TK_error;
+ return tk;
+ }
+ tk.num = TK_block_comment;
+ if (newlines && !(ignored & (1<<TK_newline))) {
+ /* next char must be newline */
+ ch = get_char(state);
+ unget_char(state);
+ if (!is_newline(ch))
+ tk.num = TK_error;
+ }
+ if (tk.num == TK_error ||
+ !(ignored & (1 << TK_block_comment)))
+ return tk;
+ continue;
+ }
+
+### Indents, Newlines, and White Space.
+
+Normally white space is ignored. However newlines can be important as
+can indents, which are either after a newline or at the start of a
+node (detected by `at_son()`);
+
+###### exported functions
+ static inline int is_newline(wchar_t ch)
+ {
+ return ch == '\n' || ch == '\f' || ch == '\v';
+ }
+
+###### white space
+ if (ch <= ' ' && !is_newline(ch)
+ && ! at_son(state))
+ continue;
+
+If a line starts with more white-space than the previous non-blank
+line - or if the first non-blank line in the document starts with any
+white-space - then an Indent is reported at the start of the line.
+
+Before the next non-blank line which starts with less white space, or
+at the latest at the end of the document, a matching Undent token
+is reported. There will always be an exact match between Indent and
+Undent tokens.
+
+It is possible for Undent to be followed (almost) immediately by an
+Indent. This happens if, for example, the indent of three consecutive
+lines are 0, 8, 4 spaces. Before the second line we report an
+Indent. Before the third line we must report an Undent, as 4 is less
+than 8, then also an Indent as 4 is greater than 0.
+
+###### token types
+ TK_indent,
+ TK_undent,
+
+For the purpose of measuring the length of white space, a tab adds at
+least one space, and rounds up to a multiple of 8.
+
+###### exported functions
+ static inline int indent_tab(int indent)
+ {
+ return (indent|7)+1;
+ }
+
+We need to track the current levels of indent. This requires some
+sort of stack as indent levels are pushed on and popped off. In
+practice this stack is unlikely to often exceed 5 so we will use a
+fixed stack of 20 indent levels. More than this will be silently
+ignored.
+
+###### state fields
+ int indent_level;
+ int indent_sizes[20];
+
+#### Newlines
+
+Newlines can optionally be reported. Newlines within a block comment
+or a multi-line string are not reported separately, but each of these
+must be followed immediately by a newline so these constructs cannot
+hide the fact that a newline was present.
+
+When Indents are being reported, the Newline which would normally be
+reported immediately before the Indent is delayed until after the
+matching undent. This makes an indented section act like a
+continuation of the previous line to some extent.
+
+A blank line would normally be reported simply as two consecutive Newline
+tokens. However if the subsequent line is indented (and indents are being
+reported) then the right thing to do is less obvious as Newlines should be
+delayed - but how many Newlines?
+
+The approach we will take is to report the extra Newlines immediately after
+the Indent token, so the blank line is treated as though it were an indented
+blank line.
+
+###### token types
+ TK_newline,
+
+If we find a newline or white space at the start of a block, we keep
+collecting spaces, tabs, and newlines until we find some real text.
+Then depending on the indent we generate some number of tokens. These
+will be a sequence of "Newline Undent" pairs representing a decrease
+in indent, then either a Newline or an Indent depending on whether the
+next line is indented, then zero or more Newlines representing all the
+blank lines that have been skipped.
+
+When a Newline leads to the next block of code there is a question of
+whether the various Newline and Undent/Indent tokens should appear to
+belong to the earlier or later block. This is addressed by processing
+the tokens in two stages based on the relative indent levels of the
+two blocks (each block has a base indent to which the actual indents
+are added).
+
+Any "Newline Undent" pairs needed to reduce the current indent to the
+maximum of the base indents of the old and new blocks are generated
+against the old block. Then if the next block does not have an
+increased indent, one more "Newline" is generated.
+
+If further "Newline Undent" pairs are needed to get to the indent
+level of the 'next' block, they are generated against that block,
+though the first Newline is suppressed (it having already been
+generated).
+
+Finally the Newline or Indent for the first line of the new block is
+generated, unless the Newline needs to be suppressed because it
+appeared at the end of the previous block.
+
+This means that a block may start with an Undent or an Indent, but
+will only start with a Newline if it actually starts with a blank
+line.
+
+We will need to represent in the `token_state` where in this sequence
+of delayed tokens we are. As `state.col` records the target indent we
+don't need to record how many undents or indents are needed. We do
+need to record the number of blank lines, and which of Newline and
+Undent is needed next in the initial sequence of pairs.
+
+For this we store one more than the number of blank lines as
+`delayed_lines` and a flag for `undent_next`.
+
+###### state fields
+ int check_indent;
+ int delayed_lines;
+ int undent_next;
+
+Generating these tokens involve two separate pieces of code.
+
+Firstly we need to recognise white space and count the indents and
+newlines. These are recorded in the above state fields.
+
+Separately, on each call to `token_next`, we need to check if
+there are some delayed tokens and if so we need to advance the state
+information and return one token.
+
+###### white space
+ if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
+ int newlines = 0;
+ int was_son = at_son(state);
+ if (ignored & (1<<TK_indent)) {
+ if (!is_newline(ch))
+ continue;
+ if (ignored & (1<<TK_newline))
+ continue;
+ tk.num = TK_newline;
+ return tk;
+ }
+ // Indents are needed, so check all white space.
+ while (ch <= ' ' && !at_eon(state)) {
+ if (is_newline(ch))
+ newlines += 1;
+ ch = get_char(state);
+ }
+ if (at_eon(state)) {
+ newlines += 1;
+ if (state->node->next &&
+ state->node->next->indent > state->node->indent)
+ state->col = state->node->next->indent;
+ else
+ state->col = state->node->indent;
+ } else
+ unget_char(state);
+ state->delayed_lines = newlines;
+ state->undent_next = was_son;
+ state->check_indent = 1;
+ continue;
+ }
+
+
+###### delayed tokens
+
+ if (state->check_indent || state->delayed_lines) {
+ if (state->col < state->indent_sizes[state->indent_level]) {
+ if (!state->undent_next &&
+ !(ignored & (1<<TK_newline))) {
+ state->undent_next = 1;
+ tk.num = TK_newline;
+ return tk;
+ }
+ state->indent_level -= 1;
+ state->undent_next = 0;
+ tk.num = TK_undent;
+ return tk;
+ }
+ if (state->col > state->indent_sizes[state->indent_level] &&
+ state->indent_level < sizeof(state->indent_sizes)-1) {
+ state->indent_level += 1;
+ state->indent_sizes[state->indent_level] = state->col;
+ state->delayed_lines -= 1;
+ tk.num = TK_indent;
+ return tk;
+ }
+ state->check_indent = 0;
+ if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
+ tk.num = TK_newline;
+ state->delayed_lines -= 1;
+ return tk;
+ }
+ state->delayed_lines = 0;
+ continue;
+ }
+
+### End of File
+
+After the last newline in the file has been processed, a special
+end-of-file token will be returned. Any further attempts to get more
+tokens will continue to return the same end-of-file token.
+
+###### token types
+ TK_eof,
+
+
+###### white space
+ if (ch == WEOF) {
+ tk.num = TK_eof;
+ return tk;
+ }
+
+### Unknown Marks, or errors.
+
+We have now handled all the possible known mark-like tokens.
+If the token we have is not empty and `TK_mark` is allowed,
+we have an unknown mark, otherwise this must be an error.
+
+###### unknown mark
+ /* one unknown character */
+ close_token(state, &tk);
+ tk.num = TK_error;
+ return tk;
+
+## Tools For The Task
+
+You may have noticed that there are a few gaps we left in the above -
+functions used without first defining them. Doing so above would have
+broken the flow.
+
+### Character by character
+
+As we walk through the various `code_node`s we need to process whole
+Unicode codepoints, and keep track of which line and column we are on.
+We will assume for now that any printing character uses one column,
+though that is not true in general.
+
+As the text in a `code_node` may include an indent that identifies it as
+being code, we need to be careful to strip that. The `code_node` has
+a flag that tells us whether or not we need to strip.
+
+###### includes
+ #include <memory.h>
+
+###### state fields
+ struct code_node *node;
+ int offset;
+ int line;
+ int col;
+
+###### internal functions
+
+	static void do_strip(struct token_state *state)
+	{
+		if (state->node->needs_strip) {
+			int n = 4;
+			while (n && state->node->code.txt[state->offset] == ' ') {
+				state->offset += 1;
+				n -= 1;
+			}
+			while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+				state->offset += 1;
+				n -= 4;
+			}
+		}
+	}
+
+ static wint_t get_char(struct token_state *state)
+ {
+ wchar_t next;
+ size_t n;
+ mbstate_t mbstate;
+
+ if (state->node == NULL)
+ return WEOF;
+ if (state->node->code.len <= state->offset) {
+ do
+ state->node = state->node->next;
+ while (state->node && state->node->code.txt == NULL);
+ state->offset = 0;
+ if (state->node == NULL)
+ return WEOF;
+ do_strip(state);
+ state->line = state->node->line_no;
+ state->col = state->node->indent;
+ }
+
+ ## before get_char
+
+ memset(&mbstate, 0, sizeof(mbstate));
+
+ n = mbrtowc(&next, state->node->code.txt + state->offset,
+ state->node->code.len - state->offset,
+ &mbstate);
+ if (n == -2 || n == 0) {
+ /* Not enough bytes - not really possible */
+ next = '\n';
+ state->offset = state->node->code.len;
+ } else if (n == -1) {
+ /* error */
+ state->offset += 1;
+ next = 0x7f; // an illegal character
+ } else
+ state->offset += n;
+
+ if (next >= ' ') {
+ state->col += 1;
+ } else if (is_newline(next)) {
+ state->line += 1;
+ state->col = state->node->indent;
+ do_strip(state);
+ } else if (next == '\t') {
+ state->col = indent_tab(state->col);
+ }
+ return next;
+ }
+
+We will sometimes want to "unget" the last character as it needs to be
+considered again as part of the next token. So we need to store a
+'previous' version of all metadata.
+
+###### state fields
+ int prev_offset;
+ int prev_line;
+ int prev_col;
+
+###### before get_char
+ state->prev_offset = state->offset;
+ state->prev_line = state->line;
+ state->prev_col = state->col;
+
+###### internal functions
+
+ static void unget_char(struct token_state *state)
+ {
+ if (state->node) {
+ state->offset = state->prev_offset;
+ state->line = state->prev_line;
+ state->col = state->prev_col;
+ }
+ }
+
+We occasionally need a double-unget, particularly for numbers and
+block comments. We don't impose this cost on all scanning, but
+require those code sections that need it to call `save_unget_state`
+before each `get_char`, and then `restore_unget_state` when a
+double-unget is needed.
+
+###### state fields
+ int prev_offset2;
+ int prev_line2;
+ int prev_col2;
+
+###### internal functions
+ static void save_unget_state(struct token_state *state)
+ {
+ state->prev_offset2 = state->prev_offset;
+ state->prev_line2 = state->prev_line;
+ state->prev_col2 = state->prev_col;
+ }
+
+ static void restore_unget_state(struct token_state *state)
+ {
+ state->prev_offset = state->prev_offset2;
+ state->prev_line = state->prev_line2;
+ state->prev_col = state->prev_col2;
+ }
+
+At the start of a token we don't want to be at the end of a code block
+if we can help it. To avoid this possibility, we 'get' and 'unget' a
+single character. This will move into the next non-empty code block
+and leave the current pointer at the start of it.
+
+This has to happen _after_ dealing with delayed tokens as some of them
+must appear in the previous node. When we do this, we need to reset
+the data in the token.
+
+###### delayed tokens
+ if (at_eon(state)) {
+ get_char(state);
+ unget_char(state);
+ tk.node = state->node;
+ if (state->node)
+ tk.txt.txt = state->node->code.txt + state->offset;
+ tk.line = state->line;
+ tk.col = state->col;
+ tk.txt.len = 0;
+ }
+
+### Managing tokens
+
+The current token is initialized to line up with the first character
+that we 'get' for each token. When we have, or might have, a full
+token we can call `close_token` to set the `len` of the token
+appropriately. This can safely be called multiple times.
+
+Finally we occasionally (for single-line strings and block comments)
+need to reset to the beginning of the current token as we might have
+parsed too much already. For that there is `reset_token`.
+
+###### one token
+ tk.node = state->node;
+ if (state->node)
+ tk.txt.txt = state->node->code.txt + state->offset;
+ tk.line = state->line;
+ tk.col = state->col;
+ tk.txt.len = 0;
+
+###### internal functions
+
+ static void close_token(struct token_state *state,
+ struct token *tk)
+ {
+ tk->txt.len = (state->node->code.txt + state->offset)
+ - tk->txt.txt;
+ }
+
+ static void reset_token(struct token_state *state, struct token *tok)
+ {
+ state->prev_line = tok->line;
+ state->prev_col = tok->col;
+ state->prev_offset = tok->txt.txt - state->node->code.txt;
+ unget_char(state);
+ tok->txt.len = 0;
+ }
+
+
+Tokens may not cross into the next `code_node`, and some tokens can
+include the newline at the end of a `code_node`, so we must be able to
+easily check if we have reached the end. Equally we need to know if
+we are at the start of a node, as white space is treated a little
+differently there.
+
+###### internal functions
+
+ static int at_son(struct token_state *state)
+ {
+ return state->offset == 0;
+ }
+
+ static int at_eon(struct token_state *state)
+ {
+ // at end-of-node ??
+ return state->node == NULL ||
+ state->offset >= state->node->code.len;
+ }
+
+### Find a known word
+
+As the known-word list is sorted we can use a simple binary search.
+Following the pattern established in "mdcode", we will use a `struct
+text` with start and length to represent the code fragment we are
+searching for.
+
+###### internal functions
+ static int find_known(struct token_config *conf, struct text txt)
+ {
+ int lo = 0;
+ int hi = conf->known_count;
+
+ while (lo + 1 < hi) {
+ int mid = (lo + hi) / 2;
+ int cmp = strncmp(conf->words_marks[mid],
+ txt.txt, txt.len);
+ if (cmp == 0 && conf->words_marks[mid][txt.len])
+ cmp = 1;
+ if (cmp <= 0)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (strncmp(conf->words_marks[lo],
+ txt.txt, txt.len) == 0
+ && conf->words_marks[lo][txt.len] == 0)
+ return lo;
+ else
+ return -1;
+ }
+
+### Bringing it all together
+
+Now we have all the bits there is just one section missing: combining
+all the token parsing code into one block.
+
+The handling of delayed tokens (newlines, indents, undents) must come
+first before we try getting another character.
+
+Then we parse all the text, making sure that we check for known marks
+before strings and comments, but unknown marks after strings and comments.
+
+This block of code will either return a token, or will choose to
+ignore one, in which case it will `continue` around to the top of the
+loop.
+
+###### one token
+ ## delayed tokens
+
+ ch = get_char(state);
+
+ ## white space
+ ## parse number
+ ## parse word
+ ## parse mark
+ ## parse string
+ ## parse comment
+ ## unknown mark
+
+### Start and stop
+
+As well as getting tokens, we need to be able to create the
+`token_state` to start with, and discard it later.
+
+###### includes
+ #include <malloc.h>
+
+###### main functions
+ struct token_state *token_open(struct code_node *code, struct
+ token_config *conf)
+ {
+ struct token_state *state = malloc(sizeof(*state));
+ memset(state, 0, sizeof(*state));
+ state->node = code;
+ state->line = code->line_no;
+ state->conf = conf;
+ return state;
+ }
+ void token_close(struct token_state *state)
+ {
+ free(state);
+ }
+
+###### exported functions
+ struct token_state *token_open(struct code_node *code, struct
+ token_config *conf);
+ void token_close(struct token_state *state);
+
+### Trace tokens
+
+Getting tokens is the main thing but it is also useful to be able to
+print out token information, particularly for tracing and testing.
+
+Known tokens are printed verbatim. Other tokens are printed as
+`type(content)` where content is truncated to a given number of characters.
+
+The function for printing a truncated string (`text_dump`) is also exported
+so that it can be used to tracing processed strings too.
+
+###### includes
+ #include <stdio.h>
+
+###### exported functions
+ void token_trace(FILE *f, struct token tok, int max);
+ void text_dump(FILE *f, struct text t, int max);
+
+###### main functions
+
+ /* Print up to `max` bytes of `txt` to `f`. Printable ASCII is
+ * emitted as-is, backslash is doubled, and any other byte is
+ * shown as a \xNN escape. When the text is longer than `max`,
+ * two positions are reserved for a ".." truncation marker.
+ * NOTE(review): `max` bounds the bytes consumed, not the output
+ * width -- escaped bytes expand to four characters each.
+ */
+ void text_dump(FILE *f, struct text txt, int max)
+ {
+ int i;
+ if (txt.len > max)
+ max -= 2;
+ else
+ max = txt.len;
+ for (i = 0; i < max; i++) {
+ char c = txt.txt[i];
+ if (c < ' ' || c > '~')
+ fprintf(f, "\\x%02x", c & 0xff);
+ else if (c == '\\')
+ fprintf(f, "\\\\");
+ else
+ fprintf(f, "%c", c);
+ }
+ if (i < txt.len)
+ fprintf(f, "..");
+ }
+
+ /* Print a one-line description of `tok` to `f`. Known words
+ * and marks are printed verbatim; all other tokens print as
+ * type(content), with the content truncated to `max` bytes
+ * by text_dump().
+ */
+ void token_trace(FILE *f, struct token tok, int max)
+ {
+ /* must be kept in sync with the TK_* token numbers */
+ static char *types[] = {
+ [TK_ident] = "ident",
+ [TK_mark] = "mark",
+ [TK_number] = "number",
+ [TK_string] = "string",
+ [TK_multi_string] = "mstring",
+ [TK_line_comment] = "lcomment",
+ [TK_block_comment] = "bcomment",
+ [TK_indent] = "indent",
+ [TK_undent] = "undent",
+ [TK_newline] = "newline",
+ [TK_eof] = "eof",
+ [TK_error] = "ERROR",
+ };
+
+ switch (tok.num) {
+ default: /* known word or mark */
+ fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
+ break;
+ case TK_indent:
+ case TK_undent:
+ case TK_newline:
+ case TK_eof:
+ /* No token text included */
+ fprintf(f, "%s()", types[tok.num]);
+ break;
+ case TK_ident:
+ case TK_mark:
+ case TK_number:
+ case TK_string:
+ case TK_multi_string:
+ case TK_line_comment:
+ case TK_block_comment:
+ case TK_error:
+ fprintf(f, "%s(", types[tok.num]);
+ text_dump(f, tok.txt, max);
+ fprintf(f, ")");
+ break;
+ }
+ }
+
+### And there we have it
+
+We now have all the library functions defined for reading and printing
+tokens. Now we just need C files to store them, and a mk file to make them.
+
+###### File: scanner.h
+ ## public types
+ ## exported functions
+
+###### File: libscanner.c
+ ## includes
+ #include "scanner.h"
+ ## private types
+ ## internal functions
+ ## main functions
+
+###### File: scanner.mk
+
+ CFLAGS += -Wall -g
+ all ::
+ scanner.mk scanner.h libscanner.c : scanner.mdc
+ ./md2c scanner.mdc
+ all :: libscanner.o
+ libscanner.o : libscanner.c
+ $(CC) $(CFLAGS) -c libscanner.c
+
+## Processing numbers
+
+Converting a `TK_number` token to a numerical value is a slightly
+higher level task than lexical analysis, and slightly lower than
+grammar parsing, so put it here - as an appendix if you like.
+
+Importantly it will be used by the same testing rig that is used for
+testing the token scanner.
+
+The numeric value that we will convert all numbers into is the `mpq_t`
+from the GNU high precision number library "libgmp".
+
+###### number includes
+ #include <gmp.h>
+ #include "mdcode.h"
+
+Firstly we need to be able to parse a string of digits in a given base
+and possibly with a decimal marker. We store this in an `mpz_t`
+integer and report the number of digits after the decimal mark.
+
+On error we return zero and ensure that the `mpz_t` has been freed, or
+had never been initialised.
+
+###### number functions
+
+ static int parse_digits(mpz_t num, struct text tok, int base,
+ int *placesp)
+ {
+ /* Accept digits up to 'base', ignore '_' and
+ * ' ' if they appear between two legal digits,
+ * and if `placesp` is not NULL, allow a single
+ * '.' or ',' and report the number of digits
+ * beyond there.
+ * Return number of characters processed (p),
+ * or 0 if something illegal was found.
+ */
+ int p;
+ int decimal = -1; // digits after marker
+ enum {Digit, Space, Other} prev = Other;
+ int digits = 0;
+
+ for (p = 0; p < tok.len; p++) {
+ int dig;
+ char c = tok.txt[p];
+
+ /* separators are only legal between two digits */
+ if (c == '_' || c == ' ') {
+ if (prev != Digit)
+ goto bad;
+ prev = Space;
+ continue;
+ }
+ if (c == '.' || c == ',') {
+ if (prev != Digit)
+ goto bad;
+ /* NOTE(review): this early return (second marker,
+ * or marker not wanted) leaves *placesp unset, and
+ * p-1 excludes the digit just before the marker
+ * even though it was already folded into num --
+ * confirm callers cope with both. */
+ if (!placesp || decimal >= 0)
+ return p-1;
+ decimal = 0;
+ prev = Other;
+ continue;
+ }
+ if (isdigit(c))
+ dig = c - '0';
+ else if (isupper(c))
+ dig = 10 + c - 'A';
+ else if (islower(c))
+ dig = 10 + c - 'a';
+ else
+ /* force the out-of-range test below */
+ dig = base;
+ if (dig >= base) {
+ /* do not consume a trailing separator */
+ if (prev == Space)
+ p--;
+ break;
+ }
+ prev = Digit;
+ /* defer mpz_init to the first digit, so the
+ * zero-digit paths never leave num initialised */
+ if (digits)
+ mpz_mul_ui(num, num, base);
+ else
+ mpz_init(num);
+ digits += 1;
+ mpz_add_ui(num, num, dig);
+ if (decimal >= 0)
+ decimal++;
+ }
+ if (digits == 0)
+ return 0;
+ if (placesp) {
+ if (decimal >= 0)
+ *placesp = decimal;
+ else
+ *placesp = 0;
+ }
+ return p;
+ bad:
+ if (digits)
+ mpz_clear(num);
+ return 0;
+ }
+
+###### number includes
+ #include <ctype.h>
+
+To parse a full number we need to consider the optional base, the
+mantissa, and the optional exponent. We will treat these one at a
+time.
+
+The base is indicated by a letter after a leading zero, which must be
+followed by a base letter or a period. The base also determines the
+character which will mark an exponent.
+
+###### number vars
+ int base = 10;
+ char expc = 'e';
+
+###### parse base
+
+ /* A leading zero may introduce a base prefix: 0x/0X for hex,
+ * 0o/0O for octal, 0b/0B for binary. Non-decimal bases use
+ * 'p' rather than 'e' as the exponent marker. A digit (or
+ * separator) directly after the zero is rejected. */
+ if (tok.txt[0] == '0' && tok.len > 1) {
+ int skip = 0;
+ switch(tok.txt[1]) {
+ case 'x':
+ case 'X':
+ base = 16;
+ skip = 2;
+ expc = 'p';
+ break;
+ case 'o':
+ case 'O':
+ base = 8;
+ skip = 2;
+ expc = 'p';
+ break;
+ case 'b':
+ case 'B':
+ base = 2;
+ skip = 2;
+ expc = 'p';
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case '_':
+ case ' ':
+ // another digit is not permitted
+ // after a zero.
+ return 0;
+ default:
+ // must be decimal marker or trailing
+ // letter, which are OK;
+ break;
+ }
+ tok.txt += skip;
+ tok.len -= skip;
+ }
+
+After the base is the mantissa, which may contain a decimal mark, so
+we need to record the number of places. We won't impose the number of
+places until we have the exponent as well.
+
+###### number vars
+ int places =0;
+ mpz_t mant;
+ int d;
+
+###### parse mantissa
+
+ /* Parse the mantissa digits (with optional decimal marker)
+ * into `mant`; `places` counts digits after the marker.
+ * The value is moved into the rational `num` immediately. */
+ d = parse_digits(mant, tok, base, &places);
+ if (d == 0)
+ return 0;
+ tok.txt += d;
+ tok.len -= d;
+ mpq_init(num);
+ mpq_set_z(num, mant);
+ mpz_clear(mant);
+
+After the mantissa number may come an exponent which may be positive
+or negative. We assume at this point that we have seen the exponent
+character `expc`.
+
+###### number vars
+ long lexp = 0;
+ mpz_t exp;
+ int esign = 1;
+
+###### parse exponent
+ /* Optional sign, then decimal digits; the exponent must fit
+ * in a signed long or the whole number is rejected. */
+ if (tok.len > 1) {
+ if (tok.txt[0] == '+') {
+ tok.txt++;
+ tok.len--;
+ } else if (tok.txt[0] == '-') {
+ esign = -1;
+ tok.txt++;
+ tok.len--;
+ }
+ }
+ d = parse_digits(exp, tok, 10, NULL);
+ if (d == 0) {
+ mpq_clear(num);
+ return 0;
+ }
+ if (!mpz_fits_slong_p(exp)) {
+ mpq_clear(num);
+ mpz_clear(exp);
+ return 0;
+ }
+ lexp = mpz_get_si(exp) * esign;
+ mpz_clear(exp);
+ tok.txt += d;
+ tok.len -= d;
+
+
+Now that we have the mantissa and the exponent we can multiply them
+together, also allowing for the number of digits after the decimal
+mark.
+
+For base 10, we simply subtract the decimal places from the exponent.
+For the other bases, as the exponent is always based on 2, even for
+octal and hex, we need a bit more detail.
+We then recover the sign from the exponent, as division is quite
+different from multiplication.
+
+###### calc exponent
+ /* Fold the digits-after-the-marker into the exponent:
+ * one unit per decimal place for base 10 (and one bit per
+ * binary digit), 4 bits per hex digit, 3 bits per octal
+ * digit, since non-decimal exponents are powers of two. */
+ switch (base) {
+ case 10:
+ case 2:
+ lexp -= places;
+ break;
+ case 16:
+ lexp -= 4*places;
+ break;
+ case 8:
+ lexp -= 3*places;
+ break;
+ }
+ /* separate magnitude and sign: multiplication and division
+ * are handled as distinct cases when applying the exponent */
+ if (lexp < 0) {
+ lexp = -lexp;
+ esign = -1;
+ } else
+ esign = 1;
+
+Imposing the exponent on the number is also very different for base 10
+than for the others. For the binary shift `gmp` provides a simple
+function. For base 10 we use something like Russian Peasant
+Multiplication.
+
+###### calc exponent
+ /* Apply the exponent. Decimal exponents multiply or divide
+ * by 10^lexp using repeated squaring (Russian Peasant
+ * Multiplication); binary exponents use gmp's shifts.
+ * Fix: the test was `esign > 1`, which is never true since
+ * esign is +/-1, so positive decimal exponents were divided
+ * instead of multiplied. It must be `esign > 0`, matching
+ * the binary branch below. */
+ if (expc == 'e') {
+ mpq_t tens;
+ mpq_init(tens);
+ mpq_set_ui(tens, 10, 1);
+ while (1) {
+ if (lexp & 1) {
+ if (esign > 0)
+ mpq_mul(num, num, tens);
+ else
+ mpq_div(num, num, tens);
+ }
+ lexp >>= 1;
+ if (lexp == 0)
+ break;
+ mpq_mul(tens, tens, tens);
+ }
+ mpq_clear(tens);
+ } else {
+ if (esign > 0)
+ mpq_mul_2exp(num, num, lexp);
+ else
+ mpq_div_2exp(num, num, lexp);
+ }
+
+Now we are ready to parse a number: the base, mantissa, and exponent.
+If all goes well we check for the possible trailing letters and
+return. Return value is 1 for success and 0 for failure.
+
+
+###### number functions
+ /* Parse the whole number in `tok` into `num`, which is
+ * initialised here on success. Up to two trailing letters
+ * are copied, NUL terminated, into tail[3]. Returns 1 on
+ * success; 0 on failure, with `num` cleared.
+ */
+ int number_parse(mpq_t num, char tail[3], struct text tok)
+ {
+ ## number vars
+ int i;
+
+ ## parse base
+ ## parse mantissa
+ /* an optional exponent introduced by expc (either case) */
+ if (tok.len > 1 && (tok.txt[0] == expc ||
+ tok.txt[0] == toupper(expc))) {
+ tok.txt++;
+ tok.len--;
+ ## parse exponent
+ }
+ ## calc exponent
+
+ /* at most two trailing letters are permitted */
+ for (i = 0; i < 2; i++) {
+ if (tok.len <= i)
+ break;
+ if (!isalpha(tok.txt[i]))
+ goto err;
+ tail[i] = tok.txt[i];
+ }
+ tail[i] = 0;
+ if (i == tok.len)
+ return 1;
+ err:
+ mpq_clear(num);
+ return 0;
+ }
+
+Number parsing goes in `libnumber.c`
+
+###### File: libnumber.c
+
+ #include <unistd.h>
+ #include <stdlib.h>
+
+ ## number includes
+ ## number functions
+
+###### File: number.h
+ int number_parse(mpq_t num, char tail[3], struct text tok);
+
+###### File: scanner.mk
+ all :: libnumber.o
+ libnumber.o : libnumber.c
+ $(CC) $(CFLAGS) -c libnumber.c
+
+## Processing strings
+
+Both `TK_string` and `TK_multi_string` require post-processing which
+can be one of two types: literal or with escapes processed.
+Even literal processing is non-trivial as the file may contain indents
+which need to be stripped.
+
+Errors can only occur when processing escapes. Any unrecognised
+character following the escape character will cause an error.
+
+Processing escapes and stripping indents can only make the string
+shorter, not longer, so we allocate a buffer which is the same size as
+the string and process into that.
+
+To request escape processing, we pass the character we want to use for
+quoting, usually '`\`'. To avoid escape processing we pass a zero.
+
+###### string main
+ /* Post-process the string token `tok` into a freshly
+ * malloced buffer reported via `str`. `escape` is the
+ * escape character to interpret, or 0 for literal text.
+ * Up to two trailing letters are copied into tail[3].
+ * Returns 1 on success; 0 on failure with str->txt NULL.
+ */
+ int string_parse(struct token *tok, char escape,
+ struct text *str, char tail[3])
+ {
+ ## string vars
+ struct text t = tok->txt;
+
+ str->txt = NULL;
+ ## strip tail
+ if (tok->num == TK_string) {
+ ## strip single
+ } else {
+ ## strip multi
+ }
+ /* processing can only shrink the text, so t.len bytes
+ * is always a sufficient buffer */
+ str->txt = malloc(t.len);
+ str->len = 0;
+
+ ## process string
+ return 1;
+ err:
+ free(str->txt);
+ str->txt = NULL;
+ return 0;
+ }
+
+### strip tail
+
+The tail of the string can be 0, 1, or 2 letters
+
+ i = t.len;
+ /* Peel at most two trailing ASCII letters into `tail`.
+ * Fix: the guards were `i >= 0`, which reads t.txt[-1]
+ * when i reaches 0 (empty or all-letter text); `i > 0`
+ * keeps the index in bounds. */
+ if (i > 0 && isalpha(t.txt[i-1]))
+ i -= 1;
+ if (i > 0 && isalpha(t.txt[i-1]))
+ i -= 1;
+ strncpy(tail, t.txt+i, t.len-i);
+ tail[t.len-i] = 0;
+ t.len = i;
+
+###### string vars
+ int i;
+
+### strip single
+
+Stripping the quote of a single-line string is trivial.
+The only part that is at all interesting is that quote character must
+be remembered.
+
+ /* remember the quote so \q escapes can reproduce it */
+ quote = t.txt[0];
+ if (t.txt[t.len-1] != quote)
+ goto err;
+ /* NOTE(review): a 1-byte token would leave t.len == -1
+ * after this; presumably the scanner never emits an
+ * unterminated TK_string -- confirm. */
+ t.txt += 1;
+ t.len -= 2;
+
+###### string vars
+ char quote;
+
+### strip multi
+
+For a multi-line string we have a little more work to do. We need to
+remove 3 quotes, not 1, and need to count the indent of the close
+quote as it will need to be stripped from all lines.
+
+ /* A multi-line string opens with three quotes followed by a
+ * newline and closes with three quotes; the indent of the
+ * closing line must later be stripped from every line. */
+ quote = t.txt[0];
+ if (t.len < 7 ||
+ t.txt[1] != quote || t.txt[2] != quote ||
+ !is_newline(t.txt[3]))
+ goto err;
+ t.txt += 4;
+ t.len -= 4;
+ /* strip the three closing quotes, one at a time */
+ i = t.len;
+ if (i <= 0 || t.txt[i-1] != quote)
+ goto err;
+ i -= 1;
+ if (i <= 0 || t.txt[i-1] != quote)
+ goto err;
+ i -= 1;
+ if (i <= 0 || t.txt[i-1] != quote)
+ goto err;
+ i -= 1;
+ t.len = i;
+ /* back up to the start of the closing-quote line ... */
+ while (i > 0 && !is_newline(t.txt[i-1]))
+ i--;
+ /* ... and measure its indent in columns */
+ indent = 0;
+ while (i < t.len) {
+ if (t.txt[i] == ' ')
+ indent += 1;
+ if (t.txt[i] == '\t')
+ indent = indent_tab(indent);
+ i++;
+ }
+
+###### string vars
+ int indent = 0;
+
+### process string
+
+Now we just take one byte at a time. trans-ASCII unicode won't look
+like anything we are interested in so it will just be copied byte by
+byte.
+
+ /* Copy one byte at a time into the output buffer, stripping
+ * the stored indent at each start of line and interpreting
+ * escapes (when `escape` is non-zero). UTF-8 continuation
+ * bytes can never equal an ASCII escape character, so
+ * multi-byte sequences pass through unchanged. */
+ cp = str->txt;
+ at_sol = 1;
+ for (i = 0; i < t.len; i++) {
+ char c;
+ if (at_sol) {
+ at_sol = 0;
+ ## strip indent
+ if (i >= t.len)
+ break;
+ }
+ c = t.txt[i];
+ if (c != escape) {
+ *cp = c;
+ cp += 1;
+ if (is_newline(c))
+ at_sol = 1;
+ } else if (i+1 >= t.len) {
+ // escape and end of string
+ goto err;
+ } else {
+ i += 1;
+ c = t.txt[i];
+ ## parse escape
+ }
+ }
+ str->len = cp - str->txt;
+
+###### string vars
+ char *cp;
+ int at_sol;
+
+### strip indent
+
+Every time we find a start of line, we strip spaces and tabs until the
+required indent is found.
+
+ /* At start of line, consume spaces and tabs until the
+ * indent of the closing quote has been skipped.
+ * Fix: a tab advanced the column with indent_tab(c) --
+ * passing the tab character itself ('\t' == 9) rather
+ * than the current column; the multi-line indent counter
+ * passes the running column, so do the same here. */
+ int skipped = 0;
+ while (i < t.len && skipped < indent) {
+ c = t.txt[i];
+ if (c == ' ')
+ skipped += 1;
+ else if (c == '\t')
+ skipped = indent_tab(skipped);
+ else
+ break;
+ i+= 1;
+ }
+
+### parse escape
+ /* Interpret the character after the escape: the simple C
+ * escapes, \q for the quote character, 3-digit octal,
+ * \xNN hex, and \u/\U unicode (4 or 8 hex digits encoded
+ * via wcrtomb). An escaped newline re-arms indent
+ * stripping; anything unrecognised is an error. */
+ switch (c) {
+ case 'n':
+ *cp++ = '\n'; break;
+ case 'r':
+ *cp++ = '\r'; break;
+ case 't':
+ *cp++ = '\t'; break;
+ case 'b':
+ *cp++ = '\b'; break;
+ case 'q':
+ *cp++ = quote; break;
+ case 'f':
+ *cp++ = '\f'; break;
+ case 'v':
+ *cp++ = '\v'; break;
+ case 'a':
+ *cp++ = '\a'; break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ // 3 digit octal number
+ if (i+2 >= t.len)
+ goto err;
+ // fix: the second range test re-checked t.txt[i+1],
+ // so a bad third digit (e.g. \018) slipped through
+ if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
+ t.txt[i+2] < '0' || t.txt[i+2] > '7')
+ goto err;
+ n = (t.txt[i ]-'0') * 64 +
+ (t.txt[i+1]-'0') * 8 +
+ (t.txt[i+2]-'0') * 1;
+ *cp++ = n;
+ i += 2;
+ break;
+ case 'x':
+ // 2 hex digits
+ n = take_hex(2, t.txt+i+1, t.len-i-1);
+ if (n < 0)
+ goto err;
+ *cp++ = n;
+ i += 2;
+ break;
+ case 'u':
+ case 'U':
+ // 4 or 8 hex digits for unicode
+ n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
+ if (n < 0)
+ goto err;
+ memset(&pstate, 0, sizeof(pstate));
+ n = wcrtomb(cp, n, &pstate);
+ if (n <= 0)
+ goto err;
+ cp += n;
+ i += c == 'u' ? 4 : 8;
+ break;
+ default:
+ if (c == escape)
+ *cp++ = c;
+ else if (is_newline(c))
+ at_sol = 1;
+ else
+ goto err;
+ }
+
+###### string vars
+ long n;
+ mbstate_t pstate;
+
+For `\x` `\u` and `\U` we need to collect a specific number of
+hexadecimal digits
+
+###### string functions
+
+ /* Read exactly `digits` hexadecimal digits from `cp` (which
+ * holds `l` bytes) and return their value, or -1 when there
+ * are too few characters or a non-hex character is found.
+ * NOTE(review): 8 digits can exceed a 32-bit long; fine on
+ * LP64, not portable to ILP32 -- confirm target platforms.
+ */
+ static long take_hex(int digits, char *cp, int l)
+ {
+ long n = 0;
+ if (l < digits)
+ return -1;
+ while (digits) {
+ /* use unsigned char so a high byte is never
+ * passed to isxdigit() as a negative value
+ * (undefined behaviour per C ctype rules) */
+ unsigned char c = *cp;
+ int d;
+ if (!isxdigit(c))
+ return -1;
+ if (isdigit(c))
+ d = c - '0';
+ else if (isupper(c))
+ d = 10 + c - 'A';
+ else
+ d = 10 + c - 'a';
+ n = n * 16 + d;
+ digits--;
+ cp++;
+ }
+ return n;
+ }
+
+###### File: libstring.c
+
+String parsing goes in `libstring.c`
+
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
+ #include <ctype.h>
+ #include <wchar.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ ## string functions
+ ## string main
+
+###### File: string.h
+ int string_parse(struct token *tok, char escape,
+ struct text *str, char tail[3]);
+
+###### File: scanner.mk
+ all :: libstring.o
+ libstring.o : libstring.c
+ $(CC) $(CFLAGS) -c libstring.c
+
+
+## Testing
+
+As "untested code is buggy code" we need a program to easily test
+the scanner library. This will simply parse a given file and report
+the tokens one per line.
+
+###### File: scanner.c
+
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <sys/mman.h>
+ #include <string.h>
+ #include <stdio.h>
+ #include <gmp.h>
+ #include <locale.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ #include "number.h"
+ #include "string.h"
+
+ /* error count, reported via the exit status */
+ static int errs;
+ /* Error callback for code_extract(): count and report. */
+ static void pr_err(char *msg)
+ {
+ errs++;
+ fprintf(stderr, "%s\n", msg);
+ }
+
+ /* Tokenize every section of the given mdcode file, printing
+ * one token per line; number and string tokens are also fed
+ * through number_parse()/string_parse() to exercise them.
+ * Exits non-zero if any error token (or extract error) was
+ * seen.
+ */
+ int main(int argc, char *argv[])
+ {
+ int fd;
+ int len;
+ char *file;
+ struct token_state *state;
+ char *known[] = {
+ "==",
+ "else",
+ "if",
+ "then",
+ "while",
+ "{",
+ "}",
+ };
+ struct token_config conf = {
+ .word_start = "_$",
+ .word_cont = "",
+ .words_marks = known,
+ .number_chars = "., _+-",
+ .known_count = sizeof(known)/sizeof(known[0]),
+ .ignored = (0 << TK_line_comment)
+ |(0 << TK_block_comment),
+ };
+ struct section *table, *s, *prev;
+ setlocale(LC_ALL,"");
+ if (argc != 2) {
+ fprintf(stderr, "Usage: scanner file\n");
+ exit(2);
+ }
+ fd = open(argv[1], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "scanner: cannot open %s: %s\n",
+ argv[1], strerror(errno));
+ exit(1);
+ }
+ /* NOTE(review): 2 is SEEK_END; lseek, mmap and
+ * code_extract results are not checked, and len is an
+ * int -- acceptable only because this is a test rig. */
+ len = lseek(fd, 0, 2);
+ file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ table = code_extract(file, file+len, pr_err);
+
+ /* walk the sections, freeing each one after tokenizing */
+ for (s = table; s;
+ (code_free(s->code), prev = s, s = s->next, free(prev))) {
+ printf("Tokenizing: %.*s\n", s->section.len,
+ s->section.txt);
+ state = token_open(s->code, &conf);
+ while(1) {
+ struct token tk = token_next(state);
+ printf("%d:%d ", tk.line, tk.col);
+ token_trace(stdout, tk, 20);
+ /* exercise the number post-processor */
+ if (tk.num == TK_number) {
+ mpq_t num;
+ char tail[3];
+ if (number_parse(num, tail,tk.txt)) {
+ printf(" %s ", tail);
+ mpq_out_str(stdout, 10, num);
+ mpq_clear(num);
+ } else
+ printf(" BAD NUMBER");
+ }
+ /* exercise the string post-processor;
+ * back-quoted strings are taken literally */
+ if (tk.num == TK_string ||
+ tk.num == TK_multi_string) {
+ char esc = '\\';
+ struct text str;
+ char tail[3];
+ if (tk.txt.txt[0] == '`')
+ esc = 0;
+ if (string_parse(&tk, esc,
+ &str, tail)) {
+ printf(" %s ", tail);
+ text_dump(stdout, str, 20);
+ free(str.txt);
+ } else
+ printf(" BAD STRING");
+ }
+ printf("\n");
+ if (tk.num == TK_error)
+ errs = 1;
+ if (tk.num == TK_eof)
+ break;
+ }
+ }
+ exit(!!errs);
+ }
+###### File: scanner.mk
+ scanner.c : scanner.mdc
+ ./md2c scanner.mdc
+ all :: scanner
+ # The link step uses libnumber.o and libstring.o, so they must
+ # be prerequisites too, or make may link before they are built
+ # and will not relink when they change.
+ scanner : scanner.o scanner.h libscanner.o libnumber.o libstring.o libmdcode.o mdcode.h
+ $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
+ libmdcode.o libnumber.o libstring.o -licuuc -lgmp
+ scanner.o : scanner.c
+ $(CC) $(CFLAGS) -c scanner.c
+