From 6b6ee9a12345d8e5c8fc7e27d2106b1e7fb15f67 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Sat, 22 Jun 2013 19:18:55 +1000 Subject: [PATCH] scanner.mdc: lexical scanner for Ocean. This scanner does lexical analysis and produces tokens. It also handles numbers and escapes in strings. Signed-off-by: NeilBrown --- csrc/scanner.mdc | 1967 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1967 insertions(+) create mode 100644 csrc/scanner.mdc diff --git a/csrc/scanner.mdc b/csrc/scanner.mdc new file mode 100644 index 0000000..547a037 --- /dev/null +++ b/csrc/scanner.mdc @@ -0,0 +1,1967 @@ +# Lexical Scanner # + +## The Task at Hand ## + +The main task of the lexical scanner is to convert a stream of +characters into a stream of tokens. The tokens are then typically +used by a parser to extract the syntactic structure. + +The stream of characters are assumed to be in memory identified by a +linked list of blocks, such as provided by the "[mdcode][]" literate +program extractor. A single token may never cross a block boundary. + +[mdcode]: mdcode.html + +###### includes + #include "mdcode.h" + +The text is assumed to be UTF-8 though some matching assumes the +ASCII subset. If the text provided does not conform to UTF-8 an error +will be reported and some number of bytes will be skipped. + +###### includes + #include + #include + #include + +Tokens are returned by successive calls to the main interface +function: `token_next()` which has a `state` structure to keep track +of where it is up to. Each token carries not just a numeric +identifier but also the code block, the line and character within that +block, and the actual start and length using the `struct text` from +"mdcode". 
+ +###### public types + struct token { + int num; + struct code_node *node; + struct text txt; + int line, col; + }; + struct token_state; + +###### private types + struct token_state { + ## state fields + }; + +###### exported functions + struct token token_next(struct token_state *state); + +###### main functions + struct token token_next(struct token_state *state) + { + ## token_next init + while (1) { + wint_t ch; + struct token tk; + + ## one token + } + } + +The `line` and `col` offsets are useful for reporting errors. +The `txt` provides the content when that is important. + +### Token types and configuration ## + +The scanner is not completely general, yet not completely specified. +There are a fixed set of token types, though particular tokens within +those types can be distinguish via configuration. + +Most token types may be explicitly ignored, as typically comments +would be. The exact consequence of ignoring each token type varies +from token to token. + +###### public types + struct token_config { + int ignored; // bit set of ignored tokens. + ## token config parameters + }; + +###### state fields + struct token_config *conf; + +###### token_next init + int ignored = state->conf->ignored; + + +The different tokens are numbers, words, marks, strings, comments, +newlines, EOF, and indents, each of which is examined in detail below. + +There are various cases where no token can be found in part of the +input. All of these will be reported as an `TK_error` token. + +It is possible to declare a number of strings which form distinct +tokens (rather than being grouped as e.g. 'word'). These are given +token numbers from `TK_reserved` upwards. + +###### public types + enum token_num { + TK_error, + ## token types + TK_reserved + }; + +### Numbers + +Numbers are the messiest tokens to parse, primarily because they can +contain characters that also have meaning outside of number and, +particularly, immediately after numbers. 
+ +The obvious example is the '`-`' sign. It can come inside a number for +a negative exponent, or after a number as a subtraction operator. To +be sure we have parsed as best as possible we need to only allow the +'`-`' inside a number if it is after an exponent character. This can be +`e` or `p` (for hex exponents), but `e` can also be a hexadecimal +digit, so we don't allow '`-`' after just any `e`. + +To make matters worse, our language designer has decided to experiment +with allowing commas to be used as the decimal indicator, and spaces +to be used to separate groups of digits in large numbers. Both of +these can reasonably be restricted to appear between two digits, so we +have to add that condition to our tests. + +So we cannot just treat numbers as starting with a digit and being +followed by some set of characters. We need more structure than that. + +So: + +- Numbers must start with a digit. +- If the first digit is zero, the next character must be a base + signifier (one of `xob`) or a decimal marker (`.` or `,`). + In the first case the first `p` or `P` may be followed by a sign. +- If the number doesn't start with `0` followed by one of `xob`, the + first `e` may be followed by a sign. +- Any digit or hex digit may be followed by a space or underscore + providing that the subsequence character is also a (hex) digit. + This rule will require an extra level of 'unget' to be + supported when handling characters. +- Otherwise any digits or ASCII letters are allowed. We do not at + this point check that the digits given are permitted by the base. + That will happen when the token is converted to a number. + +To allow easy configuration, the various non alphanumeric characters +are only permitted if they are listed in a configuration parameter. + +###### token config parameters + char *number_chars; + +Note that numbers may not start with a period, so `.75` is not a +number. This is not the norm, but is not unheard of. 
Excluding these +numbers simplifies the rule at very little cost. + +###### token types + TK_number, + +If TK_number is ignored, digits will result in an error unless they +are declared to be a start character for words. + +###### includes + + #include + +###### parse number + + if (iswdigit(ch) && !(ignored & (1<conf->number_chars, ch)) { + prev_special = 1; + continue; + } + /* non-number char */ + break; + } + /* We seem to have a "number" token */ + unget_char(state); + close_token(state, &tk); + tk.num = TK_number; + return tk; + } + +### Words +Words start with a "start" character followed by the longest +sequence of "continue" characters. The Unicode ID_START and +ID_CONTINUE sets are always permitted, but other ASCII characters +can be added to these sets. + +###### token config parameters + char *word_start; + char *word_cont; + +###### internal functions + static int is_word_start(wchar_t ch, struct token_config *conf) + { + return iswalpha(ch) || + strchr(conf->word_start, ch) != NULL || + u_hasBinaryProperty(ch, UCHAR_ID_START); + } + + static int is_word_continue(wchar_t ch, struct token_config *conf) + { + return iswalnum(ch) || + strchr(conf->word_cont, ch) != NULL || + u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE); + } + +Words can be either known or unknown. Known words are referred to as +"reserved words" and get a unique token number. Unknown words are +"identifiers" and are syntactically a single token. + +###### token types + TK_ident, + +A list of known words must be provided. This list is shared with the +"marks" which are described next. The list must be lexically sorted +and the length of the list must be given (`known_count`). +Tokens matching these known words are reported as the index of the +list added to `TK_reserved`. 
+ +###### token config parameters + char **words_marks; + int known_count; + +###### parse word + + if (is_word_start(ch, state->conf)) { + int n; + /* A word: identifier or reserved */ + do + ch = get_char(state); + while (is_word_continue(ch, state->conf)); + unget_char(state); + close_token(state, &tk); + tk.num = TK_ident; + if (ignored & (1<conf, tk.txt); + if (n >= 0) + tk.num = TK_reserved + n; + return tk; + } + +### Marks + +Marks are generally one or more punctuation marks joined together. It +would be nice to use the term "symbol" for these, but that causes +confusion in a subsequent discussion of the grammar, which has terminal +symbols and non-terminal symbols which are conceptually quite +different. So strings of punctuation characters will be marks. + +A "mark" consists of ASCII characters that are not white space, are not +"start" characters for words, and are not digits. +These will collectively be called mark characters. + +###### internal functions + static int is_mark(wchar_t ch, struct token_config *conf) + { + return ch > ' ' && + ch < 0x7f && + !iswalnum(ch) && + strchr(conf->word_start, ch) == NULL; + } + +As with words, there can be known and unknown marks, though the rules +are slightly different. + +Two marks do not need to be separated by a non-mark characters. This +is different from words which do need to be separated by at least one +non-continue character. + +The scanner will normally prefer longer sequences of mark characters, +but will more strongly prefer known marks over unknown marks. So if +it finds a known mark where adding one more character does not result +in a known mark, it will return that first known mark. + +If no known mark is found we will test against strings and comments +below before giving up and assuming an unknown mark. +If `TK_mark` is ignored, then unknown marks as returned as an error. + +###### token types + TK_mark, + +Known marks are included in the same list as the list of known words. 
+ +###### parse mark + tk.num = TK_error; + while (is_mark(ch, state->conf)) { + int n; + close_token(state, &tk); + n = find_known(state->conf, tk.txt); + if (n >= 0) + tk.num = TK_reserved + n; + else if (tk.num != TK_error) { + /* found a longest-known-mark */ + unget_char(state); + close_token(state, &tk); + return tk; + } + ch = get_char(state); + } + unget_char(state); + if (tk.num != TK_error) + return tk; + +###### unknown mark + if (tk.txt.len) { + if (ignored & (1<= 1 && txt.txt[0] == '#') || + (txt.len >= 2 && txt.txt[0] == '/' && + txt.txt[1] == '/'); + } + + static int is_block_comment(struct text txt) + { + return txt.len >= 2 && txt.txt[0] == '/' && + txt.txt[1] == '*'; + } + +#### Single line comments + +A single-line comment continues up to, but not including the newline. + +###### parse comment + + if (is_line_comment(tk.txt)) { + while (!is_newline(ch)) + ch = get_char(state); + unget_char(state); + close_token(state, &tk); + tk.num = TK_line_comment; + if (ignored & (1 << TK_line_comment)) + continue; + return tk; + } + +#### Block comments + +The token text collected so far could exceed the comment, so we need +to reset it first. + +If we find an embedded `/*` we reset to just before the '/' and report +an error. That way the next thing to be parsed will be the rest of +the comment. This requires a double unget, so we need to save/restore +the unget state (explained later). + +###### parse comment + + if (is_block_comment(tk.txt)) { + wchar_t prev; + int newlines = 0; + reset_token(state, &tk); + get_char(state); + get_char(state); + save_unget_state(state); + ch = get_char(state); + prev = 0; + while (!at_eon(state) && + (prev != '/' || ch != '*') && + (prev != '*' || ch != '/')) { + if (is_newline(ch)) + newlines = 1; + prev = ch; + save_unget_state(state); + ch = get_char(state); + } + close_token(state, &tk); + if (at_eon(state)) { + tk.num = TK_error; + return tk; + } + if (prev == '/') { + /* embedded. Need to unget twice! 
*/ + restore_unget_state(state); + unget_char(state); + tk.num = TK_error; + return tk; + } + tk.num = TK_block_comment; + if (newlines && !(ignored & (1<node->next && + state->node->next->indent > state->node->indent) + state->col = state->node->next->indent; + else + state->col = state->node->indent; + } else + unget_char(state); + state->delayed_lines = newlines; + state->undent_next = was_son; + state->check_indent = 1; + continue; + } + + +###### delayed tokens + + if (state->check_indent || state->delayed_lines) { + if (state->col < state->indent_sizes[state->indent_level]) { + if (!state->undent_next && + !(ignored & (1<undent_next = 1; + tk.num = TK_newline; + return tk; + } + state->indent_level -= 1; + state->undent_next = 0; + tk.num = TK_undent; + return tk; + } + if (state->col > state->indent_sizes[state->indent_level] && + state->indent_level < sizeof(state->indent_sizes)-1) { + state->indent_level += 1; + state->indent_sizes[state->indent_level] = state->col; + state->delayed_lines -= 1; + tk.num = TK_indent; + return tk; + } + state->check_indent = 0; + if (state->delayed_lines && !(ignored & (1<delayed_lines -= 1; + return tk; + } + state->delayed_lines = 0; + continue; + } + +### End of File + +After the last newline in the file has been processed, a special +end-of-file token will be returned. any further attempts to get more +tokens will continue to return the same end-of-file token. + +###### token types + TK_eof, + + +###### white space + if (ch == WEOF) { + tk.num = TK_eof; + return tk; + } + +### Unknown Marks, or errors. + +We have now handled all the possible known mark-like tokens. +If the token we have is not empty and `TK_mark` is allowed, +we have an unknown mark, otherwise this must be an error. 
+ +###### unknown mark + /* one unknown character */ + close_token(state, &tk); + tk.num = TK_error; + return tk; + +## Tools For The Task + +You may have noticed that are few gaps we left in the above - +functions used without first defining them. Doing so above would have +broken the flow. + +### Character by character + +As we walk through the various `code_node`s we need to process whole +Unicode codepoints, and keep track of which line and column we are on. +We will assume for now that any printing character uses one column, +though that is not true in general. + +As the text in a `code_node` may include an indent that identifies it as +being code, we need to be careful to strip that. The `code_node` has +a flag that tells us whether or not we need to strip. + +###### includes + #include + +###### state fields + struct code_node *node; + int offset; + int line; + int col; + +###### internal functions + + static void do_strip(struct token_state *state) + { + if (state->node->needs_strip) { + int n = 4; + while (n && state->node->code.txt[state->offset] == ' ') { + state->offset += 1; + n -= 1; + } + while (n == 4 && state->node->code.txt[0] == '\t') { + state->offset += 1; + n -= 4; + } + } + } + + static wint_t get_char(struct token_state *state) + { + wchar_t next; + size_t n; + mbstate_t mbstate; + + if (state->node == NULL) + return WEOF; + if (state->node->code.len <= state->offset) { + do + state->node = state->node->next; + while (state->node && state->node->code.txt == NULL); + state->offset = 0; + if (state->node == NULL) + return WEOF; + do_strip(state); + state->line = state->node->line_no; + state->col = state->node->indent; + } + + ## before get_char + + memset(&mbstate, 0, sizeof(mbstate)); + + n = mbrtowc(&next, state->node->code.txt + state->offset, + state->node->code.len - state->offset, + &mbstate); + if (n == -2 || n == 0) { + /* Not enough bytes - not really possible */ + next = '\n'; + state->offset = state->node->code.len; + } else if (n 
== -1) { + /* error */ + state->offset += 1; + next = 0x7f; // an illegal character + } else + state->offset += n; + + if (next >= ' ') { + state->col += 1; + } else if (is_newline(next)) { + state->line += 1; + state->col = state->node->indent; + do_strip(state); + } else if (next == '\t') { + state->col = indent_tab(state->col); + } + return next; + } + +We will sometimes want to "unget" the last character as it needs to be +considered again as part of the next token. So we need to store a +'previous' version of all metadata. + +###### state fields + int prev_offset; + int prev_line; + int prev_col; + +###### before get_char + state->prev_offset = state->offset; + state->prev_line = state->line; + state->prev_col = state->col; + +###### internal functions + + static void unget_char(struct token_state *state) + { + if (state->node) { + state->offset = state->prev_offset; + state->line = state->prev_line; + state->col = state->prev_col; + } + } + +We occasionally need a double-unget, particularly for numbers and +block comments. We don't impose this cost on all scanning, but +require those code sections that need it to call `save_unget_state` +before each `get_char`, and then `restore_unget_state` when a +double-unget is needed. + +###### state fields + int prev_offset2; + int prev_line2; + int prev_col2; + +###### internal functions + static void save_unget_state(struct token_state *state) + { + state->prev_offset2 = state->prev_offset; + state->prev_line2 = state->prev_line; + state->prev_col2 = state->prev_col; + } + + static void restore_unget_state(struct token_state *state) + { + state->prev_offset = state->prev_offset2; + state->prev_line = state->prev_line2; + state->prev_col = state->prev_col2; + } + +At the start of a token we don't want to be at the end of a code block +if we can help it. To avoid this possibility, we 'get' and 'unget' a +single character. 
This will move into the next non-empty code block +and leave the current pointer at the start of it. + +This has to happen _after_ dealing with delayed tokens as some of them +must appear in the previous node. When we do this, we need to reset +the data in the token. + +###### delayed tokens + if (at_eon(state)) { + get_char(state); + unget_char(state); + tk.node = state->node; + if (state->node) + tk.txt.txt = state->node->code.txt + state->offset; + tk.line = state->line; + tk.col = state->col; + tk.txt.len = 0; + } + +### Managing tokens + +The current token is initialized to line up with the first character +that we 'get' for each token. When we have, or might have, a full +token we can call `close_token` to set the `len` of the token +appropriately. This can safely be called multiple times. + +Finally we occasionally (for single-line strings and block comments) +need to reset to the beginning of the current token as we might have +parsed too much already. For that there is `reset_token`. + +###### one token + tk.node = state->node; + if (state->node) + tk.txt.txt = state->node->code.txt + state->offset; + tk.line = state->line; + tk.col = state->col; + tk.txt.len = 0; + +###### internal functions + + static void close_token(struct token_state *state, + struct token *tk) + { + tk->txt.len = (state->node->code.txt + state->offset) + - tk->txt.txt; + } + + static void reset_token(struct token_state *state, struct token *tok) + { + state->prev_line = tok->line; + state->prev_col = tok->col; + state->prev_offset = tok->txt.txt - state->node->code.txt; + unget_char(state); + tok->txt.len = 0; + } + + +Tokens make not cross into the next `code_node`, and some tokens can +include the newline at the and of a `code_node`, we must be able to +easily check if we have reached the end. Equally we need to know if +we are at the start of a node, as white space is treated a little +differently there. 
+ +###### internal functions + + static int at_son(struct token_state *state) + { + return state->offset == 0; + } + + static int at_eon(struct token_state *state) + { + // at end-of-node ?? + return state->node == NULL || + state->offset >= state->node->code.len; + } + +### Find a known word + +As the known-word list is sorted we can use a simple binary search. +Following the pattern established in "mdcode", we will use a `struct +text` with start and length to represent the code fragment we are +searching for. + +###### internal functions + static int find_known(struct token_config *conf, struct text txt) + { + int lo = 0; + int hi = conf->known_count; + + while (lo + 1 < hi) { + int mid = (lo + hi) / 2; + int cmp = strncmp(conf->words_marks[mid], + txt.txt, txt.len); + if (cmp == 0 && conf->words_marks[mid][txt.len]) + cmp = 1; + if (cmp <= 0) + lo = mid; + else + hi = mid; + } + if (strncmp(conf->words_marks[lo], + txt.txt, txt.len) == 0 + && conf->words_marks[lo][txt.len] == 0) + return lo; + else + return -1; + } + +### Bringing it all together + +Now we have all the bits there is just one section missing: combining +all the token parsing code into one block. + +The handling of delayed tokens (newlines, indents, undents) must come +first before we try getting another character. + +Then we parse all the test, making sure that we check for known marks +before strings and comments, but unknown marks after strings and comments. + +This block of code will either return a token, or will choose to +ignore one, in which case it will `continue` around to the top of the +loop. + +###### one token + ## delayed tokens + + ch = get_char(state); + + ## white space + ## parse number + ## parse word + ## parse mark + ## parse string + ## parse comment + ## unknown mark + +### Start and stop + +As well as getting tokens, we need to be able to create the +`token_state` to start with, and discard it later. 
+ +###### includes + #include + +###### main functions + struct token_state *token_open(struct code_node *code, struct + token_config *conf) + { + struct token_state *state = malloc(sizeof(*state)); + memset(state, 0, sizeof(*state)); + state->node = code; + state->line = code->line_no; + state->conf = conf; + return state; + } + void token_close(struct token_state *state) + { + free(state); + } + +###### exported functions + struct token_state *token_open(struct code_node *code, struct + token_config *conf); + void token_close(struct token_state *state); + +### Trace tokens + +Getting tokens is the main thing but it is also useful to be able to +print out token information, particularly for tracing and testing. + +Known tokens are printed verbatim. Other tokens are printed as +`type(content)` where content is truncated to a given number of characters. + +The function for printing a truncated string (`text_dump`) is also exported +so that it can be used to tracing processed strings too. 
+ +###### includes + #include + +###### exported functions + void token_trace(FILE *f, struct token tok, int max); + void text_dump(FILE *f, struct text t, int max); + +###### main functions + + void text_dump(FILE *f, struct text txt, int max) + { + int i; + if (txt.len > max) + max -= 2; + else + max = txt.len; + for (i = 0; i < max; i++) { + char c = txt.txt[i]; + if (c < ' ' || c > '~') + fprintf(f, "\\x%02x", c & 0xff); + else if (c == '\\') + fprintf(f, "\\\\"); + else + fprintf(f, "%c", c); + } + if (i < txt.len) + fprintf(f, ".."); + } + + void token_trace(FILE *f, struct token tok, int max) + { + static char *types[] = { + [TK_ident] = "ident", + [TK_mark] = "mark", + [TK_number] = "number", + [TK_string] = "string", + [TK_multi_string] = "mstring", + [TK_line_comment] = "lcomment", + [TK_block_comment] = "bcomment", + [TK_indent] = "indent", + [TK_undent] = "undent", + [TK_newline] = "newline", + [TK_eof] = "eof", + [TK_error] = "ERROR", + }; + + switch (tok.num) { + default: /* known word or mark */ + fprintf(f, "%.*s", tok.txt.len, tok.txt.txt); + break; + case TK_indent: + case TK_undent: + case TK_newline: + case TK_eof: + /* No token text included */ + fprintf(f, "%s()", types[tok.num]); + break; + case TK_ident: + case TK_mark: + case TK_number: + case TK_string: + case TK_multi_string: + case TK_line_comment: + case TK_block_comment: + case TK_error: + fprintf(f, "%s(", types[tok.num]); + text_dump(f, tok.txt, max); + fprintf(f, ")"); + break; + } + } + +### And there we have it + +We now have all the library functions defined for reading and printing +tokens. Now we just need C files to store them, and a mk file to make them. 
+ +###### File: scanner.h + ## public types + ## exported functions + +###### File: libscanner.c + ## includes + #include "scanner.h" + ## private types + ## internal functions + ## main functions + +###### File: scanner.mk + + CFLAGS += -Wall -g + all :: + scanner.mk scanner.h libscanner.c : scanner.mdc + ./md2c scanner.mdc + all :: libscanner.o + libscanner.o : libscanner.c + $(CC) $(CFLAGS) -c libscanner.c + +## Processing numbers + +Converting a `TK_number` token to a numerical value is a slightly +higher level task than lexical analysis, and slightly lower than +grammar parsing, so put it here - as an index if you like. + +Importantly it will be used by the same testing rig that is used for +testing the token scanner. + +The numeric value that we will convert all numbers into is the `mpq_t` +from the GNU high precision number library "libgmp". + +###### number includes + #include + #include "mdcode.h" + +Firstly we need to be able to parse a string of digits in a given base +and possibly with a decimal marker. We store this in an `mpz_t` +integer and report the number of digits after the decimal mark. + +On error we return zero and ensure that the 'mpz_t' has been freed, or +had never been initialised. + +###### number functions + + static int parse_digits(mpz_t num, struct text tok, int base, + int *placesp) + { + /* Accept digits up to 'base', ignore '_' and + * ' ' if they appear between two legal digits, + * and if `placesp` is not NULL, allow a single + * '.' or ',' and report the number of digits + * beyond there. + * Return number of characters processed (p), + * or 0 if something illegal was found. + */ + int p; + int decimal = -1; // digits after marker + enum {Digit, Space, Other} prev = Other; + int digits = 0; + + for (p = 0; p < tok.len; p++) { + int dig; + char c = tok.txt[p]; + + if (c == '_' || c == ' ') { + if (prev != Digit) + goto bad; + prev = Space; + continue; + } + if (c == '.' 
|| c == ',') { + if (prev != Digit) + goto bad; + if (!placesp || decimal >= 0) + return p-1; + decimal = 0; + prev = Other; + continue; + } + if (isdigit(c)) + dig = c - '0'; + else if (isupper(c)) + dig = 10 + c - 'A'; + else if (islower(c)) + dig = 10 + c - 'a'; + else + dig = base; + if (dig >= base) { + if (prev == Space) + p--; + break; + } + prev = Digit; + if (digits) + mpz_mul_ui(num, num, base); + else + mpz_init(num); + digits += 1; + mpz_add_ui(num, num, dig); + if (decimal >= 0) + decimal++; + } + if (digits == 0) + return 0; + if (placesp) { + if (decimal >= 0) + *placesp = decimal; + else + *placesp = 0; + } + return p; + bad: + if (digits) + mpz_clear(num); + return 0; + } + +###### number includes + #include + +To parse a full number we need to consider the optional base, the +mantissa, and the optional exponent. We will treat these one at a +time. + +The base is indicated by a letter after a leading zero, which must be +followed by a base letter or a period. The base also determines the +character which will mark an exponent. + +###### number vars + int base = 10; + char expc = 'e'; + +###### parse base + + if (tok.txt[0] == '0' && tok.len > 1) { + int skip = 0; + switch(tok.txt[1]) { + case 'x': + case 'X': + base = 16; + skip = 2; + expc = 'p'; + break; + case 'o': + case 'O': + base = 8; + skip = 2; + expc = 'p'; + break; + case 'b': + case 'B': + base = 2; + skip = 2; + expc = 'p'; + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '_': + case ' ': + // another digit is not permitted + // after a zero. + return 0; + default: + // must be decimal marker or trailing + // letter, which are OK; + break; + } + tok.txt += skip; + tok.len -= skip; + } + +After the base is the mantissa, which may contain a decimal mark, so +we need to record the number of places. We won't impose the number of +places until we have the exponent as well. 
+ +###### number vars + int places =0; + mpz_t mant; + int d; + +###### parse mantissa + + d = parse_digits(mant, tok, base, &places); + if (d == 0) + return 0; + tok.txt += d; + tok.len -= d; + mpq_init(num); + mpq_set_z(num, mant); + mpz_clear(mant); + +After the mantissa number may come an exponent which may be positive +or negative. We assume at this point that we have seen the exponent +character `expc`. + +###### number vars + long lexp = 0; + mpz_t exp; + int esign = 1; + +###### parse exponent + if (tok.len > 1) { + if (tok.txt[0] == '+') { + tok.txt++; + tok.len--; + } else if (tok.txt[0] == '-') { + esign = -1; + tok.txt++; + tok.len--; + } + } + d = parse_digits(exp, tok, 10, NULL); + if (d == 0) { + mpq_clear(num); + return 0; + } + if (!mpz_fits_slong_p(exp)) { + mpq_clear(num); + mpz_clear(exp); + return 0; + } + lexp = mpz_get_si(exp) * esign; + mpz_clear(exp); + tok.txt += d; + tok.len -= d; + + +Now that we have the mantissa and the exponent we can multiply them +together, also allowing for the number of digits after the decimal +mark. + +For base 10, we simply subtract the decimal places from the exponent. +For the other bases, as the exponent is alway based on 2, even for +octal and hex, we need a bit more detail. +We then recover the sign from the exponent, as division is quite +different from multiplication. + +###### calc exponent + switch (base) { + case 10: + case 2: + lexp -= places; + break; + case 16: + lexp -= 4*places; + break; + case 8: + lexp -= 3*places; + break; + } + if (lexp < 0) { + lexp = -lexp; + esign = -1; + } else + esign = 1; + +Imposing the exponent on the number is also very different for base 10 +than for the others. For the binary shift `gmp` provides a simple +function. For base 10 we use something like Russian Peasant +Multiplication. 
+ +###### calc exponent + if (expc == 'e') { + mpq_t tens; + mpq_init(tens); + mpq_set_ui(tens, 10, 1); + while (1) { + if (lexp & 1) { + if (esign > 1) + mpq_mul(num, num, tens); + else + mpq_div(num, num, tens); + } + lexp >>= 1; + if (lexp == 0) + break; + mpq_mul(tens, tens, tens); + } + mpq_clear(tens); + } else { + if (esign > 0) + mpq_mul_2exp(num, num, lexp); + else + mpq_div_2exp(num, num, lexp); + } + +Now we are ready to parse a number: the base, mantissa, and exponent. +If all goes well we check for the possible trailing letters and +return. Return value is 1 for success and 0 for failure. + + +###### number functions + int number_parse(mpq_t num, char tail[3], struct text tok) + { + ## number vars + int i; + + ## parse base + ## parse mantissa + if (tok.len > 1 && (tok.txt[0] == expc || + tok.txt[0] == toupper(expc))) { + tok.txt++; + tok.len--; + ## parse exponent + } + ## calc exponent + + for (i = 0; i < 2; i++) { + if (tok.len <= i) + break; + if (!isalpha(tok.txt[i])) + goto err; + tail[i] = tok.txt[i]; + } + tail[i] = 0; + if (i == tok.len) + return 1; + err: + mpq_clear(num); + return 0; + } + +Number parsing goes in `libnumber.c` + +###### File: libnumber.c + + #include + #include + + ## number includes + ## number functions + +###### File: number.h + int number_parse(mpq_t num, char tail[3], struct text tok); + +###### File: scanner.mk + all :: libnumber.o + libnumber.o : libnumber.c + $(CC) $(CFLAGS) -c libnumber.c + +## Processing strings + +Both `TK_string` and `TK_multi_string` require post-processing which +can be one of two types: literal or with escapes processed. +Even literal processing is non-trivial as the file may contain indents +which need to be stripped. + +Errors can only occur when processing escapes. Any unrecognised +character following the escape character will cause an error. 
+ +Processing escapes and striping indents can only make the string +shorter, not longer, so we allocate a buffer which is the same size as +the string and process into that. + +To request escape processing, we pass the character we want to use for +quoting, usually '`\`'. To avoid escape processing we pass a zero. + +###### string main + int string_parse(struct token *tok, char escape, + struct text *str, char tail[3]) + { + ## string vars + struct text t = tok->txt; + + str->txt = NULL; + ## strip tail + if (tok->num == TK_string) { + ## strip single + } else { + ## strip multi + } + str->txt = malloc(t.len); + str->len = 0; + + ## process string + return 1; + err: + free(str->txt); + str->txt = NULL; + return 0; + } + +### strip tail + +The tail of the string can be 0, 1, or 2 letters + + i = t.len; + if (i >= 0 && isalpha(t.txt[i-1])) + i -= 1; + if (i >= 0 && isalpha(t.txt[i-1])) + i -= 1; + strncpy(tail, t.txt+i, t.len-i); + tail[t.len-i] = 0; + t.len = i; + +###### string vars + int i; + +### strip single + +Stripping the quote of a single-line string is trivial. +The only part that is at all interesting is that quote character must +be remembered. + + quote = t.txt[0]; + if (t.txt[t.len-1] != quote) + goto err; + t.txt += 1; + t.len -= 2; + +###### string vars + char quote; + +### strip multi + +For a multi-line string we have a little more work to do. We need to +remove 3 quotes, not 1, and need to count the indent of the close +quote as it will need to be stripped from all lines. 
+
	quote = t.txt[0];
	/* Shortest possible multi-string is quote*3, newline, quote*3:
	 * seven bytes.
	 */
	if (t.len < 7 ||
	    t.txt[1] != quote || t.txt[2] != quote ||
	    !is_newline(t.txt[3]))
		goto err;
	t.txt += 4;
	t.len -= 4;
	i = t.len;
	if (i <= 0 || t.txt[i-1] != quote)
		goto err;
	i -= 1;
	if (i <= 0 || t.txt[i-1] != quote)
		goto err;
	i -= 1;
	if (i <= 0 || t.txt[i-1] != quote)
		goto err;
	i -= 1;
	t.len = i;
	/* The close quote determines the indent: walk back to the
	 * start of its line, then measure the spaces and tabs.
	 */
	while (i > 0 && !is_newline(t.txt[i-1]))
		i--;
	indent = 0;
	while (i < t.len) {
		if (t.txt[i] == ' ')
			indent += 1;
		if (t.txt[i] == '\t')
			indent = indent_tab(indent);
		i++;
	}

###### string vars
	int indent = 0;

###### process string

Now we just take one byte at a time.  Non-ASCII unicode won't look
like anything we are interested in so it will just be copied byte by
byte.

	cp = str->txt;
	at_sol = 1;
	for (i = 0; i < t.len; i++) {
		char c;
		if (at_sol) {
			at_sol = 0;
			## strip indent
			if (i >= t.len)
				break;
		}
		c = t.txt[i];
		if (c != escape) {
			*cp = c;
			cp += 1;
			if (is_newline(c))
				at_sol = 1;
		} else if (i+1 >= t.len) {
			// escape and end of string
			goto err;
		} else {
			i += 1;
			c = t.txt[i];
			## parse escape
		}
	}
	str->len = cp - str->txt;

###### string vars
	char *cp;
	int at_sol;

###### strip indent

Every time we find a start of line, we strip spaces and tabs until the
required indent is found.
+
	int skipped = 0;
	while (i < t.len && skipped < indent) {
		c = t.txt[i];
		if (c == ' ')
			skipped += 1;
		else if (c == '\t')
			skipped = indent_tab(skipped);
		else
			break;
		i += 1;
	}

###### parse escape
	switch (c) {
	case 'n':
		*cp++ = '\n'; break;
	case 'r':
		*cp++ = '\r'; break;
	case 't':
		*cp++ = '\t'; break;
	case 'b':
		*cp++ = '\b'; break;
	case 'q':
		*cp++ = quote; break;
	case 'f':
		*cp++ = '\f'; break;
	case 'v':
		*cp++ = '\v'; break;
	case 'a':
		*cp++ = '\a'; break;
	case '0':
	case '1':
	case '2':
	case '3':
		// 3 digit octal number
		if (i+2 >= t.len)
			goto err;
		if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
		    t.txt[i+2] < '0' || t.txt[i+2] > '7')
			goto err;
		n = (t.txt[i  ]-'0') * 64 +
		    (t.txt[i+1]-'0') *  8 +
		    (t.txt[i+2]-'0') *  1;
		*cp++ = n;
		i += 2;
		break;
	case 'x':
		// 2 hex digits
		n = take_hex(2, t.txt+i+1, t.len-i-1);
		if (n < 0)
			goto err;
		*cp++ = n;
		i += 2;
		break;
	case 'u':
	case 'U':
		// 4 or 8 hex digits for unicode
		n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
		if (n < 0)
			goto err;
		memset(&pstate, 0, sizeof(pstate));
		n = wcrtomb(cp, n, &pstate);
		if (n <= 0)
			goto err;
		cp += n;
		i += c == 'u' ? 4 : 8;
		break;
	default:
		if (c == escape)
			*cp++ = c;
		else if (is_newline(c))
			// escaped newline is swallowed: a continuation line
			at_sol = 1;
		else
			goto err;
	}

###### string vars
	long n;
	mbstate_t pstate;

For `\x` `\u` and `\U` we need to collect a specific number of
hexadecimal digits

###### string functions

	/* Collect exactly "digits" hex digits from "cp" (length "l")
	 * and return the value, or -1 if there are too few digits or
	 * a non-hex character is found.
	 */
	static long take_hex(int digits, char *cp, int l)
	{
		long n = 0;
		if (l < digits)
			return -1;
		while (digits) {
			char c = *cp;
			int d;
			if (!isxdigit(c))
				return -1;
			if (isdigit(c))
				d = c - '0';
			else if (isupper(c))
				d = 10 + c - 'A';
			else
				d = 10 + c - 'a';
			n = n * 16 + d;
			digits--;
			cp++;
		}
		return n;
	}

String parsing goes in `libstring.c`

###### File: libstring.c

	#include <unistd.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <string.h>
	#include <ctype.h>
	#include <wchar.h>
	#include "mdcode.h"
	#include "scanner.h"
	## string functions
	## string main

###### File: string.h
	int string_parse(struct token *tok, char escape,
	                 struct text *str, char tail[3]);

###### File: scanner.mk
	all :: libstring.o
	libstring.o : libstring.c
		$(CC) $(CFLAGS) -c libstring.c


## Testing

As "untested code is buggy code" we need a program to easily test
the scanner library.  This will simply parse a given file and report
the tokens one per line.
+
+###### File: scanner.c

	#include <unistd.h>
	#include <stdlib.h>
	#include <fcntl.h>
	#include <errno.h>
	#include <sys/mman.h>
	#include <string.h>
	#include <stdio.h>
	#include <gmp.h>
	#include <locale.h>
	#include "mdcode.h"
	#include "scanner.h"
	#include "number.h"
	#include "string.h"

	static int errs;
	static void pr_err(char *msg)
	{
		errs++;
		fprintf(stderr, "%s\n", msg);
	}

	int main(int argc, char *argv[])
	{
		int fd;
		int len;
		char *file;
		struct token_state *state;
		char *known[] = {
			"==",
			"else",
			"if",
			"then",
			"while",
			"{",
			"}",
		};
		struct token_config conf = {
			.word_start = "_$",
			.word_cont = "",
			.words_marks = known,
			.number_chars = "., _+-",
			.known_count = sizeof(known)/sizeof(known[0]),
			/* neither comment type is ignored here; set a
			 * bit to suppress a token type.
			 */
			.ignored = (0 << TK_line_comment)
			          |(0 << TK_block_comment),
		};
		struct section *table, *s, *prev;
		setlocale(LC_ALL,"");
		if (argc != 2) {
			fprintf(stderr, "Usage: scanner file\n");
			exit(2);
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			fprintf(stderr, "scanner: cannot open %s: %s\n",
				argv[1], strerror(errno));
			exit(1);
		}
		len = lseek(fd, 0, SEEK_END);
		if (len < 0) {
			fprintf(stderr, "scanner: cannot seek %s: %s\n",
				argv[1], strerror(errno));
			exit(1);
		}
		file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
		if (file == MAP_FAILED) {
			fprintf(stderr, "scanner: cannot map %s: %s\n",
				argv[1], strerror(errno));
			exit(1);
		}
		table = code_extract(file, file+len, pr_err);

		for (s = table; s;
			(code_free(s->code), prev = s, s = s->next, free(prev))) {
			printf("Tokenizing: %.*s\n", s->section.len,
			       s->section.txt);
			state = token_open(s->code, &conf);
			while(1) {
				struct token tk = token_next(state);
				printf("%d:%d ", tk.line, tk.col);
				token_trace(stdout, tk, 20);
				if (tk.num == TK_number) {
					mpq_t num;
					char tail[3];
					if (number_parse(num, tail, tk.txt)) {
						printf(" %s ", tail);
						mpq_out_str(stdout, 10, num);
						mpq_clear(num);
					} else
						printf(" BAD NUMBER");
				}
				if (tk.num == TK_string ||
				    tk.num == TK_multi_string) {
					char esc = '\\';
					struct text str;
					char tail[3];
					/* back-quoted strings are reported
					 * literally, with no escape processing.
					 */
					if (tk.txt.txt[0] == '`')
						esc = 0;
					if (string_parse(&tk, esc,
							 &str, tail)) {
						printf(" %s ", tail);
						text_dump(stdout, str, 20);
						free(str.txt);
					} else
						printf(" BAD STRING");
				}
				printf("\n");
				if (tk.num == TK_error)
					errs = 1;
				if (tk.num == TK_eof)
					break;
			}
		}
		exit(!!errs);
	}
###### File: scanner.mk
	scanner.c : scanner.mdc
		./md2c scanner.mdc
	all :: scanner
	scanner : scanner.o libscanner.o libmdcode.o libnumber.o libstring.o
		$(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
			libmdcode.o libnumber.o libstring.o -licuuc -lgmp
	scanner.o : scanner.c
		$(CC) $(CFLAGS) -c scanner.c
-- 
2.43.0