--- /dev/null
+# Lexical Scanner #
+
+## The Task at Hand ##
+
+The main task of the lexical scanner is to convert a stream of
+characters into a stream of tokens. The tokens are then typically
+used by a parser to extract the syntactic structure.
+
+The stream of characters is assumed to be in memory identified by a
+linked list of blocks, such as provided by the "[mdcode][]" literate
+program extractor. A single token may never cross a block boundary.
+
+[mdcode]: mdcode.html
+
+###### includes
+ #include "mdcode.h"
+
+The text is assumed to be UTF-8 though some matching assumes the
+ASCII subset. If the text provided does not conform to UTF-8 an error
+will be reported and some number of bytes will be skipped.
+
+###### includes
+ #include <wchar.h>
+ #include <wctype.h>
+ #include <unicode/uchar.h>
+
+Tokens are returned by successive calls to the main interface
+function: `token_next()` which has a `state` structure to keep track
+of where it is up to. Each token carries not just a numeric
+identifier but also the code block, the line and character within that
+block, and the actual start and length using the `struct text` from
+"mdcode".
+
+###### public types
+ struct token {
+ int num;
+ struct code_node *node;
+ struct text txt;
+ int line, col;
+ };
+ struct token_state;
+
+###### private types
+ struct token_state {
+ ## state fields
+ };
+
+###### exported functions
+ struct token token_next(struct token_state *state);
+
+###### main functions
+ struct token token_next(struct token_state *state)
+ {
+ ## token_next init
+ while (1) {
+ wint_t ch;
+ struct token tk;
+
+ ## one token
+ }
+ }
+
+The `line` and `col` offsets are useful for reporting errors.
+The `txt` provides the content when that is important.
+
+### Token types and configuration ###
+
+The scanner is not completely general, yet not completely specified.
+There are a fixed set of token types, though particular tokens within
+those types can be distinguished via configuration.
+
+Most token types may be explicitly ignored, as typically comments
+would be. The exact consequence of ignoring each token type varies
+from token to token.
+
+###### public types
+ struct token_config {
+ int ignored; // bit set of ignored tokens.
+ ## token config parameters
+ };
+
+###### state fields
+ struct token_config *conf;
+
+###### token_next init
+ int ignored = state->conf->ignored;
+
+
+The different tokens are numbers, words, marks, strings, comments,
+newlines, EOF, and indents, each of which is examined in detail below.
+
+There are various cases where no token can be found in part of the
+input. All of these will be reported as a `TK_error` token.
+
+It is possible to declare a number of strings which form distinct
+tokens (rather than being grouped as e.g. 'word'). These are given
+token numbers from `TK_reserved` upwards.
+
+###### public types
+ enum token_num {
+ TK_error,
+ ## token types
+ TK_reserved
+ };
+
+### Numbers
+
+Numbers are the messiest tokens to parse, primarily because they can
+contain characters that also have meaning outside of number and,
+particularly, immediately after numbers.
+
+The obvious example is the '`-`' sign. It can come inside a number for
+a negative exponent, or after a number as a subtraction operator. To
+be sure we have parsed as best as possible we need to only allow the
+'`-`' inside a number if it is after an exponent character. This can be
+`e` or `p` (for hex exponents), but `e` can also be a hexadecimal
+digit, so we don't allow '`-`' after just any `e`.
+
+To make matters worse, our language designer has decided to experiment
+with allowing commas to be used as the decimal indicator, and spaces
+to be used to separate groups of digits in large numbers. Both of
+these can reasonably be restricted to appear between two digits, so we
+have to add that condition to our tests.
+
+So we cannot just treat numbers as starting with a digit and being
+followed by some set of characters. We need more structure than that.
+
+So:
+
+- Numbers must start with a digit.
+- If the first digit is zero, the next character must be a base
+ signifier (one of `xob`) or a decimal marker (`.` or `,`).
+ In the first case the first `p` or `P` may be followed by a sign.
+- If the number doesn't start with `0` followed by one of `xob`, the
+ first `e` may be followed by a sign.
+- Any digit or hex digit may be followed by a space or underscore
+  providing that the subsequent character is also a (hex) digit.
+ This rule will require an extra level of 'unget' to be
+ supported when handling characters.
+- Otherwise any digits or ASCII letters are allowed. We do not at
+ this point check that the digits given are permitted by the base.
+ That will happen when the token is converted to a number.
+
+To allow easy configuration, the various non alphanumeric characters
+are only permitted if they are listed in a configuration parameter.
+
+###### token config parameters
+ char *number_chars;
+
+Note that numbers may not start with a period, so `.75` is not a
+number. This is not the norm, but is not unheard of. Excluding these
+numbers simplifies the rule at very little cost.
+
+###### token types
+ TK_number,
+
+If TK_number is ignored, digits will result in an error unless they
+are declared to be a start character for words.
+
+###### includes
+
+ #include <string.h>
+
+###### parse number
+
+ if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
+ int prev_special = 0;
+ int expect_p = 0;
+ int decimal_mark = 0;
+ if (ch == '0') {
+ wchar_t ch2 = get_char(state);
+ if (strchr("xobXOB", ch2) != NULL)
+ expect_p = 1;
+ unget_char(state);
+ }
+ while (1) {
+ int sign_ok = 0;
+ switch(expect_p) {
+ case 0:
+ if (ch == 'e')
+ sign_ok = 1;
+ break;
+ case 1:
+ if (ch == 'p')
+ sign_ok = 1;
+ break;
+ }
+ save_unget_state(state);
+ ch = get_char(state);
+ if (iswalnum(ch)) {
+ prev_special = 0;
+ continue;
+ }
+ if (ch == '+' || ch == '-') {
+ if (!sign_ok)
+ break;
+ expect_p = -1;
+ }
+ if (ch == '.' || ch == ',') {
+ if (decimal_mark)
+ break;
+ decimal_mark = 1;
+ }
+ if (prev_special) {
+ /* Don't allow that special char,
+ * need two 'ungets'
+ */
+ restore_unget_state(state);
+ break;
+ }
+ if (strchr(state->conf->number_chars, ch)) {
+ prev_special = 1;
+ continue;
+ }
+ /* non-number char */
+ break;
+ }
+ /* We seem to have a "number" token */
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_number;
+ return tk;
+ }
+
+### Words
+Words start with a "start" character followed by the longest
+sequence of "continue" characters. The Unicode ID_START and
+ID_CONTINUE sets are always permitted, but other ASCII characters
+can be added to these sets.
+
+###### token config parameters
+ char *word_start;
+ char *word_cont;
+
+###### internal functions
+ static int is_word_start(wchar_t ch, struct token_config *conf)
+ {
+ return iswalpha(ch) ||
+ strchr(conf->word_start, ch) != NULL ||
+ u_hasBinaryProperty(ch, UCHAR_ID_START);
+ }
+
+ static int is_word_continue(wchar_t ch, struct token_config *conf)
+ {
+ return iswalnum(ch) ||
+ strchr(conf->word_cont, ch) != NULL ||
+ u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
+ }
+
+Words can be either known or unknown. Known words are referred to as
+"reserved words" and get a unique token number. Unknown words are
+"identifiers" and are syntactically a single token.
+
+###### token types
+ TK_ident,
+
+A list of known words must be provided. This list is shared with the
+"marks" which are described next. The list must be lexically sorted
+and the length of the list must be given (`known_count`).
+Tokens matching these known words are reported as the index of the
+list added to `TK_reserved`.
+
+###### token config parameters
+ char **words_marks;
+ int known_count;
+
+###### parse word
+
+ if (is_word_start(ch, state->conf)) {
+ int n;
+ /* A word: identifier or reserved */
+ do
+ ch = get_char(state);
+ while (is_word_continue(ch, state->conf));
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_ident;
+ if (ignored & (1<<TK_ident))
+ tk.num = TK_error;
+ n = find_known(state->conf, tk.txt);
+ if (n >= 0)
+ tk.num = TK_reserved + n;
+ return tk;
+ }
+
+### Marks
+
+Marks are generally one or more punctuation marks joined together. It
+would be nice to use the term "symbol" for these, but that causes
+confusion in a subsequent discussion of the grammar, which has terminal
+symbols and non-terminal symbols which are conceptually quite
+different. So strings of punctuation characters will be marks.
+
+A "mark" consists of ASCII characters that are not white space, are not
+"start" characters for words, and are not digits.
+These will collectively be called mark characters.
+
+###### internal functions
+ static int is_mark(wchar_t ch, struct token_config *conf)
+ {
+ return ch > ' ' &&
+ ch < 0x7f &&
+ !iswalnum(ch) &&
+ strchr(conf->word_start, ch) == NULL;
+ }
+
+As with words, there can be known and unknown marks, though the rules
+are slightly different.
+
+Two marks do not need to be separated by a non-mark character. This
+is different from words which do need to be separated by at least one
+non-continue character.
+
+The scanner will normally prefer longer sequences of mark characters,
+but will more strongly prefer known marks over unknown marks. So if
+it finds a known mark where adding one more character does not result
+in a known mark, it will return that first known mark.
+
+If no known mark is found we will test against strings and comments
+below before giving up and assuming an unknown mark.
+If `TK_mark` is ignored, then unknown marks are returned as an error.
+
+###### token types
+ TK_mark,
+
+Known marks are included in the same list as the list of known words.
+
+###### parse mark
+ tk.num = TK_error;
+ while (is_mark(ch, state->conf)) {
+ int n;
+ close_token(state, &tk);
+ n = find_known(state->conf, tk.txt);
+ if (n >= 0)
+ tk.num = TK_reserved + n;
+ else if (tk.num != TK_error) {
+ /* found a longest-known-mark */
+ unget_char(state);
+ close_token(state, &tk);
+ return tk;
+ }
+ ch = get_char(state);
+ }
+ unget_char(state);
+ if (tk.num != TK_error)
+ return tk;
+
+###### unknown mark
+ if (tk.txt.len) {
+ if (ignored & (1<<TK_mark))
+ tk.num = TK_error;
+ else
+ tk.num = TK_mark;
+ return tk;
+ }
+
+### Strings
+
+Strings start with one of single quote, double quote, or back quote
+and continue until a matching character on the same line. Any of
+these characters can be included in the list of known marks and then
+they will not be used for identifying strings.
+
+Immediately following the close quote one or two ASCII letters may
+appear. These are somewhat like the arbitrary letters allowed in
+"Numbers" above. They can be used by the language in various ways.
+
+If 3 identical quote characters appear in a row and are
+followed by a newline, then this forms a multi-line string which
+continues until an identical triple quote appears on a line preceded
+only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
+
+Multi-line strings may not extend beyond the end of the `code_node` in
+which they start.
+
+Normal strings and multi-line strings are encoded as two different
+token types.
+
+###### token types
+ TK_string,
+ TK_multi_string,
+
+###### internal functions
+ static int is_quote(wchar_t ch)
+ {
+ return ch == '\'' || ch == '"' || ch == '`';
+ }
+
+#### Multi-line strings
+
+The multi-line string is checked for first. If they are being
+ignored, we fall through and treat a triple quote as an empty string
+followed by the start of a new string.
+
+###### parse string
+ if (tk.txt.len == 3 &&
+ !(ignored & (1 << TK_multi_string)) &&
+ is_quote(tk.txt.txt[0]) &&
+ memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
+ is_newline(tk.txt.txt[3])) {
+ // triple quote
+ wchar_t first = tk.txt.txt[0];
+ int qseen = 0;
+ int at_sol = 1;
+ while (!at_eon(state) && qseen < 3) {
+ ch = get_char(state);
+ if (is_newline(ch)) {
+ at_sol = 1;
+ qseen = 0;
+ } else if (at_sol && ch == first) {
+ qseen += 1;
+ } else if (ch != ' ' && ch != '\t') {
+ at_sol = 0;
+ qseen = 0;
+ }
+ }
+ if (qseen != 3) {
+ /* Hit end of node - error.
+ * unget so the newline is seen,
+ * but return rest of string as an error.
+ */
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_error;
+ return tk;
+ }
+ /* 2 letters are allowed */
+ ch = get_char(state);
+ if (iswalpha(ch))
+ ch = get_char(state);
+ if (iswalpha(ch))
+ ch = get_char(state);
+ /* Now we must have a newline, but we don't return it
+ * whatever it is.*/
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_multi_string;
+ if (!is_newline(ch))
+ tk.num = TK_error;
+ return tk;
+ }
+
+#### Single-line strings
+
+The sequence of marks collected may be more than a single-line
+string, so we reset to the start and collect characters until
+we find a close quote or a newline.
+
+If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
+
+###### parse string
+ if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
+ !(ignored & (1<<TK_string))) {
+ wchar_t first = tk.txt.txt[0];
+ reset_token(state, &tk);
+ get_char(state);
+ do
+ ch = get_char(state);
+ while (ch != first && !is_newline(ch));
+ tk.num = TK_string;
+ if (is_newline(ch)) {
+ unget_char(state);
+ tk.num = TK_error;
+ }
+ close_token(state, &tk);
+ return tk;
+ }
+
+### Comments
+
+Single line comments may start with '`//`' or '`#`' providing that these
+are not known marks. They continue to the end of the line.
+
+Block comments start with '`/*`' if this is not a known mark. They
+continue to the first occurrence of '`*/`' and may not contain any
+occurrence of '`/*`'.
+
+Block comments can be wholly within one line or can continue over
+multiple lines. The multi-line version should be followed immediately
+by a newline. The Linux kernel contains over 285000 multi-line
+comments and only 34 are followed by characters other than white space
+(which should be removed) or a backslash (only needed in macros). So
+it would not suffer from this rule.
+
+These two comment types are reported as two separate token types, and
+consequently can be ignored separately. When ignored a comment is
+parsed and discarded.
+
+###### token types
+ TK_line_comment,
+ TK_block_comment,
+
+###### internal functions
+ static int is_line_comment(struct text txt)
+ {
+ return (txt.len >= 1 && txt.txt[0] == '#') ||
+ (txt.len >= 2 && txt.txt[0] == '/' &&
+ txt.txt[1] == '/');
+ }
+
+ static int is_block_comment(struct text txt)
+ {
+ return txt.len >= 2 && txt.txt[0] == '/' &&
+ txt.txt[1] == '*';
+ }
+
+#### Single line comments
+
+A single-line comment continues up to, but not including the newline.
+
+###### parse comment
+
+ if (is_line_comment(tk.txt)) {
+ while (!is_newline(ch))
+ ch = get_char(state);
+ unget_char(state);
+ close_token(state, &tk);
+ tk.num = TK_line_comment;
+ if (ignored & (1 << TK_line_comment))
+ continue;
+ return tk;
+ }
+
+#### Block comments
+
+The token text collected so far could exceed the comment, so we need
+to reset it first.
+
+If we find an embedded `/*` we reset to just before the '/' and report
+an error. That way the next thing to be parsed will be the rest of
+the comment. This requires a double unget, so we need to save/restore
+the unget state (explained later).
+
+###### parse comment
+
+ if (is_block_comment(tk.txt)) {
+ wchar_t prev;
+ int newlines = 0;
+ reset_token(state, &tk);
+ get_char(state);
+ get_char(state);
+ save_unget_state(state);
+ ch = get_char(state);
+ prev = 0;
+ while (!at_eon(state) &&
+ (prev != '/' || ch != '*') &&
+ (prev != '*' || ch != '/')) {
+ if (is_newline(ch))
+ newlines = 1;
+ prev = ch;
+ save_unget_state(state);
+ ch = get_char(state);
+ }
+ close_token(state, &tk);
+ if (at_eon(state)) {
+ tk.num = TK_error;
+ return tk;
+ }
+ if (prev == '/') {
+ /* embedded. Need to unget twice! */
+ restore_unget_state(state);
+ unget_char(state);
+ tk.num = TK_error;
+ return tk;
+ }
+ tk.num = TK_block_comment;
+ if (newlines && !(ignored & (1<<TK_newline))) {
+ /* next char must be newline */
+ ch = get_char(state);
+ unget_char(state);
+ if (!is_newline(ch))
+ tk.num = TK_error;
+ }
+ if (tk.num == TK_error ||
+ !(ignored & (1 << TK_block_comment)))
+ return tk;
+ continue;
+ }
+
+### Indents, Newlines, and White Space.
+
+Normally white space is ignored. However newlines can be important as
+can indents, which are either after a newline or at the start of a
+node (detected by `at_son()`);
+
+###### exported functions
+ static inline int is_newline(wchar_t ch)
+ {
+ return ch == '\n' || ch == '\f' || ch == '\v';
+ }
+
+###### white space
+ if (ch <= ' ' && !is_newline(ch)
+ && ! at_son(state))
+ continue;
+
+If a line starts with more white-space than the previous non-blank
+line - or if the first non-blank line in the document starts with any
+white-space - then an Indent is reported at the start of the line.
+
+Before the next non-blank line which starts with less white space, or
+at the latest at the end of the document, a matching Undent token
+is reported. There will always be an exact match between Indent and
+Undent tokens.
+
+It is possible for Undent to be followed (almost) immediately by an
+Indent. This happens if, for example, the indent of three consecutive
+lines are 0, 8, 4 spaces. Before the second line we report an
+Indent. Before the third line we must report an Undent, as 4 is less
+than 8, then also an Indent as 4 is greater than 0.
+
+###### token types
+ TK_indent,
+ TK_undent,
+
+For the purpose of measuring the length of white space, a tab adds at
+least one space, and rounds up to a multiple of 8.
+
+###### exported functions
+ static inline int indent_tab(int indent)
+ {
+ return (indent|7)+1;
+ }
+
+We need to track the current levels of indent. This requires some
+sort of stack as indent levels are pushed on and popped off. In
+practice this stack is unlikely to often exceed 5 so we will use a
+fixed stack of 20 indent levels. More than this will be silently
+ignored.
+
+###### state fields
+ int indent_level;
+ int indent_sizes[20];
+
+#### Newlines
+
+Newlines can optionally be reported. Newlines within a block comment
+or a multi-line string are not reported separately, but each of these
+must be followed immediately by a newline so these constructs cannot
+hide the fact that a newline was present.
+
+When Indents are being reported, the Newline which would normally be
+reported immediately before the Indent is delayed until after the
+matching undent. This makes an indented section act like a
+continuation of the previous line to some extent.
+
+A blank line would normally be reported simply as two consecutive Newline
+tokens. However if the subsequent line is indented (and indents are being
+reported) then the right thing to do is less obvious as Newlines should be
+delayed - but how many Newlines?
+
+The approach we will take is to report the extra Newlines immediately after
+the Indent token, so the blank line is treated as though it were an indented
+blank line.
+
+###### token types
+ TK_newline,
+
+If we find a newline or white space at the start of a block, we keep
+collecting spaces, tabs, and newlines until we find some real text.
+Then depending on the indent we generate some number of tokens. These
+will be a sequence of "Newline Undent" pairs representing a decrease
+in indent, then either a Newline or an Indent depending on whether the
+next line is indented, then zero or more Newlines representing all the
+blank lines that have been skipped.
+
+When a Newline leads to the next block of code there is a question of
+whether the various Newline and Undent/Indent tokens should appear to
+belong to the earlier or later block. This is addressed by processing
+the tokens in two stages based on the relative indent levels of the
+two blocks (each block has a base indent to which the actual indents
+are added).
+
+Any "Newline Undent" pairs needed to reduce the current indent to the
+maximum of the base indents of the old and new blocks are generated
+against the old block. Then if the next block does not have an
+increased indent, one more "Newline" is generated.
+
+If further "Newline Undent" pairs are needed to get to the indent
+level of the 'next' block, they are generated against that block,
+though the first Newline is suppressed (it having already been
+generated).
+
+Finally the Newline or Indent for the first line of the new block is
+generated, unless the Newline needs to be suppressed because it
+appeared at the end of the previous block.
+
+This means that a block may start with an Undent or an Indent, but
+will only start with a Newline if it actually starts with a blank
+line.
+
+We will need to represent in the `token_state` where in this sequence
+of delayed tokens we are. As `state.col` records the target indent we
+don't need to record how many undents or indents are needed. We do
+need to record the number of blank lines, and which of Newline and
+Undent is needed next in the initial sequence of pairs.
+
+For this we store one more than the number of blank lines as
+`delayed_lines` and a flag for `undent_next`.
+
+###### state fields
+ int check_indent;
+ int delayed_lines;
+ int undent_next;
+
+Generating these tokens involve two separate pieces of code.
+
+Firstly we need to recognise white space and count the indents and
+newlines. These are recorded in the above state fields.
+
+Separately, on each call to `token_next`, we need to check if
+there are some delayed tokens and if so we need to advance the state
+information and return one token.
+
+###### white space
+ if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
+ int newlines = 0;
+ int was_son = at_son(state);
+ if (ignored & (1<<TK_indent)) {
+ if (!is_newline(ch))
+ continue;
+ if (ignored & (1<<TK_newline))
+ continue;
+ tk.num = TK_newline;
+ return tk;
+ }
+ // Indents are needed, so check all white space.
+ while (ch <= ' ' && !at_eon(state)) {
+ if (is_newline(ch))
+ newlines += 1;
+ ch = get_char(state);
+ }
+ if (at_eon(state)) {
+ newlines += 1;
+ if (state->node->next &&
+ state->node->next->indent > state->node->indent)
+ state->col = state->node->next->indent;
+ else
+ state->col = state->node->indent;
+ } else
+ unget_char(state);
+ state->delayed_lines = newlines;
+ state->undent_next = was_son;
+ state->check_indent = 1;
+ continue;
+ }
+
+
+###### delayed tokens
+
+ if (state->check_indent || state->delayed_lines) {
+ if (state->col < state->indent_sizes[state->indent_level]) {
+ if (!state->undent_next &&
+ !(ignored & (1<<TK_newline))) {
+ state->undent_next = 1;
+ tk.num = TK_newline;
+ return tk;
+ }
+ state->indent_level -= 1;
+ state->undent_next = 0;
+ tk.num = TK_undent;
+ return tk;
+ }
+ if (state->col > state->indent_sizes[state->indent_level] &&
+ state->indent_level < sizeof(state->indent_sizes)-1) {
+ state->indent_level += 1;
+ state->indent_sizes[state->indent_level] = state->col;
+ state->delayed_lines -= 1;
+ tk.num = TK_indent;
+ return tk;
+ }
+ state->check_indent = 0;
+ if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
+ tk.num = TK_newline;
+ state->delayed_lines -= 1;
+ return tk;
+ }
+ state->delayed_lines = 0;
+ continue;
+ }
+
+### End of File
+
+After the last newline in the file has been processed, a special
+end-of-file token will be returned. Any further attempts to get more
+tokens will continue to return the same end-of-file token.
+
+###### token types
+ TK_eof,
+
+
+###### white space
+ if (ch == WEOF) {
+ tk.num = TK_eof;
+ return tk;
+ }
+
+### Unknown Marks, or errors.
+
+We have now handled all the possible known mark-like tokens.
+If the token we have is not empty and `TK_mark` is allowed,
+we have an unknown mark, otherwise this must be an error.
+
+###### unknown mark
+ /* one unknown character */
+ close_token(state, &tk);
+ tk.num = TK_error;
+ return tk;
+
+## Tools For The Task
+
+You may have noticed that there are a few gaps we left in the above -
+functions used without first defining them. Doing so above would have
+broken the flow.
+
+### Character by character
+
+As we walk through the various `code_node`s we need to process whole
+Unicode codepoints, and keep track of which line and column we are on.
+We will assume for now that any printing character uses one column,
+though that is not true in general.
+
+As the text in a `code_node` may include an indent that identifies it as
+being code, we need to be careful to strip that. The `code_node` has
+a flag that tells us whether or not we need to strip.
+
+###### includes
+ #include <memory.h>
+
+###### state fields
+ struct code_node *node;
+ int offset;
+ int line;
+ int col;
+
+###### internal functions
+
+	static void do_strip(struct token_state *state)
+	{
+		if (state->node->needs_strip) {
+			int n = 4;
+			while (n && state->node->code.txt[state->offset] == ' ') {
+				state->offset += 1;
+				n -= 1;
+			}
+			while (n == 4 && state->node->code.txt[state->offset] == '\t') {
+				state->offset += 1;
+				n -= 4;
+			}
+		}
+	}
+
+ static wint_t get_char(struct token_state *state)
+ {
+ wchar_t next;
+ size_t n;
+ mbstate_t mbstate;
+
+ if (state->node == NULL)
+ return WEOF;
+ if (state->node->code.len <= state->offset) {
+ do
+ state->node = state->node->next;
+ while (state->node && state->node->code.txt == NULL);
+ state->offset = 0;
+ if (state->node == NULL)
+ return WEOF;
+ do_strip(state);
+ state->line = state->node->line_no;
+ state->col = state->node->indent;
+ }
+
+ ## before get_char
+
+ memset(&mbstate, 0, sizeof(mbstate));
+
+ n = mbrtowc(&next, state->node->code.txt + state->offset,
+ state->node->code.len - state->offset,
+ &mbstate);
+ if (n == -2 || n == 0) {
+ /* Not enough bytes - not really possible */
+ next = '\n';
+ state->offset = state->node->code.len;
+ } else if (n == -1) {
+ /* error */
+ state->offset += 1;
+ next = 0x7f; // an illegal character
+ } else
+ state->offset += n;
+
+ if (next >= ' ') {
+ state->col += 1;
+ } else if (is_newline(next)) {
+ state->line += 1;
+ state->col = state->node->indent;
+ do_strip(state);
+ } else if (next == '\t') {
+ state->col = indent_tab(state->col);
+ }
+ return next;
+ }
+
+We will sometimes want to "unget" the last character as it needs to be
+considered again as part of the next token. So we need to store a
+'previous' version of all metadata.
+
+###### state fields
+ int prev_offset;
+ int prev_line;
+ int prev_col;
+
+###### before get_char
+ state->prev_offset = state->offset;
+ state->prev_line = state->line;
+ state->prev_col = state->col;
+
+###### internal functions
+
+ static void unget_char(struct token_state *state)
+ {
+ if (state->node) {
+ state->offset = state->prev_offset;
+ state->line = state->prev_line;
+ state->col = state->prev_col;
+ }
+ }
+
+We occasionally need a double-unget, particularly for numbers and
+block comments. We don't impose this cost on all scanning, but
+require those code sections that need it to call `save_unget_state`
+before each `get_char`, and then `restore_unget_state` when a
+double-unget is needed.
+
+###### state fields
+ int prev_offset2;
+ int prev_line2;
+ int prev_col2;
+
+###### internal functions
+ static void save_unget_state(struct token_state *state)
+ {
+ state->prev_offset2 = state->prev_offset;
+ state->prev_line2 = state->prev_line;
+ state->prev_col2 = state->prev_col;
+ }
+
+ static void restore_unget_state(struct token_state *state)
+ {
+ state->prev_offset = state->prev_offset2;
+ state->prev_line = state->prev_line2;
+ state->prev_col = state->prev_col2;
+ }
+
+At the start of a token we don't want to be at the end of a code block
+if we can help it. To avoid this possibility, we 'get' and 'unget' a
+single character. This will move into the next non-empty code block
+and leave the current pointer at the start of it.
+
+This has to happen _after_ dealing with delayed tokens as some of them
+must appear in the previous node. When we do this, we need to reset
+the data in the token.
+
+###### delayed tokens
+ if (at_eon(state)) {
+ get_char(state);
+ unget_char(state);
+ tk.node = state->node;
+ if (state->node)
+ tk.txt.txt = state->node->code.txt + state->offset;
+ tk.line = state->line;
+ tk.col = state->col;
+ tk.txt.len = 0;
+ }
+
+### Managing tokens
+
+The current token is initialized to line up with the first character
+that we 'get' for each token. When we have, or might have, a full
+token we can call `close_token` to set the `len` of the token
+appropriately. This can safely be called multiple times.
+
+Finally we occasionally (for single-line strings and block comments)
+need to reset to the beginning of the current token as we might have
+parsed too much already. For that there is `reset_token`.
+
+###### one token
+ tk.node = state->node;
+ if (state->node)
+ tk.txt.txt = state->node->code.txt + state->offset;
+ tk.line = state->line;
+ tk.col = state->col;
+ tk.txt.len = 0;
+
+###### internal functions
+
+ static void close_token(struct token_state *state,
+ struct token *tk)
+ {
+ tk->txt.len = (state->node->code.txt + state->offset)
+ - tk->txt.txt;
+ }
+
+ static void reset_token(struct token_state *state, struct token *tok)
+ {
+ state->prev_line = tok->line;
+ state->prev_col = tok->col;
+ state->prev_offset = tok->txt.txt - state->node->code.txt;
+ unget_char(state);
+ tok->txt.len = 0;
+ }
+
+
+Tokens may not cross into the next `code_node`, and some tokens can
+include the newline at the end of a `code_node`, so we must be able to
+easily check if we have reached the end. Equally we need to know if
+we are at the start of a node, as white space is treated a little
+differently there.
+
+###### internal functions
+
+ static int at_son(struct token_state *state)
+ {
+ return state->offset == 0;
+ }
+
+ static int at_eon(struct token_state *state)
+ {
+ // at end-of-node ??
+ return state->node == NULL ||
+ state->offset >= state->node->code.len;
+ }
+
+### Find a known word
+
+As the known-word list is sorted we can use a simple binary search.
+Following the pattern established in "mdcode", we will use a `struct
+text` with start and length to represent the code fragment we are
+searching for.
+
+###### internal functions
+ static int find_known(struct token_config *conf, struct text txt)
+ {
+ int lo = 0;
+ int hi = conf->known_count;
+
+ while (lo + 1 < hi) {
+ int mid = (lo + hi) / 2;
+ int cmp = strncmp(conf->words_marks[mid],
+ txt.txt, txt.len);
+ if (cmp == 0 && conf->words_marks[mid][txt.len])
+ cmp = 1;
+ if (cmp <= 0)
+ lo = mid;
+ else
+ hi = mid;
+ }
+ if (strncmp(conf->words_marks[lo],
+ txt.txt, txt.len) == 0
+ && conf->words_marks[lo][txt.len] == 0)
+ return lo;
+ else
+ return -1;
+ }
+
+### Bringing it all together
+
+Now we have all the bits there is just one section missing: combining
+all the token parsing code into one block.
+
+The handling of delayed tokens (newlines, indents, undents) must come
+first before we try getting another character.
+
+Then we parse all the text, making sure that we check for known marks
+before strings and comments, but unknown marks after strings and comments.
+
+This block of code will either return a token, or will choose to
+ignore one, in which case it will `continue` around to the top of the
+loop.
+
+###### one token
+ ## delayed tokens
+
+ ch = get_char(state);
+
+ ## white space
+ ## parse number
+ ## parse word
+ ## parse mark
+ ## parse string
+ ## parse comment
+ ## unknown mark
+
+### Start and stop
+
+As well as getting tokens, we need to be able to create the
+`token_state` to start with, and discard it later.
+
+###### includes
+ #include <malloc.h>
+
+###### main functions
+ struct token_state *token_open(struct code_node *code, struct
+ token_config *conf)
+ {
+ struct token_state *state = malloc(sizeof(*state));
+ memset(state, 0, sizeof(*state));
+ state->node = code;
+ state->line = code->line_no;
+ state->conf = conf;
+ return state;
+ }
+ void token_close(struct token_state *state)
+ {
+ free(state);
+ }
+
+###### exported functions
+ struct token_state *token_open(struct code_node *code, struct
+ token_config *conf);
+ void token_close(struct token_state *state);
+
+### Trace tokens
+
+Getting tokens is the main thing but it is also useful to be able to
+print out token information, particularly for tracing and testing.
+
+Known tokens are printed verbatim. Other tokens are printed as
+`type(content)` where content is truncated to a given number of characters.
+
+The function for printing a truncated string (`text_dump`) is also exported
+so that it can be used to tracing processed strings too.
+
+###### includes
+ #include <stdio.h>
+
+###### exported functions
+ void token_trace(FILE *f, struct token tok, int max);
+ void text_dump(FILE *f, struct text t, int max);
+
+###### main functions
+
+ /* Print up to `max` bytes of `txt` to `f`. Printable ASCII is
+ * emitted as-is, backslash is doubled, and any other byte is
+ * shown as a \xNN escape. When the text is longer than `max`,
+ * two positions are reserved for a ".." truncation marker.
+ * NOTE(review): `max` bounds the bytes consumed, not the output
+ * width -- escaped bytes expand to four characters each.
+ */
+ void text_dump(FILE *f, struct text txt, int max)
+ {
+ int i;
+ if (txt.len > max)
+ max -= 2;
+ else
+ max = txt.len;
+ for (i = 0; i < max; i++) {
+ char c = txt.txt[i];
+ if (c < ' ' || c > '~')
+ fprintf(f, "\\x%02x", c & 0xff);
+ else if (c == '\\')
+ fprintf(f, "\\\\");
+ else
+ fprintf(f, "%c", c);
+ }
+ if (i < txt.len)
+ fprintf(f, "..");
+ }
+
+ /* Print a one-line description of `tok` to `f`. Known words
+ * and marks are printed verbatim; all other tokens print as
+ * type(content), with the content truncated to `max` bytes
+ * by text_dump().
+ */
+ void token_trace(FILE *f, struct token tok, int max)
+ {
+ /* must be kept in sync with the TK_* token numbers */
+ static char *types[] = {
+ [TK_ident] = "ident",
+ [TK_mark] = "mark",
+ [TK_number] = "number",
+ [TK_string] = "string",
+ [TK_multi_string] = "mstring",
+ [TK_line_comment] = "lcomment",
+ [TK_block_comment] = "bcomment",
+ [TK_indent] = "indent",
+ [TK_undent] = "undent",
+ [TK_newline] = "newline",
+ [TK_eof] = "eof",
+ [TK_error] = "ERROR",
+ };
+
+ switch (tok.num) {
+ default: /* known word or mark */
+ fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
+ break;
+ case TK_indent:
+ case TK_undent:
+ case TK_newline:
+ case TK_eof:
+ /* No token text included */
+ fprintf(f, "%s()", types[tok.num]);
+ break;
+ case TK_ident:
+ case TK_mark:
+ case TK_number:
+ case TK_string:
+ case TK_multi_string:
+ case TK_line_comment:
+ case TK_block_comment:
+ case TK_error:
+ fprintf(f, "%s(", types[tok.num]);
+ text_dump(f, tok.txt, max);
+ fprintf(f, ")");
+ break;
+ }
+ }
+
+### And there we have it
+
+We now have all the library functions defined for reading and printing
+tokens. Now we just need C files to store them, and a mk file to make them.
+
+###### File: scanner.h
+ ## public types
+ ## exported functions
+
+###### File: libscanner.c
+ ## includes
+ #include "scanner.h"
+ ## private types
+ ## internal functions
+ ## main functions
+
+###### File: scanner.mk
+
+ CFLAGS += -Wall -g
+ all ::
+ scanner.mk scanner.h libscanner.c : scanner.mdc
+ ./md2c scanner.mdc
+ all :: libscanner.o
+ libscanner.o : libscanner.c
+ $(CC) $(CFLAGS) -c libscanner.c
+
+## Processing numbers
+
+Converting a `TK_number` token to a numerical value is a slightly
+higher level task than lexical analysis, and slightly lower than
+grammar parsing, so put it here - as an appendix if you like.
+
+Importantly it will be used by the same testing rig that is used for
+testing the token scanner.
+
+The numeric value that we will convert all numbers into is the `mpq_t`
+from the GNU high precision number library "libgmp".
+
+###### number includes
+ #include <gmp.h>
+ #include "mdcode.h"
+
+Firstly we need to be able to parse a string of digits in a given base
+and possibly with a decimal marker. We store this in an `mpz_t`
+integer and report the number of digits after the decimal mark.
+
+On error we return zero and ensure that the `mpz_t` has been freed, or
+had never been initialised.
+
+###### number functions
+
+ static int parse_digits(mpz_t num, struct text tok, int base,
+ int *placesp)
+ {
+ /* Accept digits up to 'base', ignore '_' and
+ * ' ' if they appear between two legal digits,
+ * and if `placesp` is not NULL, allow a single
+ * '.' or ',' and report the number of digits
+ * beyond there.
+ * Return number of characters processed (p),
+ * or 0 if something illegal was found.
+ */
+ int p;
+ int decimal = -1; // digits after marker
+ enum {Digit, Space, Other} prev = Other;
+ int digits = 0;
+
+ for (p = 0; p < tok.len; p++) {
+ int dig;
+ char c = tok.txt[p];
+
+ /* separators are only legal between two digits */
+ if (c == '_' || c == ' ') {
+ if (prev != Digit)
+ goto bad;
+ prev = Space;
+ continue;
+ }
+ if (c == '.' || c == ',') {
+ if (prev != Digit)
+ goto bad;
+ /* NOTE(review): this early return (second marker,
+ * or marker not wanted) leaves *placesp unset, and
+ * p-1 excludes the digit just before the marker
+ * even though it was already folded into num --
+ * confirm callers cope with both. */
+ if (!placesp || decimal >= 0)
+ return p-1;
+ decimal = 0;
+ prev = Other;
+ continue;
+ }
+ if (isdigit(c))
+ dig = c - '0';
+ else if (isupper(c))
+ dig = 10 + c - 'A';
+ else if (islower(c))
+ dig = 10 + c - 'a';
+ else
+ /* force the out-of-range test below */
+ dig = base;
+ if (dig >= base) {
+ /* do not consume a trailing separator */
+ if (prev == Space)
+ p--;
+ break;
+ }
+ prev = Digit;
+ /* defer mpz_init to the first digit, so the
+ * zero-digit paths never leave num initialised */
+ if (digits)
+ mpz_mul_ui(num, num, base);
+ else
+ mpz_init(num);
+ digits += 1;
+ mpz_add_ui(num, num, dig);
+ if (decimal >= 0)
+ decimal++;
+ }
+ if (digits == 0)
+ return 0;
+ if (placesp) {
+ if (decimal >= 0)
+ *placesp = decimal;
+ else
+ *placesp = 0;
+ }
+ return p;
+ bad:
+ if (digits)
+ mpz_clear(num);
+ return 0;
+ }
+
+###### number includes
+ #include <ctype.h>
+
+To parse a full number we need to consider the optional base, the
+mantissa, and the optional exponent. We will treat these one at a
+time.
+
+The base is indicated by a letter after a leading zero, which must be
+followed by a base letter or a period. The base also determines the
+character which will mark an exponent.
+
+###### number vars
+ int base = 10;
+ char expc = 'e';
+
+###### parse base
+
+ /* A leading zero may introduce a base prefix: 0x/0X for hex,
+ * 0o/0O for octal, 0b/0B for binary. Non-decimal bases use
+ * 'p' rather than 'e' as the exponent marker. A digit (or
+ * separator) directly after the zero is rejected. */
+ if (tok.txt[0] == '0' && tok.len > 1) {
+ int skip = 0;
+ switch(tok.txt[1]) {
+ case 'x':
+ case 'X':
+ base = 16;
+ skip = 2;
+ expc = 'p';
+ break;
+ case 'o':
+ case 'O':
+ base = 8;
+ skip = 2;
+ expc = 'p';
+ break;
+ case 'b':
+ case 'B':
+ base = 2;
+ skip = 2;
+ expc = 'p';
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case '_':
+ case ' ':
+ // another digit is not permitted
+ // after a zero.
+ return 0;
+ default:
+ // must be decimal marker or trailing
+ // letter, which are OK;
+ break;
+ }
+ tok.txt += skip;
+ tok.len -= skip;
+ }
+
+After the base is the mantissa, which may contain a decimal mark, so
+we need to record the number of places. We won't impose the number of
+places until we have the exponent as well.
+
+###### number vars
+ int places =0;
+ mpz_t mant;
+ int d;
+
+###### parse mantissa
+
+ /* Parse the mantissa digits (with optional decimal marker)
+ * into `mant`; `places` counts digits after the marker.
+ * The value is moved into the rational `num` immediately. */
+ d = parse_digits(mant, tok, base, &places);
+ if (d == 0)
+ return 0;
+ tok.txt += d;
+ tok.len -= d;
+ mpq_init(num);
+ mpq_set_z(num, mant);
+ mpz_clear(mant);
+
+After the mantissa number may come an exponent which may be positive
+or negative. We assume at this point that we have seen the exponent
+character `expc`.
+
+###### number vars
+ long lexp = 0;
+ mpz_t exp;
+ int esign = 1;
+
+###### parse exponent
+ /* Optional sign, then decimal digits; the exponent must fit
+ * in a signed long or the whole number is rejected. */
+ if (tok.len > 1) {
+ if (tok.txt[0] == '+') {
+ tok.txt++;
+ tok.len--;
+ } else if (tok.txt[0] == '-') {
+ esign = -1;
+ tok.txt++;
+ tok.len--;
+ }
+ }
+ d = parse_digits(exp, tok, 10, NULL);
+ if (d == 0) {
+ mpq_clear(num);
+ return 0;
+ }
+ if (!mpz_fits_slong_p(exp)) {
+ mpq_clear(num);
+ mpz_clear(exp);
+ return 0;
+ }
+ lexp = mpz_get_si(exp) * esign;
+ mpz_clear(exp);
+ tok.txt += d;
+ tok.len -= d;
+
+
+Now that we have the mantissa and the exponent we can multiply them
+together, also allowing for the number of digits after the decimal
+mark.
+
+For base 10, we simply subtract the decimal places from the exponent.
+For the other bases, as the exponent is always based on 2, even for
+octal and hex, we need a bit more detail.
+We then recover the sign from the exponent, as division is quite
+different from multiplication.
+
+###### calc exponent
+ /* Fold the digits-after-the-marker into the exponent:
+ * one unit per decimal place for base 10 (and one bit per
+ * binary digit), 4 bits per hex digit, 3 bits per octal
+ * digit, since non-decimal exponents are powers of two. */
+ switch (base) {
+ case 10:
+ case 2:
+ lexp -= places;
+ break;
+ case 16:
+ lexp -= 4*places;
+ break;
+ case 8:
+ lexp -= 3*places;
+ break;
+ }
+ /* separate magnitude and sign: multiplication and division
+ * are handled as distinct cases when applying the exponent */
+ if (lexp < 0) {
+ lexp = -lexp;
+ esign = -1;
+ } else
+ esign = 1;
+
+Imposing the exponent on the number is also very different for base 10
+than for the others. For the binary shift `gmp` provides a simple
+function. For base 10 we use something like Russian Peasant
+Multiplication.
+
+###### calc exponent
+ /* Apply the exponent. Decimal exponents multiply or divide
+ * by 10^lexp using repeated squaring (Russian Peasant
+ * Multiplication); binary exponents use gmp's shifts.
+ * Fix: the test was `esign > 1`, which is never true since
+ * esign is +/-1, so positive decimal exponents were divided
+ * instead of multiplied. It must be `esign > 0`, matching
+ * the binary branch below. */
+ if (expc == 'e') {
+ mpq_t tens;
+ mpq_init(tens);
+ mpq_set_ui(tens, 10, 1);
+ while (1) {
+ if (lexp & 1) {
+ if (esign > 0)
+ mpq_mul(num, num, tens);
+ else
+ mpq_div(num, num, tens);
+ }
+ lexp >>= 1;
+ if (lexp == 0)
+ break;
+ mpq_mul(tens, tens, tens);
+ }
+ mpq_clear(tens);
+ } else {
+ if (esign > 0)
+ mpq_mul_2exp(num, num, lexp);
+ else
+ mpq_div_2exp(num, num, lexp);
+ }
+
+Now we are ready to parse a number: the base, mantissa, and exponent.
+If all goes well we check for the possible trailing letters and
+return. Return value is 1 for success and 0 for failure.
+
+
+###### number functions
+ /* Parse the whole number in `tok` into `num`, which is
+ * initialised here on success. Up to two trailing letters
+ * are copied, NUL terminated, into tail[3]. Returns 1 on
+ * success; 0 on failure, with `num` cleared.
+ */
+ int number_parse(mpq_t num, char tail[3], struct text tok)
+ {
+ ## number vars
+ int i;
+
+ ## parse base
+ ## parse mantissa
+ /* an optional exponent introduced by expc (either case) */
+ if (tok.len > 1 && (tok.txt[0] == expc ||
+ tok.txt[0] == toupper(expc))) {
+ tok.txt++;
+ tok.len--;
+ ## parse exponent
+ }
+ ## calc exponent
+
+ /* at most two trailing letters are permitted */
+ for (i = 0; i < 2; i++) {
+ if (tok.len <= i)
+ break;
+ if (!isalpha(tok.txt[i]))
+ goto err;
+ tail[i] = tok.txt[i];
+ }
+ tail[i] = 0;
+ if (i == tok.len)
+ return 1;
+ err:
+ mpq_clear(num);
+ return 0;
+ }
+
+Number parsing goes in `libnumber.c`
+
+###### File: libnumber.c
+
+ #include <unistd.h>
+ #include <stdlib.h>
+
+ ## number includes
+ ## number functions
+
+###### File: number.h
+ int number_parse(mpq_t num, char tail[3], struct text tok);
+
+###### File: scanner.mk
+ all :: libnumber.o
+ libnumber.o : libnumber.c
+ $(CC) $(CFLAGS) -c libnumber.c
+
+## Processing strings
+
+Both `TK_string` and `TK_multi_string` require post-processing which
+can be one of two types: literal or with escapes processed.
+Even literal processing is non-trivial as the file may contain indents
+which need to be stripped.
+
+Errors can only occur when processing escapes. Any unrecognised
+character following the escape character will cause an error.
+
+Processing escapes and stripping indents can only make the string
+shorter, not longer, so we allocate a buffer which is the same size as
+the string and process into that.
+
+To request escape processing, we pass the character we want to use for
+quoting, usually '`\`'. To avoid escape processing we pass a zero.
+
+###### string main
+ /* Post-process the string token `tok` into a freshly
+ * malloced buffer reported via `str`. `escape` is the
+ * escape character to interpret, or 0 for literal text.
+ * Up to two trailing letters are copied into tail[3].
+ * Returns 1 on success; 0 on failure with str->txt NULL.
+ */
+ int string_parse(struct token *tok, char escape,
+ struct text *str, char tail[3])
+ {
+ ## string vars
+ struct text t = tok->txt;
+
+ str->txt = NULL;
+ ## strip tail
+ if (tok->num == TK_string) {
+ ## strip single
+ } else {
+ ## strip multi
+ }
+ /* processing can only shrink the text, so t.len bytes
+ * is always a sufficient buffer */
+ str->txt = malloc(t.len);
+ str->len = 0;
+
+ ## process string
+ return 1;
+ err:
+ free(str->txt);
+ str->txt = NULL;
+ return 0;
+ }
+
+### strip tail
+
+The tail of the string can be 0, 1, or 2 letters
+
+ i = t.len;
+ /* Peel at most two trailing ASCII letters into `tail`.
+ * Fix: the guards were `i >= 0`, which reads t.txt[-1]
+ * when i reaches 0 (empty or all-letter text); `i > 0`
+ * keeps the index in bounds. */
+ if (i > 0 && isalpha(t.txt[i-1]))
+ i -= 1;
+ if (i > 0 && isalpha(t.txt[i-1]))
+ i -= 1;
+ strncpy(tail, t.txt+i, t.len-i);
+ tail[t.len-i] = 0;
+ t.len = i;
+
+###### string vars
+ int i;
+
+### strip single
+
+Stripping the quote of a single-line string is trivial.
+The only part that is at all interesting is that quote character must
+be remembered.
+
+ /* remember the quote so \q escapes can reproduce it */
+ quote = t.txt[0];
+ if (t.txt[t.len-1] != quote)
+ goto err;
+ /* NOTE(review): a 1-byte token would leave t.len == -1
+ * after this; presumably the scanner never emits an
+ * unterminated TK_string -- confirm. */
+ t.txt += 1;
+ t.len -= 2;
+
+###### string vars
+ char quote;
+
+### strip multi
+
+For a multi-line string we have a little more work to do. We need to
+remove 3 quotes, not 1, and need to count the indent of the close
+quote as it will need to be stripped from all lines.
+
+ /* A multi-line string opens with three quotes followed by a
+ * newline and closes with three quotes; the indent of the
+ * closing line must later be stripped from every line. */
+ quote = t.txt[0];
+ if (t.len < 7 ||
+ t.txt[1] != quote || t.txt[2] != quote ||
+ !is_newline(t.txt[3]))
+ goto err;
+ t.txt += 4;
+ t.len -= 4;
+ /* strip the three closing quotes, one at a time */
+ i = t.len;
+ if (i <= 0 || t.txt[i-1] != quote)
+ goto err;
+ i -= 1;
+ if (i <= 0 || t.txt[i-1] != quote)
+ goto err;
+ i -= 1;
+ if (i <= 0 || t.txt[i-1] != quote)
+ goto err;
+ i -= 1;
+ t.len = i;
+ /* back up to the start of the closing-quote line ... */
+ while (i > 0 && !is_newline(t.txt[i-1]))
+ i--;
+ /* ... and measure its indent in columns */
+ indent = 0;
+ while (i < t.len) {
+ if (t.txt[i] == ' ')
+ indent += 1;
+ if (t.txt[i] == '\t')
+ indent = indent_tab(indent);
+ i++;
+ }
+
+###### string vars
+ int indent = 0;
+
+### process string
+
+Now we just take one byte at a time. trans-ASCII unicode won't look
+like anything we are interested in so it will just be copied byte by
+byte.
+
+ /* Copy one byte at a time into the output buffer, stripping
+ * the stored indent at each start of line and interpreting
+ * escapes (when `escape` is non-zero). UTF-8 continuation
+ * bytes can never equal an ASCII escape character, so
+ * multi-byte sequences pass through unchanged. */
+ cp = str->txt;
+ at_sol = 1;
+ for (i = 0; i < t.len; i++) {
+ char c;
+ if (at_sol) {
+ at_sol = 0;
+ ## strip indent
+ if (i >= t.len)
+ break;
+ }
+ c = t.txt[i];
+ if (c != escape) {
+ *cp = c;
+ cp += 1;
+ if (is_newline(c))
+ at_sol = 1;
+ } else if (i+1 >= t.len) {
+ // escape and end of string
+ goto err;
+ } else {
+ i += 1;
+ c = t.txt[i];
+ ## parse escape
+ }
+ }
+ str->len = cp - str->txt;
+
+###### string vars
+ char *cp;
+ int at_sol;
+
+### strip indent
+
+Every time we find a start of line, we strip spaces and tabs until the
+required indent is found.
+
+ /* At start of line, consume spaces and tabs until the
+ * indent of the closing quote has been skipped.
+ * Fix: a tab advanced the column with indent_tab(c) --
+ * passing the tab character itself ('\t' == 9) rather
+ * than the current column; the multi-line indent counter
+ * passes the running column, so do the same here. */
+ int skipped = 0;
+ while (i < t.len && skipped < indent) {
+ c = t.txt[i];
+ if (c == ' ')
+ skipped += 1;
+ else if (c == '\t')
+ skipped = indent_tab(skipped);
+ else
+ break;
+ i+= 1;
+ }
+
+### parse escape
+ /* Interpret the character after the escape: the simple C
+ * escapes, \q for the quote character, 3-digit octal,
+ * \xNN hex, and \u/\U unicode (4 or 8 hex digits encoded
+ * via wcrtomb). An escaped newline re-arms indent
+ * stripping; anything unrecognised is an error. */
+ switch (c) {
+ case 'n':
+ *cp++ = '\n'; break;
+ case 'r':
+ *cp++ = '\r'; break;
+ case 't':
+ *cp++ = '\t'; break;
+ case 'b':
+ *cp++ = '\b'; break;
+ case 'q':
+ *cp++ = quote; break;
+ case 'f':
+ *cp++ = '\f'; break;
+ case 'v':
+ *cp++ = '\v'; break;
+ case 'a':
+ *cp++ = '\a'; break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ // 3 digit octal number
+ if (i+2 >= t.len)
+ goto err;
+ // fix: the second range test re-checked t.txt[i+1],
+ // so a bad third digit (e.g. \018) slipped through
+ if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
+ t.txt[i+2] < '0' || t.txt[i+2] > '7')
+ goto err;
+ n = (t.txt[i ]-'0') * 64 +
+ (t.txt[i+1]-'0') * 8 +
+ (t.txt[i+2]-'0') * 1;
+ *cp++ = n;
+ i += 2;
+ break;
+ case 'x':
+ // 2 hex digits
+ n = take_hex(2, t.txt+i+1, t.len-i-1);
+ if (n < 0)
+ goto err;
+ *cp++ = n;
+ i += 2;
+ break;
+ case 'u':
+ case 'U':
+ // 4 or 8 hex digits for unicode
+ n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
+ if (n < 0)
+ goto err;
+ memset(&pstate, 0, sizeof(pstate));
+ n = wcrtomb(cp, n, &pstate);
+ if (n <= 0)
+ goto err;
+ cp += n;
+ i += c == 'u' ? 4 : 8;
+ break;
+ default:
+ if (c == escape)
+ *cp++ = c;
+ else if (is_newline(c))
+ at_sol = 1;
+ else
+ goto err;
+ }
+
+###### string vars
+ long n;
+ mbstate_t pstate;
+
+For `\x` `\u` and `\U` we need to collect a specific number of
+hexadecimal digits
+
+###### string functions
+
+ /* Read exactly `digits` hexadecimal digits from `cp` (which
+ * holds `l` bytes) and return their value, or -1 when there
+ * are too few characters or a non-hex character is found.
+ * NOTE(review): 8 digits can exceed a 32-bit long; fine on
+ * LP64, not portable to ILP32 -- confirm target platforms.
+ */
+ static long take_hex(int digits, char *cp, int l)
+ {
+ long n = 0;
+ if (l < digits)
+ return -1;
+ while (digits) {
+ /* use unsigned char so a high byte is never
+ * passed to isxdigit() as a negative value
+ * (undefined behaviour per C ctype rules) */
+ unsigned char c = *cp;
+ int d;
+ if (!isxdigit(c))
+ return -1;
+ if (isdigit(c))
+ d = c - '0';
+ else if (isupper(c))
+ d = 10 + c - 'A';
+ else
+ d = 10 + c - 'a';
+ n = n * 16 + d;
+ digits--;
+ cp++;
+ }
+ return n;
+ }
+
+###### File: libstring.c
+
+String parsing goes in `libstring.c`
+
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <stdio.h>
+ #include <string.h>
+ #include <ctype.h>
+ #include <wchar.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ ## string functions
+ ## string main
+
+###### File: string.h
+ int string_parse(struct token *tok, char escape,
+ struct text *str, char tail[3]);
+
+###### File: scanner.mk
+ all :: libstring.o
+ libstring.o : libstring.c
+ $(CC) $(CFLAGS) -c libstring.c
+
+
+## Testing
+
+As "untested code is buggy code" we need a program to easily test
+the scanner library. This will simply parse a given file and report
+the tokens one per line.
+
+###### File: scanner.c
+
+ #include <unistd.h>
+ #include <stdlib.h>
+ #include <fcntl.h>
+ #include <errno.h>
+ #include <sys/mman.h>
+ #include <string.h>
+ #include <stdio.h>
+ #include <gmp.h>
+ #include <locale.h>
+ #include "mdcode.h"
+ #include "scanner.h"
+ #include "number.h"
+ #include "string.h"
+
+ /* error count, reported via the exit status */
+ static int errs;
+ /* Error callback for code_extract(): count and report. */
+ static void pr_err(char *msg)
+ {
+ errs++;
+ fprintf(stderr, "%s\n", msg);
+ }
+
+ /* Tokenize every section of the given mdcode file, printing
+ * one token per line; number and string tokens are also fed
+ * through number_parse()/string_parse() to exercise them.
+ * Exits non-zero if any error token (or extract error) was
+ * seen.
+ */
+ int main(int argc, char *argv[])
+ {
+ int fd;
+ int len;
+ char *file;
+ struct token_state *state;
+ char *known[] = {
+ "==",
+ "else",
+ "if",
+ "then",
+ "while",
+ "{",
+ "}",
+ };
+ struct token_config conf = {
+ .word_start = "_$",
+ .word_cont = "",
+ .words_marks = known,
+ .number_chars = "., _+-",
+ .known_count = sizeof(known)/sizeof(known[0]),
+ .ignored = (0 << TK_line_comment)
+ |(0 << TK_block_comment),
+ };
+ struct section *table, *s, *prev;
+ setlocale(LC_ALL,"");
+ if (argc != 2) {
+ fprintf(stderr, "Usage: scanner file\n");
+ exit(2);
+ }
+ fd = open(argv[1], O_RDONLY);
+ if (fd < 0) {
+ fprintf(stderr, "scanner: cannot open %s: %s\n",
+ argv[1], strerror(errno));
+ exit(1);
+ }
+ /* NOTE(review): 2 is SEEK_END; lseek, mmap and
+ * code_extract results are not checked, and len is an
+ * int -- acceptable only because this is a test rig. */
+ len = lseek(fd, 0, 2);
+ file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
+ table = code_extract(file, file+len, pr_err);
+
+ /* walk the sections, freeing each one after tokenizing */
+ for (s = table; s;
+ (code_free(s->code), prev = s, s = s->next, free(prev))) {
+ printf("Tokenizing: %.*s\n", s->section.len,
+ s->section.txt);
+ state = token_open(s->code, &conf);
+ while(1) {
+ struct token tk = token_next(state);
+ printf("%d:%d ", tk.line, tk.col);
+ token_trace(stdout, tk, 20);
+ /* exercise the number post-processor */
+ if (tk.num == TK_number) {
+ mpq_t num;
+ char tail[3];
+ if (number_parse(num, tail,tk.txt)) {
+ printf(" %s ", tail);
+ mpq_out_str(stdout, 10, num);
+ mpq_clear(num);
+ } else
+ printf(" BAD NUMBER");
+ }
+ /* exercise the string post-processor;
+ * back-quoted strings are taken literally */
+ if (tk.num == TK_string ||
+ tk.num == TK_multi_string) {
+ char esc = '\\';
+ struct text str;
+ char tail[3];
+ if (tk.txt.txt[0] == '`')
+ esc = 0;
+ if (string_parse(&tk, esc,
+ &str, tail)) {
+ printf(" %s ", tail);
+ text_dump(stdout, str, 20);
+ free(str.txt);
+ } else
+ printf(" BAD STRING");
+ }
+ printf("\n");
+ if (tk.num == TK_error)
+ errs = 1;
+ if (tk.num == TK_eof)
+ break;
+ }
+ }
+ exit(!!errs);
+ }
+###### File: scanner.mk
+ scanner.c : scanner.mdc
+ ./md2c scanner.mdc
+ all :: scanner
+ # The link step uses libnumber.o and libstring.o, so they must
+ # be prerequisites too, or make may link before they are built
+ # and will not relink when they change.
+ scanner : scanner.o scanner.h libscanner.o libnumber.o libstring.o libmdcode.o mdcode.h
+ $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
+ libmdcode.o libnumber.o libstring.o -licuuc -lgmp
+ scanner.o : scanner.c
+ $(CC) $(CFLAGS) -c scanner.c
+