ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters are assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;
  37                 struct code_node *node;
  38                 struct text       txt;
  39                 int               line, col;
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguish via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88 The different tokens are numbers, words, marks, strings, comments,
  89 newlines, EOF, and indents, each of which is examined in detail below.
  90
  91 There are various cases where no token can be found in part of the
  92 input.  All of these will be reported as a `TK_error` token.
  93
  94 It is possible to declare a number of strings which form distinct
  95 tokens (rather than being grouped as e.g. 'word').  These are given
  96 token numbers from `TK_reserved` upwards.
  97
  98 ###### public types
  99         enum token_num {
 100                 TK_error,
 101                 ## token types
 102                 TK_reserved
 103         };
 104
 105 ### Numbers
 106
 107 Numbers are the messiest tokens to parse, primarily because they can
 108 contain characters that also have meaning outside of numbers and,
 109 particularly, immediately after numbers.
 110
 111 The obvious example is the '`-`' sign.  It can come inside a number for
 112 a negative exponent, or after a number as a subtraction operator.  To
 113 be sure we have parsed as best as possible we need to only allow the
 114 '`-`' inside a number if it is after an exponent character.  This can be
 115 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 116 digit, so we don't allow '`-`' after just any `e`.
 117
 118 To make matters worse, our language designer has decided to experiment
 119 with allowing commas to be used as the decimal indicator, and spaces
 120 to be used to separate groups of digits in large numbers.  Both of
 121 these can reasonably be restricted to appear between two digits, so we
 122 have to add that condition to our tests.  For consistency we require
 123 every non-alpha-numeric to appear between two hex digits, with the
 124 exception that a sign can appear only after a 'p' or 'e', and a space
 125 can only appear between decimal digits.  Allowing a space before a
 126 letter easily leads to confusion, such a in `a < 3 and b < 4`.
 127
 128 So we cannot just treat numbers as starting with a digit and being
 129 followed by some set of characters.  We need more structure than that.
 130
 131 So:
 132
 133 - Numbers must start with a digit.
 134 - If the first digit is zero, the next character should be a base
 135   signifier (one of `xob`) or a decimal marker (`.` or `,`) (though this isn't
 136   enforced at this stage)
 137   In the first case the only first `p` or `P` may be followed by a sign.
 138 - If the number doesn't start with `0` followed by one of `xob`, the
 139   first `e` may be followed by a sign.
 140 - A sign must always be followed by a digit.
 141 - Any digit may be followed by a space or underscore and any hex digit
 142   maybe followed by an underscore, providing that the subsequence character
 143   is also a digit (for space) or hex digit (for underscore).
 144   This rule will require an extra level of 'unget' to be
 145   supported when handling characters.
 146 - Otherwise any digits or ASCII letters are allowed.  We do not at
 147   this point check that the digits given are permitted by the base.
 148   That will happen when the token is converted to a number.
 149
 150 To allow easy configuration, the various non alphanumeric characters
 151 are only permitted if they are listed in a configuration parameter.
 152
 153 ###### token config parameters
 154         char *number_chars;
 155
 156 Note that numbers may not start with a period, so `.75` is not a
 157 number.  This is not the norm, but is not unheard of.  Excluding these
 158 numbers simplifies the rule at very little cost.
 159
 160 ###### token types
 161         TK_number,
 162
 163 If TK_number is ignored, digits will result in an error unless they
 164 are declared to be a start character for words.
 165
 166 ###### includes
 167
 168         #include <string.h>
 169
 170 ###### parse number
 171
 172         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 173                 int prev = 0;
 174                 int expect_p = 0;
 175                 int decimal_mark = 0;
 176                 if (ch == '0') {
 177                         wchar_t ch2 = get_char(state);
 178                         if (strchr("xobXOB", ch2) != NULL)
 179                                 expect_p = 1;
 180                         unget_char(state);
 181                 }
 182                 while (1) {
 183                         int sign_ok = 0;
 184                         switch(expect_p) {
 185                         case 0:
 186                                 if (ch == 'e' || ch == 'E') {
 187                                         sign_ok = 1;
 188                                         decimal_mark = 1;
 189                                 }
 190                                 break;
 191                         case 1:
 192                                 if (ch == 'p' || ch == 'P') {
 193                                         sign_ok = 1;
 194                                         decimal_mark = 1;
 195                                 }
 196                                 break;
 197                         }
 198                         save_unget_state(state);
 199                         prev = ch;
 200                         ch = get_char(state);
 201
 202                         if (!iswalnum(prev)) {
 203                                 /* special characters, like separators and decimal marks
 204                                  * and signs, must be followed by a hexdigit, and the
 205                                  * space and signs must be followed by a decimal digit.
 206                                  */
 207                                 if (!iswxdigit(ch) ||
 208                                    ((prev == '-' || prev == '+') && !iswdigit(ch)) ||
 209                                    (prev == ' ' && !iswdigit(ch))) {
 210                                         /* don't want the new char or the special */
 211                                         restore_unget_state(state);
 212                                         break;
 213                                 }
 214                         }
 215                         if (iswalnum(ch))
 216                                 continue;
 217
 218                         if (!strchr(state->conf->number_chars, ch)) {
 219                                 /* non-number char */
 220                                 break;
 221                         }
 222                         if (ch == '+' || ch == '-') {
 223                                 /* previous must be 'e' or 'p' in appropraite context */
 224                                 if (!sign_ok)
 225                                         break;
 226                                 expect_p = -1;
 227                         } else if (ch == ' ') {
 228                                 /* previous must be a digit */
 229                                 if (!iswdigit(prev))
 230                                         break;
 231                         } else {
 232                                 /* previous must be a hex digit */
 233                                 if (!iswxdigit(prev))
 234                                         break;
 235                         }
 236                         if (ch == '.' || ch == ',') {
 237                                 /* only one of these permitted */
 238                                 if (decimal_mark)
 239                                         break;
 240                                 decimal_mark = 1;
 241                         }
 242                 }
 243                 /* We seem to have a "number" token */
 244                 unget_char(state);
 245                 close_token(state, &tk);
 246                 tk.num = TK_number;
 247                 return tk;
 248         }
 249
 250 ### Words
 251 Words start with a "start" character followed by the longest
 252 sequence of "continue" characters.  The Unicode ID_START and
 253 ID_CONTINUE sets are always permitted, but other ASCII characters
 254 can be added to these sets.
 255
 256 ###### token config parameters
 257         char *word_start;
 258         char *word_cont;
 259
 260 ###### internal functions
 261         static int is_word_start(wchar_t ch, struct token_config *conf)
 262         {
 263                 return iswalpha(ch) ||
 264                        strchr(conf->word_start, ch) != NULL ||
 265                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 266         }
 267
 268         static int is_word_continue(wchar_t ch, struct token_config *conf)
 269         {
 270                 return iswalnum(ch) ||
 271                        strchr(conf->word_cont, ch) != NULL ||
 272                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 273         }
 274
 275 Words can be either known or unknown.  Known words are referred to as
 276 "reserved words" and get a unique token number.  Unknown words are
 277 "identifiers" and are syntactically a single token.
 278
 279 ###### token types
 280         TK_ident,
 281
 282 A list of known words must be provided.  This list is shared with the
 283 "marks" which are described next.  The list must be lexically sorted
 284 and the length of the list must be given (`known_count`).
 285 Tokens matching these known words are reported as the index of the
 286 list added to `TK_reserved`.
 287
 288 If identifiers are ignored, then any word which is not listed as a
 289 known word results in an error.
 290
 291 ###### token config parameters
 292         const char **words_marks;
 293         int known_count;
 294
 295 ###### parse word
 296
 297         if (is_word_start(ch, state->conf)) {
 298                 int n;
 299                 /* A word: identifier or reserved */
 300                 do
 301                         ch = get_char(state);
 302                 while (is_word_continue(ch, state->conf));
 303                 unget_char(state);
 304                 close_token(state, &tk);
 305                 tk.num = TK_ident;
 306                 if (ignored & (1<<TK_ident))
 307                         tk.num = TK_error;
 308                 n = find_known(state->conf, tk.txt);
 309                 if (n >= 0)
 310                         tk.num = TK_reserved + n;
 311                 return tk;
 312         }
 313
 314 ### Marks
 315
 316 Marks are generally one or more punctuation marks joined together.  It
 317 would be nice to use the term "symbol" for these, but that causes
 318 confusion in a subsequent discussion of the grammar, which has terminal
 319 symbols and non-terminal symbols which are conceptually quite
 320 different.  So strings of punctuation characters will be marks.
 321
 322 A "mark" consists of ASCII characters that are not white space, are not
 323 "start" characters for words, and are not digits.
 324 These will collectively be called mark characters.
 325
 326 ###### internal functions
 327         static int is_mark(wchar_t ch, struct token_config *conf)
 328         {
 329                 return ch > ' ' &&
 330                        ch < 0x7f &&
 331                        !iswalnum(ch) &&
 332                        strchr(conf->word_start, ch) == NULL;
 333         }
 334
 335 As with words, there can be known and unknown marks, though the rules
 336 are slightly different.
 337
 338 Two marks do not need to be separated by a non-mark characters.  This
 339 is different from words which do need to be separated by at least one
 340 non-continue character.
 341
 342 The scanner will normally prefer longer sequences of mark characters,
 343 but will more strongly prefer known marks over unknown marks.  So if
 344 it finds a known mark where adding one more character does not result
 345 in a known mark, it will return that first known mark.
 346
 347 If no known mark is found we will test against strings and comments
 348 below before giving up and assuming an unknown mark.
 349
 350 If an unknown mark contains a quote character or a comment marker, and
 351 that token is not being ignored, then we terminate the unknown mark
 352 before that quote or comment.  This ensures that an unknown mark
 353 immediately before a string is handled correctly.
 354
 355 If the first character of a comment marker (i.e. '/') is a known mark,
 356 the above rules would suggest that the start of a comment would be
 357 parsed as that mark, which is not what is wanted.  So the introductory
 358 sequences for a comment ("//" and "/*") are treated as
 359 partially-known.  They prevent the leading "/" from being a mark by
 360 itself, but do not actually constitute a stand-alone mark.
 361
 362 If `TK_mark` is ignored, then unknown marks are returned as errors.
 363
 364 ###### token types
 365         TK_mark,
 366
 367 Known marks are included in the same list as the list of known words.
 368
 369 ###### parse mark
 370         tk.num = TK_error;
 371         while (is_mark(ch, state->conf)) {
 372                 int n;
 373                 wchar_t prev;
 374                 close_token(state, &tk);
 375                 n = find_known(state->conf, tk.txt);
 376                 if (n >= 0)
 377                         tk.num = TK_reserved + n;
 378                 else if (tk.num != TK_error) {
 379                         /* found a longest-known-mark, still need to
 380                          * check for comments
 381                          */
 382                         if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
 383                             (ch == '/' || ch == '*')) {
 384                                 /* Yes, this is a comment, not a '/' */
 385                                 restore_unget_state(state);
 386                                 tk.num = TK_error;
 387                                 break;
 388                         }
 389                         unget_char(state);
 390                         close_token(state, &tk);
 391                         return tk;
 392                 }
 393                 prev = ch;
 394                 save_unget_state(state);
 395                 ch = get_char(state);
 396                 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
 397                         /* If strings are allowed, a quote (Which isn't a known mark)
 398                          * mustn't be treated as part of an unknown mark.  It can be
 399                          * part of a multi-line srtings though.
 400                          */
 401                         break;
 402                 if (prev == '#' && n < 0)
 403                         /* '#' is not a known mark, so assume it is a comment */
 404                         break;
 405                 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
 406                         close_token(state, &tk);
 407                         restore_unget_state(state);
 408                         break;
 409                 }
 410                 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
 411                         close_token(state, &tk);
 412                         restore_unget_state(state);
 413                         break;
 414                 }
 415         }
 416         unget_char(state);
 417         if (tk.num != TK_error) {
 418                 close_token(state, &tk);
 419                 return tk;
 420         }
 421
 422 If we don't find a known mark, we will check for strings and comments
 423 before assuming that we have an unknown mark
 424
 425 ###### parse mark
 426         ## parse string
 427         ## parse comment
 428         ## unknown mark
 429
 430 ### Strings
 431
 432 Strings start with one of single quote, double quote, or back quote
 433 and continue until a matching character on the same line.  Any of
 434 these characters can be included in the list of known marks and then
 435 they will not be used for identifying strings.
 436
 437 Immediately following the close quote, one or two ASCII letters may
 438 appear.  These are somewhat like the arbitrary letters allowed in
 439 "Numbers" above.  They can be used by the language in various ways.
 440
 441 If 3 identical quote characters appear in a row and are
 442 followed by a newline, then this forms a multi-line string which
 443 continues until an identical triple quote appears on a line preceded
 444 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 445
 446 Multi-line strings may not extend beyond the end of the `code_node` in
 447 which they start.
 448
 449 Normal strings and multi-line strings are encoded as two different
 450 token types.
 451
 452 ###### token types
 453         TK_string,
 454         TK_multi_string,
 455
 456 ###### internal functions
 457         static int is_quote(wchar_t ch)
 458         {
 459                 return ch == '\'' || ch == '"' || ch == '`'; // "
 460         }
 461
 462 #### Multi-line strings
 463
 464 The multi-line string is checked for first.  If they are being
 465 ignored, we fall through and treat a triple quote as an empty string
 466 followed by the start of a new string.
 467
 468 ###### parse string
 469         if (tk.txt.len == 3 &&
 470             !(ignored & (1 << TK_multi_string)) &&
 471             is_quote(tk.txt.txt[0]) &&
 472             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 473             is_newline(tk.txt.txt[3])) {
 474                 // triple quote
 475                 wchar_t first = tk.txt.txt[0];
 476                 int qseen = 0;
 477                 int at_sol = 1;
 478                 while (!at_eon(state) && qseen < 3) {
 479                         ch = get_char(state);
 480                         if (is_newline(ch)) {
 481                                 at_sol = 1;
 482                                 qseen = 0;
 483                         } else if (at_sol && ch == first) {
 484                                 qseen += 1;
 485                         } else if (ch != ' ' && ch != '\t') {
 486                                 at_sol = 0;
 487                                 qseen = 0;
 488                         }
 489                 }
 490                 if (qseen != 3) {
 491                         /* Hit end of node - error.
 492                          * unget so the newline is seen,
 493                          * but return rest of string as an error.
 494                          */
 495                         if (is_newline(ch))
 496                                 unget_char(state);
 497                         close_token(state, &tk);
 498                         tk.num = TK_error;
 499                         return tk;
 500                 }
 501                 /* 2 letters are allowed */
 502                 ch = get_char(state);
 503                 if (iswalpha(ch))
 504                         ch = get_char(state);
 505                 if (iswalpha(ch))
 506                         ch = get_char(state);
 507                 /* Now we must have a newline, but we don't return it
 508                  * whatever it is.*/
 509                 unget_char(state);
 510                 close_token(state, &tk);
 511                 tk.num = TK_multi_string;
 512                 if (!is_newline(ch))
 513                         tk.num = TK_error;
 514                 return tk;
 515         }
 516
 517 #### Single-line strings
 518
 519 The sequence of marks collected may be more than a single-line
 520 string, so we reset to the start and collect characters until
 521 we find a close quote or a newline.
 522
 523 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 524
 525 ###### parse string
 526         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 527             !(ignored & (1<<TK_string))) {
 528                 wchar_t first = tk.txt.txt[0];
 529                 reset_token(state, &tk);
 530                 ch = get_char(state);
 531                 tk.num = TK_error;
 532                 while (!at_eon(state) && !is_newline(ch)) {
 533                         ch = get_char(state);
 534                         if (ch == first) {
 535                                 tk.num = TK_string;
 536                                 break;
 537                         }
 538                         if (is_newline(ch)) {
 539                                 unget_char(state);
 540                                 break;
 541                         }
 542                 }
 543                 while (!at_eon(state) && (ch = get_char(state)) &&
 544                                           iswalpha(ch))
 545                         ;
 546                 unget_char(state);
 547                 close_token(state, &tk);
 548                 return tk;
 549         }
 550
 551 ### Comments
 552
 553 Single line comments may start with '`//`' or '`#`' providing that these
 554 are not known marks.  They continue to the end of the line.
 555
 556 Block comments start with '`/*`' if this is not a known mark.  They
 557 continue to the first occurrence of '`*/`' and may not contain any
 558 occurrence of '`/*`'.
 559
 560 Block comments can be wholly within one line or can continue over
 561 multiple lines.  The multi-line version should be followed immediately
 562 by a newline.  The Linux kernel contains over 285000 multi-line
 563 comments are only 34 are followed by characters other than white space
 564 (which should be removed) or a backslash (only needed in macros).  So
 565 it would not suffer from this rule.
 566
 567 These two comment types are reported as two separate token types, and
 568 consequently can be ignored separately.  When ignored a comment is
 569 still parsed, but is discarded.
 570
 571 ###### token types
 572         TK_line_comment,
 573         TK_block_comment,
 574
 575 ###### internal functions
 576         static int is_line_comment(struct text txt)
 577         {
 578                 return (txt.len >= 1 && txt.txt[0] == '#') ||
 579                        (txt.len >= 2 && txt.txt[0] == '/' &&
 580                                         txt.txt[1] == '/');
 581         }
 582
 583         static int is_block_comment(struct text txt)
 584         {
 585                 return txt.len >= 2 && txt.txt[0] == '/' &&
 586                        txt.txt[1] == '*';
 587         }
 588
 589 #### Single line comments
 590
 591 A single-line comment continues up to, but not including the newline
 592 or end of node.
 593
 594 ###### parse comment
 595
 596         if (is_line_comment(tk.txt)) {
 597                 while (!is_newline(ch) && !at_eon(state))
 598                         ch = get_char(state);
 599                 if (is_newline(ch))
 600                         unget_char(state);
 601                 close_token(state, &tk);
 602                 tk.num = TK_line_comment;
 603                 if (ignored & (1 << TK_line_comment))
 604                         continue;
 605                 return tk;
 606         }
 607
 608 #### Block comments
 609
 610 The token text collected so far could exceed the comment, so we need
 611 to reset it first.
 612
 613 If we find an embedded `/*` we reset to just before the '/' and report
 614 an error.  That way the next thing to be parsed will be the rest of
 615 the comment.  This requires a double unget, so we need to save/restore
 616 the unget state (explained later).
 617
 618 ###### parse comment
 619
 620         if (is_block_comment(tk.txt)) {
 621                 wchar_t prev;
 622                 int newlines = 0;
 623                 reset_token(state, &tk);
 624                 get_char(state);
 625                 get_char(state);
 626                 save_unget_state(state);
 627                 ch = get_char(state);
 628                 prev = 0;
 629                 while (!at_eon(state) &&
 630                        (prev != '/' || ch != '*') &&
 631                        (prev != '*' || ch != '/')) {
 632                         if (is_newline(ch))
 633                                 newlines = 1;
 634                         prev = ch;
 635                         save_unget_state(state);
 636                         ch = get_char(state);
 637                 }
 638                 close_token(state, &tk);
 639                 if (at_eon(state)) {
 640                         tk.num = TK_error;
 641                         return tk;
 642                 }
 643                 if (prev == '/') {
 644                         /* embedded.  Need to unget twice! */
 645                         restore_unget_state(state);
 646                         unget_char(state);
 647                         tk.num = TK_error;
 648                         return tk;
 649                 }
 650                 tk.num = TK_block_comment;
 651                 if (newlines && !(ignored & (1<<TK_newline))) {
 652                         /* next char must be newline */
 653                         ch = get_char(state);
 654                         unget_char(state);
 655                         if (!is_newline(ch))
 656                                 tk.num = TK_error;
 657                 }
 658                 if (tk.num == TK_error ||
 659                     !(ignored & (1 << TK_block_comment)))
 660                         return tk;
 661                 continue;
 662         }
 663
 664 ### Indents, Newlines, and White Space.
 665
 666 Normally white space is ignored.  However newlines can be important as
 667 can indents, which are either after a newline or at the start of a
 668 node (detected by `at_son()`);
 669
 670 ###### exported functions
 671         static inline int is_newline(wchar_t ch)
 672         {
 673                 return ch == '\n' || ch == '\f' || ch == '\v';
 674         }
 675
 676 ###### white space
 677         if (ch <= ' ' && !is_newline(ch)
 678             && ! at_son(state))
 679                 continue;
 680
 681 If a line starts with more white-space than the previous non-blank
 682 line - or if the first non-blank line in the document starts with any
 683 white-space - then an "IN" is reported at the start of the line.
 684
 685 Before the next non-blank line which starts with less white space, or
 686 at the latest at the end of the document, a matching "OUT" token
 687 is reported.  There will always be an exact match between "IN" and
 688 "OUT" tokens.
 689
 690 It is possible for "OUT" to be followed (almost) immediately by an
 691 "IN".  This happens if, for example, the indent of three consecutive
 692 lines are 0, 8, 4 spaces.  Before the second line we report an
 693 "IN".  Before the third line we must report an "OUT", as 4 is less
 694 than 8, then also an Ident as 4 is greater than 0.
 695
 696 ###### token types
 697         TK_in,
 698         TK_out,
 699
 700 For the purpose of measuring the length of white space, a tab adds at
 701 least one space, and rounds up to a multiple of 8.
 702
 703 ###### exported functions
 704         static inline int indent_tab(int indent)
 705         {
 706                 return (indent|7)+1;
 707         }
 708
 709 We need to track the current levels of indent.  This requires some
 710 sort of stack as indent levels are pushed on and popped off.  In
 711 practice this stack is unlikely to often exceed 5 so we will used a
 712 fixed stack of 20 indent levels.  More than this will be silently
 713 ignored.
 714
 715 ###### state fields
 716         int     indent_level;
 717         int     indent_sizes[20];
 718
 719 `indent_sizes[0]` will always be zero - this simplifies some code.
 720
 721 #### Newlines
 722
 723 Newlines can optionally be reported.  Newlines within a block comment
 724 or a multi-line string are not reported separately, but each of these
 725 must be followed immediately by a newline so these constructs cannot
 726 hide the fact that a newline was present.
 727
 728 When indents are being reported, the Newline which would normally be
 729 reported immediately before the "IN" is delayed until after the
 730 matching "OUT".  This makes an indented section act like a
 731 continuation of the previous line to some extent.
 732
 733 A blank line would normally be reported simply as two consecutive Newline
 734 tokens.  However if the subsequent line is indented (and indents are being
 735 reported) then the right thing to do is less obvious as Newlines should be
 736 delayed - but how many Newlines?
 737
 738 The approach we will take is to report the extra Newlines immediately after
 739 the IN token, so the blank line is treated as though it were an indented
 740 blank line.
 741
 742 ###### token types
 743         TK_newline,
 744
 745 If we find a newline or white space at the start of a block, we keep
 746 collecting spaces, tabs, and newlines until we find some real text.
 747 Then depending on the indent we generate some number of tokens.  These
 748 will be a sequence of "Newline OUT" pairs representing a decrease
 749 in indent, then either a Newline or an IN depending on whether the
 750 next line is indented, then zero or more Newlines representing all the
 751 blank lines that have been skipped.
 752
 753 When a Newline leads to the next block of code there is a question of
 754 whether the various Newline and OUT/IN tokens should appear to
 755 belong to the earlier or later block.  This is addressed by processing
 756 the tokens in two stages based on the relative indent levels of the
 757 two blocks (each block has a base indent to which the actual indents
 758 are added).
 759
 760 Any "Newline OUT" pairs needed to reduce the current indent to the
 761 maximum of the base indents of the old and new blocks are generated
 762 against the old block.  Then if the next block does not have an
 763 increased indent, one more "Newline" is generated.
 764
 765 If further "Newline OUT" pairs are needed to get to the indent
 766 level of the 'next' block, they are generated against that block,
 767 though the first Newline is suppressed (it having already been
 768 generated).
 769
 770 Finally the Newline or IN for the first line of the new block is
 771 generated, unless the Newline needs to be suppressed because it
 772 appeared at the end of the previous block.
 773
 774 This means that a block may start with an OUT or an IN, but
 775 will only start with a Newline if it actually starts with a blank
 776 line.
 777
 778 We will need to represent in the `token_state` where in this sequence
 779 of delayed tokens we are.  As `state.col` records the target indent we
 780 don't need to record how many OUTs or INs are needed.  We do
 781 need to record the number of blank lines, and which of Newline and
 782 OUT is needed next in the initial sequence of pairs.
 783
 784 For this we store one more than the number of blank lines as
 785 `delayed_lines` and a flag for `out_next`.
 786
 787 ###### state fields
 788         int check_indent;
 789         int delayed_lines;
 790         int out_next;
 791
 792 Generating these tokens involves two separate pieces of code.
 793
 794 Firstly we need to recognise white space and count the indents and
 795 newlines.  These are recorded in the above state fields.
 796
 797 Separately we need, on each call to `token_next`, to check if
 798 there are some delayed tokens and if so we need to advance the state
 799 information and return one token.
 800
 801 ###### internal functions
 802         static int state_indent(struct token_state *state)
 803         {
 804                 if (state->node == NULL)
 805                         return state->col;
 806                 return state->node->indent - state->node->needs_strip + state->col;
 807         }
 808
 809 ###### white space
 810         if (is_newline(ch))
 811                 state_check_node(state);
 812         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 813                 int newlines = 0;
 814                 int was_nl = is_newline(ch);
 815                 if (ignored & (1<<TK_in)) {
 816                         if (!is_newline(ch))
 817                                 continue;
 818                         if (ignored & (1<<TK_newline))
 819                                 continue;
 820                         tk.num = TK_newline;
 821                         close_token(state, &tk);
 822                         return tk;
 823                 }
 824                 // Indents are needed, so check all white space.
 825                 while (ch <= ' ' && ch != WEOF) {
 826                         if (is_newline(ch))
 827                                 newlines += 1;
 828                         ch = get_char(state);
 829                         if (is_newline(ch))
 830                                 state_check_node(state);
 831                 }
 832                 if (ch != WEOF)
 833                         unget_char(state);
 834                 state->delayed_lines = newlines;
 835                 state->out_next = !was_nl;
 836                 state->check_indent = 1;
 837                 continue;
 838         }
 839
 840 ###### delayed tokens
 841
 842         if (state->check_indent || state->delayed_lines) {
 843                 if (state_indent(state) < state->indent_sizes[state->indent_level]) {
 844                         if (!state->out_next &&
 845                             !(ignored & (1<<TK_newline))) {
 846                                 state->out_next = 1;
 847                                 tk.num = TK_newline;
 848                                 return tk;
 849                         }
 850                         state->indent_level -= 1;
 851                         state->out_next = 0;
 852                         tk.num = TK_out;
 853                         return tk;
 854                 }
 855                 if (state_indent(state) > state->indent_sizes[state->indent_level] &&
 856                     state->indent_level < sizeof(state->indent_sizes)-1) {
 857                         state->indent_level += 1;
 858                         state->indent_sizes[state->indent_level] = state_indent(state);
 859                         if (state->delayed_lines)
 860                                 state->delayed_lines -= 1;
 861                         tk.num = TK_in;
 862                         return tk;
 863                 }
 864                 state->check_indent = 0;
 865                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 866                         tk.num = TK_newline;
 867                         state->delayed_lines -= 1;
 868                         return tk;
 869                 }
 870                 state->delayed_lines = 0;
 871                 continue;
 872         }
 873
 874 ### End of File
 875
 876 After the last newline in the file has been processed, a special
 877 end-of-file token will be returned.  any further attempts to get more
 878 tokens will continue to return the same end-of-file token.
 879
 880 ###### token types
 881         TK_eof,
 882
 883 ###### white space
 884         if (ch == WEOF) {
 885                 if (state->col) {
 886                         state->col = 0;
 887                         state->check_indent = 1;
 888                         continue;
 889                 }
 890                 tk.num = TK_eof;
 891                 return tk;
 892         }
 893
 894 ### Unknown Marks, or errors.
 895
 896 We have now handled all the possible known mark-like tokens.
 897 If the token we have is not empty and `TK_mark` is allowed,
 898 we have an unknown mark, otherwise this must be an error.
 899
 900 ###### unknown mark
 901
 902         /* one unknown mark character */
 903         if (tk.txt.len) {
 904                 close_token(state, &tk);
 905                 if (ignored & (1<<TK_mark))
 906                         tk.num = TK_error;
 907                 else
 908                         tk.num = TK_mark;
 909                 return tk;
 910         }
 911         /* Completely unrecognised character is next, possibly
 912          * a digit and we are ignoring numbers.
 913          * What ever it is, make it an error.
 914          */
 915         get_char(state);
 916         close_token(state, &tk);
 917         tk.num = TK_error;
 918         return tk;
 919
 920 ## Tools For The Task
 921
 922 You may have noticed that are few gaps we left in the above -
 923 functions used without first defining them.  Doing so above would have
 924 broken the flow.
 925
 926 ### Character by character
 927
 928 As we walk through the various `code_node`s we need to process whole
 929 Unicode codepoints, and keep track of which line and column we are on.
 930 We will assume for now that any printing character uses one column,
 931 though that is not true in general.
 932
 933 As the text in a `code_node` may include an indent that identifies it as
 934 being code, we need to be careful to strip that.  The `code_node` has
 935 a flag that tells us whether or not we need to strip.
 936
 937 ###### includes
 938         #include <memory.h>
 939
 940 ###### state fields
 941         struct code_node *node;
 942         int    offset;
 943         int    line;
 944         int    col;
 945         int    strip_offset;
 946
 947 ###### internal functions
 948
 949         static void do_strip(struct token_state *state)
 950         {
 951                 int indent = 0;
 952                 if (state->node->needs_strip) {
 953                         int n = 4;
 954                         while (n && state->node->code.txt[state->offset] == ' ') {
 955                                 indent += 1;
 956                                 state->offset += 1;
 957                                 n -= 1;
 958                         }
 959                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
 960                                 indent = indent_tab(indent);
 961                                 state->offset += 1;
 962                                 n -= 4;
 963                         }
 964                 }
 965         }
 966
 967         static void state_check_node(struct token_state *state)
 968         {
 969                 if (!state->node)
 970                         return;
 971                 if (state->node->code.len > state->offset)
 972                         return;
 973
 974                 do
 975                         state->node = state->node->next;
 976                 while (state->node && state->node->code.txt == NULL);
 977                 state->offset = 0;
 978                 state->prev_offset = 0;
 979                 state->strip_offset = 0;
 980                 state->col = 0;
 981                 if (state->node == NULL)
 982                         return;
 983                 state->line = state->node->line_no;
 984                 do_strip(state);
 985                 state->col = state->node->needs_strip;
 986                 state->strip_offset = state->offset;
 987         }
 988
 989         static wint_t get_char(struct token_state *state)
 990         {
 991                 wchar_t next;
 992                 size_t n;
 993                 mbstate_t mbstate;
 994
 995                 state_check_node(state);
 996                 if (state->node == NULL)
 997                         return WEOF;
 998
 999                 ## before get_char
1000
1001                 memset(&mbstate, 0, sizeof(mbstate));
1002
1003                 n = mbrtowc(&next, state->node->code.txt + state->offset,
1004                             state->node->code.len - state->offset,
1005                             &mbstate);
1006                 if (n == -2 || n == 0) {
1007                         /* Not enough bytes - not really possible */
1008                         next = '\n';
1009                         state->offset = state->node->code.len;
1010                 } else if (n == -1) {
1011                         /* error */
1012                         state->offset += 1;
1013                         next = 0x7f; // an illegal character
1014                 } else
1015                         state->offset += n;
1016
1017                 if (next >= ' ') {
1018                         state->col += 1;
1019                 } else if (is_newline(next)) {
1020                         state->line += 1;
1021                         do_strip(state);
1022                         state->col = state->node->needs_strip;
1023                 } else if (next == '\t') {
1024                         state->col = indent_tab(state->col);
1025                 }
1026                 return next;
1027         }
1028
1029 We will sometimes want to "unget" the last character as it needs to be
1030 considered again as part of the next token.  So we need to store a
1031 'previous' version of all metadata.
1032
1033 ###### state fields
1034         int    prev_offset;
1035         int    prev_line;
1036         int    prev_col;
1037
1038 ###### before get_char
1039         state->prev_offset = state->offset;
1040         state->prev_line   = state->line;
1041         state->prev_col    = state->col;
1042
1043 ###### internal functions
1044
1045         static void unget_char(struct token_state *state)
1046         {
1047                 if (state->node) {
1048                         state->offset = state->prev_offset;
1049                         state->line   = state->prev_line;
1050                         state->col    = state->prev_col;
1051                 }
1052         }
1053
1054 We occasionally need a double-unget, particularly for numbers and
1055 block comments.  We don't impose this cost on all scanning, but
1056 require those code sections that need it to call `save_unget_state`
1057 before each `get_char`, and then `restore_unget_state` when a
1058 double-unget is needed.
1059
1060 ###### state fields
1061         int     prev_offset2;
1062         int     prev_line2;
1063         int     prev_col2;
1064
1065 ###### internal functions
1066         static void save_unget_state(struct token_state *state)
1067         {
1068                 state->prev_offset2 = state->prev_offset;
1069                 state->prev_line2 = state->prev_line;
1070                 state->prev_col2 = state->prev_col;
1071         }
1072
1073         static void restore_unget_state(struct token_state *state)
1074         {
1075                 state->prev_offset = state->prev_offset2;
1076                 state->prev_line = state->prev_line2;
1077                 state->prev_col = state->prev_col2;
1078         }
1079
1080 At the start of a token we don't want to be at the end of a code block
1081 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
1082 single character.  This will move into the next non-empty code block
1083 and leave the current pointer at the start of it.
1084
1085 This has to happen _after_ dealing with delayed tokens as some of them
1086 must appear in the previous node.  When we do this, we need to reset
1087 the data in the token.
1088
1089 ###### delayed tokens
1090         if (at_eon(state)) {
1091                 get_char(state);
1092                 unget_char(state);
1093                 tk.node = state->node;
1094                 if (state->node)
1095                         tk.txt.txt = state->node->code.txt + state->offset;
1096                 tk.line = state->line;
1097                 tk.col = state->col;
1098                 tk.txt.len = 0;
1099         }
1100
1101 ### Managing tokens
1102
1103 The current token is initialized to line up with the first character
1104 that we 'get' for each token.  When we have, or might have, a full
1105 token we can call `close_token` to set the `len` of the token
1106 appropriately.  This can safely be called multiple times.
1107
1108 Finally we occasionally (for single-line strings and block comments)
1109 need to reset to the beginning of the current token as we might have
1110 parsed too much already.  For that there is `reset_token`.
1111
1112 ###### one token
1113         tk.node = state->node;
1114         if (state->node)
1115                 tk.txt.txt = state->node->code.txt + state->offset;
1116         tk.line = state->line;
1117         tk.col = state->col;
1118         tk.txt.len = 0;
1119
1120 ###### internal functions
1121
1122         static void close_token(struct token_state *state,
1123                                 struct token *tk)
1124         {
1125                 if (state->node != tk->node)
1126                         tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1127                 else
1128                         tk->txt.len = (state->node->code.txt + state->offset)
1129                                       - tk->txt.txt;
1130         }
1131
1132         static void reset_token(struct token_state *state, struct token *tok)
1133         {
1134                 state->prev_line = tok->line;
1135                 state->prev_col = tok->col;
1136                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1137                 unget_char(state);
1138                 tok->txt.len = 0;
1139         }
1140
1141 Tokens may not cross into the next `code_node`, and some tokens can
1142 include the newline at the and of a `code_node`, we must be able to
1143 easily check if we have reached the end.  Equally we need to know if
1144 we are at the start of a node, as white space is treated a little
1145 differently there.
1146
1147 ###### internal functions
1148
1149         static int at_son(struct token_state *state)
1150         {
1151                 return state->prev_offset <= state->strip_offset;
1152         }
1153
1154         static int at_eon(struct token_state *state)
1155         {
1156                 // at end-of-node ??
1157                 return state->node == NULL ||
1158                        state->offset >= state->node->code.len;
1159         }
1160
1161 ### Find a known word
1162
1163 As the known-word list is sorted we can use a simple binary search.
1164 Following the pattern established in "mdcode", we will use a `struct
1165 text` with start and length to represent the code fragment we are
1166 searching for.
1167
1168 ###### internal functions
1169         static int find_known(struct token_config *conf, struct text txt)
1170         {
1171                 int lo = 0;
1172                 int hi = conf->known_count;
1173
1174                 while (lo + 1 < hi) {
1175                         int mid = (lo + hi) / 2;
1176                         int cmp = strncmp(conf->words_marks[mid],
1177                                           txt.txt, txt.len);
1178                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1179                                 cmp = 1;
1180                         if (cmp <= 0)
1181                                 lo = mid;
1182                         else
1183                                 hi = mid;
1184                 }
1185                 if (strncmp(conf->words_marks[lo],
1186                            txt.txt, txt.len) == 0
1187                     && conf->words_marks[lo][txt.len] == 0)
1188                         return lo;
1189                 else
1190                         return -1;
1191         }
1192
1193 ### Bringing it all together
1194
1195 Now we have all the bits there is just one section missing:  combining
1196 all the token parsing code into one block.
1197
1198 The handling of delayed tokens (Newlines, INs, OUTs) must come
1199 first before we try getting another character.
1200
1201 Then we parse all the test, making sure that we check for known marks
1202 before strings and comments, but unknown marks after strings and comments.
1203
1204 This block of code will either return a token, or will choose to
1205 ignore one, in which case it will `continue` around to the top of the
1206 loop.
1207
1208 ###### one token
1209         ## delayed tokens
1210
1211         ch = get_char(state);
1212
1213         ## white space
1214         ## parse number
1215         ## parse word
1216         ## parse mark
1217
1218 ### Start and stop
1219
1220 As well as getting tokens, we need to be able to create the
1221 `token_state` to start with, and discard it later.
1222
1223 ###### includes
1224         #include <malloc.h>
1225
1226 ###### main functions
1227         struct token_state *token_open(struct code_node *code, struct
1228                                        token_config *conf)
1229         {
1230                 struct token_state *state = malloc(sizeof(*state));
1231                 memset(state, 0, sizeof(*state));
1232                 state->node = code;
1233                 state->line = code->line_no;
1234                 do_strip(state);
1235                 state->col = state->node->needs_strip;
1236                 state->strip_offset = state->offset;
1237                 state->conf = conf;
1238                 return state;
1239         }
1240         void token_close(struct token_state *state)
1241         {
1242                 free(state);
1243         }
1244
1245 ###### exported functions
1246         struct token_state *token_open(struct code_node *code, struct
1247                                        token_config *conf);
1248         void token_close(struct token_state *state);
1249
1250 ### Trace tokens
1251
1252 Getting tokens is the main thing but it is also useful to be able to
1253 print out token information, particularly for tracing and testing.
1254
1255 Known tokens are printed verbatim.  Other tokens are printed as
1256 `type(content)` where content is truncated to a given number of characters.
1257
1258 The function for printing a truncated string (`text_dump`) is also exported
1259 so that it can be used to tracing processed strings too.
1260
1261 ###### includes
1262         #include <stdio.h>
1263
1264 ###### exported functions
1265         void token_trace(FILE *f, struct token tok, int max);
1266         void text_dump(FILE *f, struct text t, int max);
1267
1268 ###### main functions
1269
1270         void text_dump(FILE *f, struct text txt, int max)
1271         {
1272                 int i;
1273                 if (txt.len > max)
1274                         max -= 2;
1275                 else
1276                         max = txt.len;
1277                 for (i = 0; i < max; i++) {
1278                         char c = txt.txt[i];
1279                         if (c < ' ' || c > '~')
1280                                 fprintf(f, "\\x%02x", c & 0xff);
1281                         else if (c == '\\')
1282                                 fprintf(f, "\\\\");
1283                         else
1284                                 fprintf(f, "%c", c);
1285                 }
1286                 if (i < txt.len)
1287                         fprintf(f, "..");
1288         }
1289
1290         void token_trace(FILE *f, struct token tok, int max)
1291         {
1292                 static char *types[] = {
1293                         [TK_ident] = "ident",
1294                         [TK_mark] = "mark",
1295                         [TK_number] = "number",
1296                         [TK_string] = "string",
1297                         [TK_multi_string] = "mstring",
1298                         [TK_line_comment] = "lcomment",
1299                         [TK_block_comment] = "bcomment",
1300                         [TK_in] = "in",
1301                         [TK_out] = "out",
1302                         [TK_newline] = "newline",
1303                         [TK_eof] = "eof",
1304                         [TK_error] = "ERROR",
1305                         };
1306
1307                 switch (tok.num) {
1308                 default: /* known word or mark */
1309                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1310                         break;
1311                 case TK_in:
1312                 case TK_out:
1313                 case TK_newline:
1314                 case TK_eof:
1315                         /* No token text included */
1316                         fprintf(f, "%s()", types[tok.num]);
1317                         break;
1318                 case TK_ident:
1319                 case TK_mark:
1320                 case TK_number:
1321                 case TK_string:
1322                 case TK_multi_string:
1323                 case TK_line_comment:
1324                 case TK_block_comment:
1325                 case TK_error:
1326                         fprintf(f, "%s(", types[tok.num]);
1327                         text_dump(f, tok.txt, max);
1328                         fprintf(f, ")");
1329                         break;
1330                 }
1331         }
1332
1333 ### And there we have it
1334
1335 We now have all the library functions defined for reading and printing
1336 tokens.  Now we just need C files to store them, and a mk file to make them.
1337
1338 ###### File: scanner.h
1339         ## public types
1340         ## exported functions
1341
1342 ###### File: libscanner.c
1343         ## includes
1344         #include "scanner.h"
1345         ## private types
1346         ## internal functions
1347         ## main functions
1348
1349 ###### File: scanner.mk
1350
1351         CFLAGS += -Wall -g
1352         all ::
1353         scanner.mk scanner.h libscanner.c : scanner.mdc
1354                 ./md2c scanner.mdc
1355         all :: libscanner.o
1356         libscanner.o : libscanner.c
1357                 $(CC) $(CFLAGS) -c libscanner.c
1358
1359 ## Processing numbers
1360
1361 Converting a `TK_number` token to a numerical value is a slightly
1362 higher level task than lexical analysis, and slightly lower than
1363 grammar parsing, so put it here - as an appendix if you like.
1364
1365 Importantly it will be used by the same testing rig that is used for
1366 testing the token scanner.
1367
1368 The numeric value that we will convert all numbers into is the `mpq_t`
1369 from the GNU high precision number library "libgmp".
1370
1371 ###### number includes
1372         #include <gmp.h>
1373         #include "mdcode.h"
1374
1375 Firstly we need to be able to parse a string of digits in a given base
1376 and possibly with a decimal marker.  We store this in an `mpz_t`
1377 integer and report the number of digits after the decimal mark.
1378
1379 On error we return zero and ensure that the 'mpz_t' has been freed, or
1380 had never been initialised.
1381
1382 ###### number functions
1383
1384         static int parse_digits(mpz_t num, struct text tok, int base,
1385                                 int *placesp)
1386         {
1387                 /* Accept digits up to 'base', ignore '_' and
1388                  * (for base 10) ' ' if they appear between two
1389                  * legal digits, and if `placesp` is not NULL,
1390                  * allow a single '.' or ',' and report the number
1391                  * of digits beyond there.
1392                  * Return number of characters processed (p),
1393                  * or 0 if something illegal was found.
1394                  */
1395                 int p;
1396                 int decimal = -1; // digits after marker
1397                 enum {Digit, Space, Other} prev = Other;
1398                 int digits = 0;
1399
1400                 for (p = 0; p < tok.len; p++) {
1401                         int dig;
1402                         char c = tok.txt[p];
1403
1404                         if (c == '_' || (c == ' ' && base == 10)) {
1405                                 if (prev != Digit)
1406                                         goto bad;
1407                                 prev = Space;
1408                                 continue;
1409                         }
1410                         if (c == '.' || c == ',') {
1411                                 if (prev != Digit)
1412                                         goto bad;
1413                                 if (!placesp || decimal >= 0)
1414                                         return p-1;
1415                                 decimal = 0;
1416                                 prev = Other;
1417                                 continue;
1418                         }
1419                         if (isdigit(c))
1420                                 dig = c - '0';
1421                         else if (isupper(c))
1422                                 dig = 10 + c - 'A';
1423                         else if (islower(c))
1424                                 dig = 10 + c - 'a';
1425                         else
1426                                 dig = base;
1427                         if (dig >= base) {
1428                                 if (prev == Space)
1429                                         p--;
1430                                 break;
1431                         }
1432                         prev = Digit;
1433                         if (digits)
1434                                 mpz_mul_ui(num, num, base);
1435                         else
1436                                 mpz_init(num);
1437                         digits += 1;
1438                         mpz_add_ui(num, num, dig);
1439                         if (decimal >= 0)
1440                                 decimal++;
1441                 }
1442                 if (digits == 0)
1443                         return 0;
1444                 if (placesp) {
1445                         if (decimal >= 0)
1446                                 *placesp = decimal;
1447                         else
1448                                 *placesp = 0;
1449                 }
1450                 return p;
1451         bad:
1452                 if (digits)
1453                         mpz_clear(num);
1454                 return 0;
1455         }
1456
1457 ###### number includes
1458         #include <ctype.h>
1459
1460 To parse a full number we need to consider the optional base, the
1461 mantissa, and the optional exponent.  We will treat these one at a
1462 time.
1463
1464 The base is indicated by a letter after a leading zero, which must be
1465 followed by a base letter or a period.  The base also determines the
1466 character which will mark an exponent.
1467
1468 ###### number vars
1469         int base = 10;
1470         char expc = 'e';
1471
1472 ###### parse base
1473
1474         if (tok.txt[0] == '0' && tok.len > 1) {
1475                 int skip = 0;
1476                 switch(tok.txt[1]) {
1477                 case 'x':
1478                 case 'X':
1479                         base = 16;
1480                         skip = 2;
1481                         expc = 'p';
1482                         break;
1483                 case 'o':
1484                 case 'O':
1485                         base = 8;
1486                         skip = 2;
1487                         expc = 'p';
1488                         break;
1489                 case 'b':
1490                 case 'B':
1491                         base = 2;
1492                         skip = 2;
1493                         expc = 'p';
1494                         break;
1495                 case '0':
1496                 case '1':
1497                 case '2':
1498                 case '3':
1499                 case '4':
1500                 case '5':
1501                 case '6':
1502                 case '7':
1503                 case '8':
1504                 case '9':
1505                 case '_':
1506                 case ' ':
1507                         // another digit is not permitted
1508                         // after a zero.
1509                         return 0;
1510                 default:
1511                         // must be decimal marker or trailing
1512                         // letter, which are OK;
1513                         break;
1514                 }
1515                 tok.txt += skip;
1516                 tok.len -= skip;
1517         }
1518
1519 After the base is the mantissa, which may contain a decimal mark, so
1520 we need to record the number of places.  We won't impose the number of
1521 places until we have the exponent as well.
1522
1523 ###### number vars
1524         int places = 0;
1525         mpz_t mant;
1526         int d;
1527
1528 ###### parse mantissa
1529
1530         d = parse_digits(mant, tok, base, &places);
1531         if (d == 0)
1532                 return 0;
1533         tok.txt += d;
1534         tok.len -= d;
1535         mpq_init(num);
1536         mpq_set_z(num, mant);
1537         mpz_clear(mant);
1538
1539 After the mantissa number may come an exponent which may be positive
1540 or negative.  We assume at this point that we have seen the exponent
1541 character `expc`.
1542
1543 ###### number vars
1544         long lexp = 0;
1545         mpz_t exp;
1546         int esign = 1;
1547
1548 ###### parse exponent
1549         if (tok.len > 1) {
1550                 if (tok.txt[0] == '+') {
1551                         tok.txt++;
1552                         tok.len--;
1553                 } else if (tok.txt[0] == '-') {
1554                         esign = -1;
1555                         tok.txt++;
1556                         tok.len--;
1557                 }
1558         }
1559         d = parse_digits(exp, tok, 10, NULL);
1560         if (d == 0) {
1561                 mpq_clear(num);
1562                 return 0;
1563         }
1564         if (!mpz_fits_slong_p(exp)) {
1565                 mpq_clear(num);
1566                 mpz_clear(exp);
1567                 return 0;
1568         }
1569         lexp = mpz_get_si(exp) * esign;
1570         mpz_clear(exp);
1571         tok.txt += d;
1572         tok.len -= d;
1573
1574 Now that we have the mantissa and the exponent we can multiply them
1575 together, also allowing for the number of digits after the decimal
1576 mark.
1577
1578 For base 10, we simply subtract the decimal places from the exponent.
1579 For the other bases, as the exponent is alway based on 2, even for
1580 octal and hex, we need a bit more detail.
1581 We then recover the sign from the exponent, as division is quite
1582 different from multiplication.
1583
1584 ###### calc exponent
1585         switch (base) {
1586         case 10:
1587         case 2:
1588                 lexp -= places;
1589                 break;
1590         case 16:
1591                 lexp -= 4*places;
1592                 break;
1593         case 8:
1594                 lexp -= 3*places;
1595                 break;
1596         }
1597         if (lexp < 0) {
1598                 lexp = -lexp;
1599                 esign = -1;
1600         } else
1601                 esign = 1;
1602
1603 Imposing the exponent on the number is also very different for base 10
1604 than for the others.  For the binary shift `gmp` provides a simple
1605 function.  For base 10 we use something like Russian Peasant
1606 Multiplication.
1607
1608 ###### calc exponent
1609         if (expc == 'e') {
1610                 mpq_t tens;
1611                 mpq_init(tens);
1612                 mpq_set_ui(tens, 10, 1);
1613                 while (1) {
1614                         if (lexp & 1) {
1615                                 if (esign > 0)
1616                                         mpq_mul(num, num, tens);
1617                                 else
1618                                         mpq_div(num, num, tens);
1619                         }
1620                         lexp >>= 1;
1621                         if (lexp == 0)
1622                                 break;
1623                         mpq_mul(tens, tens, tens);
1624                 }
1625                 mpq_clear(tens);
1626         } else {
1627                 if (esign > 0)
1628                         mpq_mul_2exp(num, num, lexp);
1629                 else
1630                         mpq_div_2exp(num, num, lexp);
1631         }
1632
1633 Now we are ready to parse a number: the base, mantissa, and exponent.
1634 If all goes well we check for the possible trailing letters and
1635 return.  Return value is 1 for success and 0 for failure.
1636
1637 ###### number functions
1638         int number_parse(mpq_t num, char tail[3], struct text tok)
1639         {
1640                 ## number vars
1641                 int i;
1642
1643                 ## parse base
1644                 ## parse mantissa
1645                 if (tok.len > 1 && (tok.txt[0] == expc ||
1646                                     tok.txt[0] == toupper(expc))) {
1647                         tok.txt++;
1648                         tok.len--;
1649                         ## parse exponent
1650                 }
1651                 ## calc exponent
1652
1653                 for (i = 0; i < 2; i++) {
1654                         if (tok.len <= i)
1655                                 break;
1656                         if (!isalpha(tok.txt[i]))
1657                                 goto err;
1658                         tail[i] = tok.txt[i];
1659                 }
1660                 tail[i] = 0;
1661                 if (i == tok.len)
1662                         return 1;
1663         err:
1664                 mpq_clear(num);
1665                 return 0;
1666         }
1667
1668 Number parsing goes in `libnumber.c`
1669
1670 ###### File: libnumber.c
1671
1672         #include <unistd.h>
1673         #include <stdlib.h>
1674
1675         ## number includes
1676         ## number functions
1677
1678 ###### File: number.h
1679         int number_parse(mpq_t num, char tail[3], struct text tok);
1680
1681 ###### File: scanner.mk
1682         all :: libnumber.o
1683         libnumber.o : libnumber.c
1684                 $(CC) $(CFLAGS) -c libnumber.c
1685
1686 ## Processing strings
1687
1688 Both `TK_string` and `TK_multi_string` require post-processing which
1689 can be one of two types: literal or with escapes processed.
1690 Even literal processing is non-trivial as the file may contain indents
1691 which need to be stripped.
1692
1693 Errors can only occur when processing escapes.  Any unrecognised
1694 character following the escape character will cause an error.
1695
1696 Processing escapes and striping indents can only make the string
1697 shorter, not longer, so we allocate a buffer which is the same size as
1698 the string and process into that.
1699
1700 To request escape processing, we pass the character we want to use for
1701 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1702
1703 ###### string main
1704         int string_parse(struct token *tok, char escape,
1705                          struct text *str, char tail[3])
1706         {
1707                 ## string vars
1708                 struct text t = tok->txt;
1709
1710                 str->txt = NULL;
1711                 ## strip tail
1712                 if (tok->num == TK_string) {
1713                         ## strip single
1714                 } else {
1715                         ## strip multi
1716                 }
1717                 str->txt = malloc(t.len);
1718                 str->len = 0;
1719
1720                 ## process string
1721                 return 1;
1722         err:
1723                 free(str->txt);
1724                 str->txt = NULL;
1725                 return 0;
1726         }
1727
1728 ### strip tail
1729
1730 The tail of the string can be 0, 1, or 2 letters
1731
1732         i = t.len;
1733         if (i >= 0 && isalpha(t.txt[i-1]))
1734                 i -= 1;
1735         if (i >= 0 && isalpha(t.txt[i-1]))
1736                 i -= 1;
1737         strncpy(tail, t.txt+i, t.len-i);
1738         tail[t.len-i] = 0;
1739         t.len = i;
1740
1741 ###### string vars
1742         int i;
1743
1744 ### strip single
1745
1746 Stripping the quote of a single-line string is trivial.
1747 The only part that is at all interesting is that quote character must
1748 be remembered.
1749
1750         quote = t.txt[0];
1751         if (t.txt[t.len-1] != quote)
1752                 goto err;
1753         t.txt += 1;
1754         t.len -= 2;
1755
1756 ###### string vars
1757         char quote;
1758
1759 ### strip multi
1760
1761 For a multi-line string we have a little more work to do.  We need to
1762 remove 3 quotes, not 1, and need to count the indent of the close
1763 quote as it will need to be stripped from all lines.
1764
1765         quote = t.txt[0];
1766         if (t.len < 7 ||
1767             t.txt[1] != quote || t.txt[2] != quote ||
1768             !is_newline(t.txt[3]))
1769                 goto err;
1770         t.txt += 4;
1771         t.len -= 4;
1772         i = t.len;
1773         if (i <= 0 || t.txt[i-1] != quote)
1774                 goto err;
1775         i -= 1;
1776         if (i <= 0 || t.txt[i-1] != quote)
1777                 goto err;
1778         i -= 1;
1779         if (i <= 0 || t.txt[i-1] != quote)
1780                 goto err;
1781         i -= 1;
1782         t.len = i;
1783         while (i > 0 && !is_newline(t.txt[i-1]))
1784                 i--;
1785         indent = 0;
1786         while (i < t.len) {
1787                 if (t.txt[i] == ' ')
1788                         indent += 1;
1789                 if (t.txt[i] == '\t')
1790                         indent = indent_tab(indent);
1791                 i++;
1792         }
1793
1794 ###### string vars
1795         int indent = 0;
1796
1797 ### process string
1798
1799 Now we just take one byte at a time. trans-ASCII unicode won't look
1800 like anything we are interested in so it will just be copied byte by
1801 byte.
1802
1803         cp = str->txt;
1804         at_sol = 1;
1805         for (i = 0; i < t.len; i++) {
1806                 char c;
1807                 if (at_sol) {
1808                         at_sol = 0;
1809                         ## strip indent
1810                         if (i >= t.len)
1811                                 break;
1812                 }
1813                 c = t.txt[i];
1814                 if (c != escape) {
1815                         *cp = c;
1816                         cp += 1;
1817                         if (is_newline(c))
1818                                 at_sol = 1;
1819                 } else if (i+1 >= t.len) {
1820                         // escape and end of string
1821                         goto err;
1822                 } else {
1823                         i += 1;
1824                         c = t.txt[i];
1825                         ## parse escape
1826                 }
1827         }
1828         str->len = cp - str->txt;
1829
1830 ###### string vars
1831         char *cp;
1832         int at_sol;
1833
1834 ### strip indent
1835
1836 Every time we find a start of line, we strip spaces and tabs until the
1837 required indent is found.
1838
1839         int skipped = 0;
1840         while (i < t.len && skipped < indent) {
1841                 c = t.txt[i];
1842                 if (c == ' ')
1843                         skipped += 1;
1844                 else if (c == '\t')
1845                         skipped = indent_tab(skipped);
1846                 else
1847                         break;
1848                 i+= 1;
1849         }
1850
1851 ### parse escape
1852         switch (c) {
1853         case 'n':
1854                 *cp++ = '\n'; break;
1855         case 'r':
1856                 *cp++ = '\r'; break;
1857         case 't':
1858                 *cp++ = '\t'; break;
1859         case 'b':
1860                 *cp++ = '\b'; break;
1861         case 'q':
1862                 *cp++ = quote; break;
1863         case 'f':
1864                 *cp++ = '\f'; break;
1865         case 'v':
1866                 *cp++ = '\v'; break;
1867         case 'a':
1868                 *cp++ = '\a'; break;
1869         case '0':
1870         case '1':
1871         case '2':
1872         case '3':
1873                 // 3 digit octal number
1874                 if (i+2 >= t.len)
1875                         goto err;
1876                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1877                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1878                         goto err;
1879                 n = (t.txt[i  ]-'0') * 64 +
1880                     (t.txt[i+1]-'0') *  8 +
1881                     (t.txt[i+2]-'0') *  1;
1882                 *cp++ = n;
1883                 i += 2;
1884                 break;
1885         case 'x':
1886                 // 2 hex digits
1887                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1888                 if (n < 0)
1889                         goto err;
1890                 *cp++ = n;
1891                 i += 2;
1892                 break;
1893         case 'u':
1894         case 'U':
1895                 // 4 or 8 hex digits for unicode
1896                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1897                 if (n < 0)
1898                         goto err;
1899                 memset(&pstate, 0, sizeof(pstate));
1900                 n = wcrtomb(cp, n, &pstate);
1901                 if (n <= 0)
1902                         goto err;
1903                 cp += n;
1904                 i += c == 'u' ? 4 : 8;
1905                 break;
1906         default:
1907                 if (c == escape)
1908                         *cp++ = c;
1909                 else if (is_newline(c))
1910                         at_sol = 1;
1911                 else
1912                         goto err;
1913         }
1914
1915 ###### string vars
1916         long n;
1917         mbstate_t pstate;
1918
1919 For `\x` `\u` and `\U` we need to collect a specific number of
1920 hexadecimal digits
1921
1922 ###### string functions
1923
1924         static long take_hex(int digits, char *cp, int l)
1925         {
1926                 long n = 0;
1927                 if (l < digits)
1928                         return -1;
1929                 while (digits) {
1930                         char  c = *cp;
1931                         int d;
1932                         if (!isxdigit(c))
1933                                 return -1;
1934                         if (isdigit(c))
1935                                 d = c - '0';
1936                         else if (isupper(c))
1937                                 d = 10 + c - 'A';
1938                         else
1939                                 d = 10 + c - 'a';
1940                         n = n * 16 + d;
1941                         digits--;
1942                         cp++;
1943                 }
1944                 return n;
1945         }
1946
1947 #### File: libstring.c
1948
1949 String parsing goes in `libstring.c`
1950
1951         #include <unistd.h>
1952         #include <stdlib.h>
1953         #include <stdio.h>
1954         #include <string.h>
1955         #include <ctype.h>
1956         #include <wchar.h>
1957         #include "mdcode.h"
1958         #include "scanner.h"
1959         ## string functions
1960         ## string main
1961
1962 ###### File: string.h
1963         int string_parse(struct token *tok, char escape,
1964                          struct text *str, char tail[3]);
1965
1966 ###### File: scanner.mk
1967         all :: libstring.o
1968         libstring.o : libstring.c
1969                 $(CC) $(CFLAGS) -c libstring.c
1970
1971 ## Testing
1972
1973 As "untested code is buggy code" we need a program to easily test
1974 the scanner library.  This will simply parse a given file and report
1975 the tokens one per line.
1976
1977 ###### File: scanner.c
1978
1979         #include <unistd.h>
1980         #include <stdlib.h>
1981         #include <fcntl.h>
1982         #include <errno.h>
1983         #include <sys/mman.h>
1984         #include <string.h>
1985         #include <stdio.h>
1986         #include <gmp.h>
1987         #include <locale.h>
1988         #include <getopt.h>
1989         #include "mdcode.h"
1990         #include "scanner.h"
1991         #include "number.h"
1992         #include "string.h"
1993
1994         static int errs;
1995         static void pr_err(char *msg)
1996         {
1997                 errs++;
1998                 fprintf(stderr, "%s\n", msg);
1999         }
2000
2001         static int kcmp(const void *ap, const void *bp)
2002         {
2003                 char * const *a = ap;
2004                 char * const *b = bp;
2005                 return strcmp(*a, *b);
2006         }
2007
2008         int main(int argc, char *argv[])
2009         {
2010                 int fd;
2011                 int len;
2012                 char *file;
2013                 char *filename = NULL;
2014                 struct token_state *state;
2015                 const char *known[] = {
2016                         "==",
2017                         "else",
2018                         "if",
2019                         "then",
2020                         "while",
2021                         "{",
2022                         "}",
2023                 };
2024                 struct token_config conf = {
2025                         .word_start = "_$",
2026                         .word_cont = "",
2027                         .words_marks = known,
2028                         .number_chars = "., _+-",
2029                         .known_count = sizeof(known)/sizeof(known[0]),
2030                         .ignored = 0,
2031                 };
2032                 static const struct option long_options[] = {
2033                         { "word-start",         1, NULL, 'W'},
2034                         { "word-cont",          1, NULL, 'w'},
2035                         { "number-chars",       1, NULL, 'n'},
2036                         { "ignore-numbers",     0, NULL, 'N'},
2037                         { "ignore-ident",       0, NULL, 'I'},
2038                         { "ignore-marks",       0, NULL, 'M'},
2039                         { "ignore-strings",     0, NULL, 'S'},
2040                         { "ignore-multi-strings",0, NULL, 'z'},
2041                         { "ignore-line-comment",0, NULL, 'c'},
2042                         { "ignore-newline",     0, NULL, 'l'},
2043                         { "ignore-block-comment", 0, NULL, 'C'},
2044                         { "ignore-indent",      0, NULL, 'i'},
2045                         { "file",               1, NULL, 'f'},
2046                         { "section",            1, NULL, 's'},
2047                         { NULL,                 0, NULL, 0},
2048                 };
2049                 static const char options[] = "W:w:n:NIMSzclCif:s:";
2050
2051                 struct section *table, *s, *prev;
2052                 int opt;
2053                 char *section_name = NULL;
2054                 int section_found = 0;
2055
2056                 setlocale(LC_ALL,"");
2057                 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2058                        != -1) {
2059                         switch(opt) {
2060                         case 'W': conf.word_start = optarg; break;
2061                         case 'w': conf.word_cont = optarg; break;
2062                         case 'n': conf.number_chars = optarg; break;
2063                         case 'N': conf.ignored |= 1 << TK_number; break;
2064                         case 'I': conf.ignored |= 1 << TK_ident; break;
2065                         case 'M': conf.ignored |= 1 << TK_mark; break;
2066                         case 'S': conf.ignored |= 1 << TK_string; break;
2067                         case 'z': conf.ignored |= 1 << TK_multi_string; break;
2068                         case 'c': conf.ignored |= 1 << TK_line_comment; break;
2069                         case 'C': conf.ignored |= 1 << TK_block_comment; break;
2070                         case 'l': conf.ignored |= 1 << TK_newline; break;
2071                         case 'i': conf.ignored |= 1 << TK_in; break;
2072                         case 'f': filename = optarg; break;
2073                         case 's': section_name = optarg; break;
2074                         default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2075                                          opt);
2076                                 exit(1);
2077                         }
2078                 }
2079
2080                 if (optind < argc) {
2081                         const char **wm = calloc(argc - optind, sizeof(char*));
2082                         int i;
2083                         for (i = optind; i < argc; i++)
2084                                 wm[i - optind] = argv[i];
2085                         qsort(wm, argc-optind, sizeof(char*), kcmp);
2086                         conf.words_marks = wm;
2087                         conf.known_count = argc - optind;
2088                 }
2089
2090                 if (filename)
2091                         fd = open(filename, O_RDONLY);
2092                 else
2093                         fd = 0;
2094                 if (fd < 0) {
2095                         fprintf(stderr, "scanner: cannot open %s: %s\n",
2096                                 filename, strerror(errno));
2097                         exit(1);
2098                 }
2099                 len = lseek(fd, 0, 2);
2100                 if (len <= 0) {
2101                         fprintf(stderr,"scanner: %s is empty or not seekable\n",
2102                                 filename ?: "stdin");
2103                         exit(1);
2104                 }
2105                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2106                 table = code_extract(file, file+len, pr_err);
2107
2108                 for (s = table; s;
2109                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
2110                         if (section_name &&
2111                             (s->section.len != strlen(section_name) ||
2112                              strncmp(s->section.txt, section_name, s->section.len) != 0))
2113                                 continue;
2114                         if (section_name)
2115                                 section_found = 1;
2116                         printf("Tokenizing: %.*s\n", s->section.len,
2117                                 s->section.txt);
2118                         state = token_open(s->code, &conf);
2119                         while(1) {
2120                                 struct token tk = token_next(state);
2121                                 printf("%d:%d ", tk.line, tk.col);
2122                                 token_trace(stdout, tk, 20);
2123                                 if (tk.num == TK_number) {
2124                                         mpq_t num;
2125                                         char tail[3];
2126                                         if (number_parse(num, tail,tk.txt)) {
2127                                                 printf(" %s ", tail);
2128                                                 mpq_out_str(stdout, 10, num);
2129                                                 mpq_clear(num);
2130                                         } else
2131                                                 printf(" BAD NUMBER");
2132                                 }
2133                                 if (tk.num == TK_string ||
2134                                     tk.num == TK_multi_string) {
2135                                         char esc = '\\';
2136                                         struct text str;
2137                                         char tail[3];
2138                                         if (tk.txt.txt[0] == '`')
2139                                                 esc = 0;
2140                                         if (string_parse(&tk, esc,
2141                                                          &str, tail)) {
2142                                                 printf(" %s ", tail);
2143                                                 text_dump(stdout, str, 20);
2144                                                 free(str.txt);
2145                                         } else
2146                                                 printf(" BAD STRING");
2147                                 }
2148                                 printf("\n");
2149                                 if (tk.num == TK_error)
2150                                         errs = 1;
2151                                 if (tk.num == TK_eof)
2152                                         break;
2153                         }
2154                         token_close(state);
2155                 }
2156                 if (conf.words_marks != known)
2157                         free(conf.words_marks);
2158                 if (section_name && !section_found) {
2159                         fprintf(stderr, "scanner: section %s not found\n", section_name);
2160                         errs = 1;
2161                 }
2162                 exit(!!errs);
2163         }
2164 ###### File: scanner.mk
2165         scanner.c : scanner.mdc
2166                 ./md2c scanner.mdc
2167         all :: scanner
2168         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2169                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2170                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2171         scanner.o : scanner.c
2172                 $(CC) $(CFLAGS) -c scanner.c