ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters are assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;
  37                 struct code_node *node;
  38                 struct text       txt;
  39                 int               line, col;
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguish via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88
  89 The different tokens are numbers, words, marks, strings, comments,
  90 newlines, EOF, and indents, each of which is examined in detail below.
  91
  92 There are various cases where no token can be found in part of the
  93 input.  All of these will be reported as an `TK_error` token.
  94
  95 It is possible to declare a number of strings which form distinct
  96 tokens (rather than being grouped as e.g. 'word').  These are given
  97 token numbers from `TK_reserved` upwards.
  98
  99 ###### public types
 100         enum token_num {
 101                 TK_error,
 102                 ## token types
 103                 TK_reserved
 104         };
 105
 106 ### Numbers
 107
 108 Numbers are the messiest tokens to parse, primarily because they can
 109 contain characters that also have meaning outside of number and,
 110 particularly, immediately after numbers.
 111
 112 The obvious example is the '`-`' sign.  It can come inside a number for
 113 a negative exponent, or after a number as a subtraction operator.  To
 114 be sure we have parsed as best as possible we need to only allow the
 115 '`-`' inside a number if it is after an exponent character.  This can be
 116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 117 digit, so we don't allow '`-`' after just any `e`.
 118
 119 To make matters worse, our language designer has decided to experiment
 120 with allowing commas to be used as the decimal indicator, and spaces
 121 to be used to separate groups of digits in large numbers.  Both of
 122 these can reasonably be restricted to appear between two digits, so we
 123 have to add that condition to our tests.
 124
 125 So we cannot just treat numbers as starting with a digit and being
 126 followed by some set of characters.  We need more structure than that.
 127
 128 So:
 129
 130 - Numbers must start with a digit.
 131 - If the first digit is zero, the next character must be a base
 132   signifier (one of `xob`) or a decimal marker (`.` or `,`).
 133   In the first case the first `p` or `P` may be followed by a sign.
 134 - If the number doesn't start with `0` followed by one of `xob`, the
 135   first `e` may be followed by a sign.
 136 - Any digit or hex digit may be followed by a space or underscore
 137   providing that the subsequence character is also a (hex) digit.
 138   This rule will require an extra level of 'unget' to be
 139   supported when handling characters.
 140 - Otherwise any digits or ASCII letters are allowed.  We do not at
 141   this point check that the digits given are permitted by the base.
 142   That will happen when the token is converted to a number.
 143
 144 To allow easy configuration, the various non alphanumeric characters
 145 are only permitted if they are listed in a configuration parameter.
 146
 147 ###### token config parameters
 148         char *number_chars;
 149
 150 Note that numbers may not start with a period, so `.75` is not a
 151 number.  This is not the norm, but is not unheard of.  Excluding these
 152 numbers simplifies the rule at very little cost.
 153
 154 ###### token types
 155         TK_number,
 156
 157 If TK_number is ignored, digits will result in an error unless they
 158 are declared to be a start character for words.
 159
 160 ###### includes
 161
 162         #include <string.h>
 163
 164 ###### parse number
 165
 166         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 167                 int prev_special = 0;
 168                 int expect_p = 0;
 169                 int decimal_mark = 0;
 170                 if (ch == '0') {
 171                         wchar_t ch2 = get_char(state);
 172                         if (strchr("xobXOB", ch2) != NULL)
 173                                 expect_p = 1;
 174                         unget_char(state);
 175                 }
 176                 while (1) {
 177                         int sign_ok = 0;
 178                         switch(expect_p) {
 179                         case 0:
 180                                 if (ch == 'e' || ch == 'E')
 181                                         sign_ok = 1;
 182                                 break;
 183                         case 1:
 184                                 if (ch == 'p' || ch == 'P')
 185                                         sign_ok = 1;
 186                                 break;
 187                         }
 188                         save_unget_state(state);
 189                         ch = get_char(state);
 190                         if (iswalnum(ch)) {
 191                                 prev_special = 0;
 192                                 continue;
 193                         }
 194                         if (ch == '+' || ch == '-') {
 195                                 if (!sign_ok)
 196                                         break;
 197                                 expect_p = -1;
 198                         }
 199                         if (ch == '.' || ch == ',') {
 200                                 if (decimal_mark)
 201                                         break;
 202                                 decimal_mark = 1;
 203                         }
 204                         if (prev_special) {
 205                                 /* Don't allow that special char,
 206                                  * need two 'ungets'
 207                                  */
 208                                 restore_unget_state(state);
 209                                 break;
 210                         }
 211                         if (strchr(state->conf->number_chars, ch)) {
 212                                 prev_special = 1;
 213                                 continue;
 214                         }
 215                         /* non-number char */
 216                         break;
 217                 }
 218                 /* We seem to have a "number" token */
 219                 unget_char(state);
 220                 close_token(state, &tk);
 221                 tk.num = TK_number;
 222                 return tk;
 223         }
 224
 225 ### Words
 226 Words start with a "start" character followed by the longest
 227 sequence of "continue" characters.  The Unicode ID_START and
 228 ID_CONTINUE sets are always permitted, but other ASCII characters
 229 can be added to these sets.
 230
 231 ###### token config parameters
 232         char *word_start;
 233         char *word_cont;
 234
 235 ###### internal functions
 236         static int is_word_start(wchar_t ch, struct token_config *conf)
 237         {
 238                 return iswalpha(ch) ||
 239                        strchr(conf->word_start, ch) != NULL ||
 240                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 241         }
 242
 243         static int is_word_continue(wchar_t ch, struct token_config *conf)
 244         {
 245                 return iswalnum(ch) ||
 246                        strchr(conf->word_cont, ch) != NULL ||
 247                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 248         }
 249
 250 Words can be either known or unknown.  Known words are referred to as
 251 "reserved words" and get a unique token number.  Unknown words are
 252 "identifiers" and are syntactically a single token.
 253
 254 ###### token types
 255         TK_ident,
 256
 257 A list of known words must be provided.  This list is shared with the
 258 "marks" which are described next.  The list must be lexically sorted
 259 and the length of the list must be given (`known_count`).
 260 Tokens matching these known words are reported as the index of the
 261 list added to `TK_reserved`.
 262
 263 ###### token config parameters
 264         const char **words_marks;
 265         int known_count;
 266
 267 ###### parse word
 268
 269         if (is_word_start(ch, state->conf)) {
 270                 int n;
 271                 /* A word: identifier or reserved */
 272                 do
 273                         ch = get_char(state);
 274                 while (is_word_continue(ch, state->conf));
 275                 unget_char(state);
 276                 close_token(state, &tk);
 277                 tk.num = TK_ident;
 278                 if (ignored & (1<<TK_ident))
 279                         tk.num = TK_error;
 280                 n = find_known(state->conf, tk.txt);
 281                 if (n >= 0)
 282                         tk.num = TK_reserved + n;
 283                 return tk;
 284         }
 285
 286 ### Marks
 287
 288 Marks are generally one or more punctuation marks joined together.  It
 289 would be nice to use the term "symbol" for these, but that causes
 290 confusion in a subsequent discussion of the grammar, which has terminal
 291 symbols and non-terminal symbols which are conceptually quite
 292 different.  So strings of punctuation characters will be marks.
 293
 294 A "mark" consists of ASCII characters that are not white space, are not
 295 "start" characters for words, and are not digits.
 296 These will collectively be called mark characters.
 297
 298 ###### internal functions
 299         static int is_mark(wchar_t ch, struct token_config *conf)
 300         {
 301                 return ch > ' ' &&
 302                        ch < 0x7f &&
 303                        !iswalnum(ch) &&
 304                        strchr(conf->word_start, ch) == NULL;
 305         }
 306
 307 As with words, there can be known and unknown marks, though the rules
 308 are slightly different.
 309
 310 Two marks do not need to be separated by a non-mark characters.  This
 311 is different from words which do need to be separated by at least one
 312 non-continue character.
 313
 314 The scanner will normally prefer longer sequences of mark characters,
 315 but will more strongly prefer known marks over unknown marks.  So if
 316 it finds a known mark where adding one more character does not result
 317 in a known mark, it will return that first known mark.
 318
 319 If no known mark is found we will test against strings and comments
 320 below before giving up and assuming an unknown mark.
 321
 322 If an unknown mark contains a quote character or a comment marker, and
 323 that token is not being ignored, then we terminate the unknown mark
 324 before that quote or comment.  This ensure that an unknown mark
 325 immediately before a string is handled correctly.
 326
 327 If `TK_mark` is ignored, then unknown marks as returned as an error.
 328
 329 ###### token types
 330         TK_mark,
 331
 332 Known marks are included in the same list as the list of known words.
 333
 334 ###### parse mark
 335         tk.num = TK_error;
 336         while (is_mark(ch, state->conf)) {
 337                 int n;
 338                 wchar_t prev;
 339                 close_token(state, &tk);
 340                 n = find_known(state->conf, tk.txt);
 341                 if (n >= 0)
 342                         tk.num = TK_reserved + n;
 343                 else if (tk.num != TK_error) {
 344                         /* found a longest-known-mark */
 345                         unget_char(state);
 346                         close_token(state, &tk);
 347                         return tk;
 348                 }
 349                 prev = ch;
 350                 save_unget_state(state);
 351                 ch = get_char(state);
 352                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
 353                         break;
 354                 if (prev == '#')
 355                         break;
 356                 if (prev == '/' && ch == '/' && tk.txt.len > 1) {
 357                         restore_unget_state(state);
 358                         break;
 359                 }
 360                 if (prev == '/' && ch == '*' && tk.txt.len > 1) {
 361                         restore_unget_state(state);
 362                         break;
 363                 }
 364         }
 365         unget_char(state);
 366         if (tk.num != TK_error) {
 367                 close_token(state, &tk);
 368                 return tk;
 369         }
 370
 371 If we don't find a known mark, we will check for strings and comments
 372 before assuming that we have an unknown mark
 373
 374 ###### parse mark
 375         ## parse string
 376         ## parse comment
 377         ## unknown mark
 378
 379 ###### unknown mark
 380         if (tk.txt.len) {
 381                 if (ignored & (1<<TK_mark))
 382                         tk.num = TK_error;
 383                 else
 384                         tk.num = TK_mark;
 385                 return tk;
 386         }
 387
 388 ### Strings
 389
 390 Strings start with one of single quote, double quote, or back quote
 391 and continue until a matching character on the same line.  Any of
 392 these characters can be included in the list of known marks and then
 393 they will not be used for identifying strings.
 394
 395 Immediately following the close quote one or two ASCII letters may
 396 appear.  These are somewhat like the arbitrary letters allowed in
 397 "Numbers" above.  They can be used by the language in various ways.
 398
 399 If 3 identical quote characters appear in a row and are
 400 followed by a newline, then this forms a multi-line string which
 401 continues until an identical triple quote appears on a line preceded
 402 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 403
 404 Multi-line strings may not extend beyond the end of the `code_node` in
 405 which they start.
 406
 407 Normal strings and multi-line strings are encoded as two different
 408 token types.
 409
 410 ###### token types
 411         TK_string,
 412         TK_multi_string,
 413
 414 ###### internal functions
 415         static int is_quote(wchar_t ch)
 416         {
 417                 return ch == '\'' || ch == '"' || ch == '`';
 418         }
 419
 420 #### Multi-line strings
 421
 422 The multi-line string is checked for first.  If they are being
 423 ignored, we fall through and treat a triple quote as an empty string
 424 followed by the start of a new string.
 425
 426 ###### parse string
 427         if (tk.txt.len == 3 &&
 428             !(ignored & (1 << TK_multi_string)) &&
 429             is_quote(tk.txt.txt[0]) &&
 430             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 431             is_newline(tk.txt.txt[3])) {
 432                 // triple quote
 433                 wchar_t first = tk.txt.txt[0];
 434                 int qseen = 0;
 435                 int at_sol = 1;
 436                 while (!at_eon(state) && qseen < 3) {
 437                         ch = get_char(state);
 438                         if (is_newline(ch)) {
 439                                 at_sol = 1;
 440                                 qseen = 0;
 441                         } else if (at_sol && ch == first) {
 442                                 qseen += 1;
 443                         } else if (ch != ' ' && ch != '\t') {
 444                                 at_sol = 0;
 445                                 qseen = 0;
 446                         }
 447                 }
 448                 if (qseen != 3) {
 449                         /* Hit end of node - error.
 450                          * unget so the newline is seen,
 451                          * but return rest of string as an error.
 452                          */
 453                         if (is_newline(ch))
 454                                 unget_char(state);
 455                         close_token(state, &tk);
 456                         tk.num = TK_error;
 457                         return tk;
 458                 }
 459                 /* 2 letters are allowed */
 460                 ch = get_char(state);
 461                 if (iswalpha(ch))
 462                         ch = get_char(state);
 463                 if (iswalpha(ch))
 464                         ch = get_char(state);
 465                 /* Now we must have a newline, but we don't return it
 466                  * whatever it is.*/
 467                 unget_char(state);
 468                 close_token(state, &tk);
 469                 tk.num = TK_multi_string;
 470                 if (!is_newline(ch))
 471                         tk.num = TK_error;
 472                 return tk;
 473         }
 474
 475 #### Single-line strings
 476
 477 The sequence of marks collected may be more than a single-line
 478 string, so we reset to the start and collect characters until
 479 we find a close quote or a newline.
 480
 481 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 482
 483 ###### parse string
 484         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 485             !(ignored & (1<<TK_string))) {
 486                 wchar_t first = tk.txt.txt[0];
 487                 reset_token(state, &tk);
 488                 ch = get_char(state);
 489                 tk.num = TK_error;
 490                 while (!at_eon(state) && !is_newline(ch)) {
 491                         ch = get_char(state);
 492                         if (ch == first) {
 493                                 tk.num = TK_string;
 494                                 break;
 495                         }
 496                         if (is_newline(ch)) {
 497                                 unget_char(state);
 498                                 break;
 499                         }
 500                 }
 501                 close_token(state, &tk);
 502                 return tk;
 503         }
 504
 505 ### Comments
 506
 507 Single line comments may start with '`//`' or '`#`' providing that these
 508 are not known marks.  They continue to the end of the line.
 509
 510 Block comments start with '`/*`' if this is not a known mark.  They
 511 continue to the first occurrence of '`*/`' and may not contain any
 512 occurrence of '`/*`'.
 513
 514 Block comments can be wholly within one line or can continue over
 515 multiple lines.  The multi-line version should be followed immediately
 516 by a newline.  The Linux kernel contains over 285000 multi-line
 517 comments are only 34 are followed by characters other than white space
 518 (which should be removed) or a backslash (only needed in macros).  So
 519 it would not suffer from this rule.
 520
 521 These two comment types are reported as two separate token types, and
 522 consequently can be ignored separately.  When ignored a comment is
 523 still parsed, but is discarded.
 524
 525 ###### token types
 526         TK_line_comment,
 527         TK_block_comment,
 528
 529 ###### internal functions
 530         static int is_line_comment(struct text txt)
 531         {
 532                 return (txt.len >= 1 && txt.txt[0] == '#') ||
 533                        (txt.len >= 2 && txt.txt[0] == '/' &&
 534                                         txt.txt[1] == '/');
 535         }
 536
 537         static int is_block_comment(struct text txt)
 538         {
 539                 return txt.len >= 2 && txt.txt[0] == '/' &&
 540                        txt.txt[1] == '*';
 541         }
 542
 543 #### Single line comments
 544
 545 A single-line comment continues up to, but not including the newline
 546 or end of node.
 547
 548 ###### parse comment
 549
 550         if (is_line_comment(tk.txt)) {
 551                 while (!is_newline(ch) && !at_eon(state))
 552                         ch = get_char(state);
 553                 if (is_newline(ch))
 554                         unget_char(state);
 555                 close_token(state, &tk);
 556                 tk.num = TK_line_comment;
 557                 if (ignored & (1 << TK_line_comment))
 558                         continue;
 559                 return tk;
 560         }
 561
 562 #### Block comments
 563
 564 The token text collected so far could exceed the comment, so we need
 565 to reset it first.
 566
 567 If we find an embedded `/*` we reset to just before the '/' and report
 568 an error.  That way the next thing to be parsed will be the rest of
 569 the comment.  This requires a double unget, so we need to save/restore
 570 the unget state (explained later).
 571
 572 ###### parse comment
 573
 574         if (is_block_comment(tk.txt)) {
 575                 wchar_t prev;
 576                 int newlines = 0;
 577                 reset_token(state, &tk);
 578                 get_char(state);
 579                 get_char(state);
 580                 save_unget_state(state);
 581                 ch = get_char(state);
 582                 prev = 0;
 583                 while (!at_eon(state) &&
 584                        (prev != '/' || ch != '*') &&
 585                        (prev != '*' || ch != '/')) {
 586                         if (is_newline(ch))
 587                                 newlines = 1;
 588                         prev = ch;
 589                         save_unget_state(state);
 590                         ch = get_char(state);
 591                 }
 592                 close_token(state, &tk);
 593                 if (at_eon(state)) {
 594                         tk.num = TK_error;
 595                         return tk;
 596                 }
 597                 if (prev == '/') {
 598                         /* embedded.  Need to unget twice! */
 599                         restore_unget_state(state);
 600                         unget_char(state);
 601                         tk.num = TK_error;
 602                         return tk;
 603                 }
 604                 tk.num = TK_block_comment;
 605                 if (newlines && !(ignored & (1<<TK_newline))) {
 606                         /* next char must be newline */
 607                         ch = get_char(state);
 608                         unget_char(state);
 609                         if (!is_newline(ch))
 610                                 tk.num = TK_error;
 611                 }
 612                 if (tk.num == TK_error ||
 613                     !(ignored & (1 << TK_block_comment)))
 614                         return tk;
 615                 continue;
 616         }
 617
 618 ### Indents, Newlines, and White Space.
 619
 620 Normally white space is ignored.  However newlines can be important as
 621 can indents, which are either after a newline or at the start of a
 622 node (detected by `at_son()`);
 623
 624 ###### exported functions
 625         static inline int is_newline(wchar_t ch)
 626         {
 627                 return ch == '\n' || ch == '\f' || ch == '\v';
 628         }
 629
 630 ###### white space
 631         if (ch <= ' ' && !is_newline(ch)
 632             && ! at_son(state))
 633                 continue;
 634
 635 If a line starts with more white-space than the previous non-blank
 636 line - or if the first non-blank line in the document starts with any
 637 white-space - then an "IN" is reported at the start of the line.
 638
 639 Before the next non-blank line which starts with less white space, or
 640 at the latest at the end of the document, a matching "OUT" token
 641 is reported.  There will always be an exact match between "IN" and
 642 "OUT" tokens.
 643
 644 It is possible for "OUT" to be followed (almost) immediately by an
 645 "IN".  This happens if, for example, the indent of three consecutive
 646 lines are 0, 8, 4 spaces.  Before the second line we report an
 647 "IN".  Before the third line we must report an "OUT", as 4 is less
 648 than 8, then also an Ident as 4 is greater than 0.
 649
 650 ###### token types
 651         TK_in,
 652         TK_out,
 653
 654 For the purpose of measuring the length of white space, a tab adds at
 655 least one space, and rounds up to a multiple of 8.
 656
 657 ###### exported functions
 658         static inline int indent_tab(int indent)
 659         {
 660                 return (indent|7)+1;
 661         }
 662
 663 We need to track the current levels of indent.  This requires some
 664 sort of stack as indent levels are pushed on and popped off.  In
 665 practice this stack is unlikely to often exceed 5 so we will used a
 666 fixed stack of 20 indent levels.  More than this will be silently
 667 ignored.
 668
 669 ###### state fields
 670         int     indent_level;
 671         int     indent_sizes[20];
 672
 673 #### Newlines
 674
 675 Newlines can optionally be reported.  Newlines within a block comment
 676 or a multi-line string are not reported separately, but each of these
 677 must be followed immediately by a newline so these constructs cannot
 678 hide the fact that a newline was present.
 679
 680 When indents are being reported, the Newline which would normally be
 681 reported immediately before the "IN" is delayed until after the
 682 matching "OUT".  This makes an indented section act like a
 683 continuation of the previous line to some extent.
 684
 685 A blank line would normally be reported simply as two consecutive Newline
 686 tokens.  However if the subsequent line is indented (and indents are being
 687 reported) then the right thing to do is less obvious as Newlines should be
 688 delayed - but how many Newlines?
 689
 690 The approach we will take is to report the extra Newlines immediately after
 691 the IN token, so the blank line is treated as though it were an indented
 692 blank line.
 693
 694 ###### token types
 695         TK_newline,
 696
 697 If we find a newline or white space at the start of a block, we keep
 698 collecting spaces, tabs, and newlines until we find some real text.
 699 Then depending on the indent we generate some number of tokens.  These
 700 will be a sequence of "Newline OUT" pairs representing a decrease
 701 in indent, then either a Newline or an IN depending on whether the
 702 next line is indented, then zero or more Newlines representing all the
 703 blank lines that have been skipped.
 704
 705 When a Newline leads to the next block of code there is a question of
 706 whether the various Newline and OUT/IN tokens should appear to
 707 pbelong to the earlier or later block.  This is addressed by processing
 708 the tokens in two stages based on the relative indent levels of the
 709 two blocks (each block has a base indent to which the actual indents
 710 are added).
 711
 712 Any "Newline OUT" pairs needed to reduce the current indent to the
 713 maximum of the base indents of the old and new blocks are generated
 714 against the old block.  Then if the next block does not have an
 715 increased indent, one more "Newline" is generated.
 716
 717 If further "Newline OUT" pairs are needed to get to the indent
 718 level of the 'next' block, they are generated against that block,
 719 though the first Newline is suppressed (it having already been
 720 generated).
 721
 722 Finally the Newline or IN for the first line of the new block is
 723 generated, unless the Newline needs to be suppressed because it
 724 appeared at the end of the previous block.
 725
 726 This means that a block may start with an OUT or an IN, but
 727 will only start with a Newline if it actually starts with a blank
 728 line.
 729
 730 We will need to represent in the `token_state` where in this sequence
 731 of delayed tokens we are.  As `state.col` records the target indent we
 732 don't need to record how many OUTs or INs are needed.  We do
 733 need to record the number of blank lines, and which of Newline and
 734 OUT is needed next in the initial sequence of pairs.
 735
 736 For this we store one more than the number of blank lines as
 737 `delayed_lines` and a flag for `out_next`.
 738
 739 ###### state fields
 740         int check_indent;
 741         int delayed_lines;
 742         int out_next;
 743
 744 Generating these tokens involve two separate pieces of code.
 745
 746 Firstly we need to recognise white space and count the indents and
 747 newlines.  These are recorded in the above state fields.
 748
 749 Separately we need, on each call to `token_next`, we need to check if
 750 there are some delayed tokens and if so we need to advance the state
 751 information and return one token.
 752
 753 ###### white space
 754         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 755                 int newlines = 0;
 756                 int was_son = at_son(state);
 757                 if (ignored & (1<<TK_in)) {
 758                         if (!is_newline(ch))
 759                                 continue;
 760                         if (ignored & (1<<TK_newline))
 761                                 continue;
 762                         tk.num = TK_newline;
 763                         close_token(state, &tk);
 764                         return tk;
 765                 }
 766                 // Indents are needed, so check all white space.
 767                 while (ch <= ' ' && !at_eon(state)) {
 768                         if (is_newline(ch))
 769                                 newlines += 1;
 770                         ch = get_char(state);
 771                 }
 772                 if (at_eon(state)) {
 773                         newlines += 1;
 774                         if (state->node->next &&
 775                             state->node->next->indent > state->node->indent)
 776                                 state->col = state->node->next->indent;
 777                         else
 778                                 state->col = state->node->indent;
 779                 } else
 780                         unget_char(state);
 781                 state->delayed_lines = newlines;
 782                 state->out_next = was_son;
 783                 state->check_indent = 1;
 784                 continue;
 785         }
 786
 787
 788 ###### delayed tokens
 789
 790         if (state->check_indent || state->delayed_lines) {
 791                 if (state->col < state->indent_sizes[state->indent_level]) {
 792                         if (!state->out_next &&
 793                             !(ignored & (1<<TK_newline))) {
 794                                 state->out_next = 1;
 795                                 tk.num = TK_newline;
 796                                 return tk;
 797                         }
 798                         state->indent_level -= 1;
 799                         state->out_next = 0;
 800                         tk.num = TK_out;
 801                         return tk;
 802                 }
 803                 if (state->col > state->indent_sizes[state->indent_level] &&
 804                     state->indent_level < sizeof(state->indent_sizes)-1) {
 805                         state->indent_level += 1;
 806                         state->indent_sizes[state->indent_level] = state->col;
 807                         state->delayed_lines -= 1;
 808                         tk.num = TK_in;
 809                         return tk;
 810                 }
 811                 state->check_indent = 0;
 812                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 813                         tk.num = TK_newline;
 814                         state->delayed_lines -= 1;
 815                         return tk;
 816                 }
 817                 state->delayed_lines = 0;
 818                 continue;
 819         }
 820
 821 ### End of File
 822
 823 After the last newline in the file has been processed, a special
 824 end-of-file token will be returned.  any further attempts to get more
 825 tokens will continue to return the same end-of-file token.
 826
 827 ###### token types
 828         TK_eof,
 829
 830
 831 ###### white space
 832         if (ch == WEOF) {
 833                 tk.num = TK_eof;
 834                 return tk;
 835         }
 836
 837 ### Unknown Marks, or errors.
 838
 839 We have now handled all the possible known mark-like tokens.
 840 If the token we have is not empty and `TK_mark` is allowed,
 841 we have an unknown mark, otherwise this must be an error.
 842
 843 ###### unknown mark
 844         /* one unknown character */
 845         close_token(state, &tk);
 846         tk.num = TK_error;
 847         return tk;
 848
 849 ## Tools For The Task
 850
 851 You may have noticed that are few gaps we left in the above -
 852 functions used without first defining them.  Doing so above would have
 853 broken the flow.
 854
 855 ### Character by character
 856
 857 As we walk through the various `code_node`s we need to process whole
 858 Unicode codepoints, and keep track of which line and column we are on.
 859 We will assume for now that any printing character uses one column,
 860 though that is not true in general.
 861
 862 As the text in a `code_node` may include an indent that identifies it as
 863 being code, we need to be careful to strip that.  The `code_node` has
 864 a flag that tells us whether or not we need to strip.
 865
 866 ###### includes
 867         #include <memory.h>
 868
 869 ###### state fields
 870         struct code_node *node;
 871         int    offset;
 872         int    line;
 873         int    col;
 874
 875 ###### internal functions
 876
 877         static void do_strip(struct token_state *state)
 878         {
 879                 if (state->node->needs_strip) {
 880                         int n = 4;
 881                         while (n && state->node->code.txt[state->offset] == ' ') {
 882                                 state->offset += 1;
 883                                 n -= 1;
 884                         }
 885                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
 886                                 state->offset += 1;
 887                                 n -= 4;
 888                         }
 889                 }
 890         }
 891
 892         static wint_t get_char(struct token_state *state)
 893         {
 894                 wchar_t next;
 895                 size_t n;
 896                 mbstate_t mbstate;
 897
 898                 if (state->node == NULL)
 899                         return WEOF;
 900                 if (state->node->code.len <= state->offset) {
 901                         do
 902                                 state->node = state->node->next;
 903                         while (state->node && state->node->code.txt == NULL);
 904                         state->offset = 0;
 905                         if (state->node == NULL)
 906                                 return WEOF;
 907                         do_strip(state);
 908                         state->line = state->node->line_no;
 909                         state->col = state->node->indent;
 910                 }
 911
 912                 ## before get_char
 913
 914                 memset(&mbstate, 0, sizeof(mbstate));
 915
 916                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 917                             state->node->code.len - state->offset,
 918                             &mbstate);
 919                 if (n == -2 || n == 0) {
 920                         /* Not enough bytes - not really possible */
 921                         next = '\n';
 922                         state->offset = state->node->code.len;
 923                 } else if (n == -1) {
 924                         /* error */
 925                         state->offset += 1;
 926                         next = 0x7f; // an illegal character
 927                 } else
 928                         state->offset += n;
 929
 930                 if (next >= ' ') {
 931                         state->col += 1;
 932                 } else if (is_newline(next)) {
 933                         state->line += 1;
 934                         state->col = state->node->indent;
 935                         do_strip(state);
 936                 } else if (next == '\t') {
 937                         state->col = indent_tab(state->col);
 938                 }
 939                 return next;
 940         }
 941
 942 We will sometimes want to "unget" the last character as it needs to be
 943 considered again as part of the next token.  So we need to store a
 944 'previous' version of all metadata.
 945
 946 ###### state fields
 947         int    prev_offset;
 948         int    prev_line;
 949         int    prev_col;
 950
 951 ###### before get_char
 952         state->prev_offset = state->offset;
 953         state->prev_line   = state->line;
 954         state->prev_col    = state->col;
 955
 956 ###### internal functions
 957
 958         static void unget_char(struct token_state *state)
 959         {
 960                 if (state->node) {
 961                         state->offset = state->prev_offset;
 962                         state->line   = state->prev_line;
 963                         state->col    = state->prev_col;
 964                 }
 965         }
 966
 967 We occasionally need a double-unget, particularly for numbers and
 968 block comments.  We don't impose this cost on all scanning, but
 969 require those code sections that need it to call `save_unget_state`
 970 before each `get_char`, and then `restore_unget_state` when a
 971 double-unget is needed.
 972
 973 ###### state fields
 974         int     prev_offset2;
 975         int     prev_line2;
 976         int     prev_col2;
 977
 978 ###### internal functions
 979         static void save_unget_state(struct token_state *state)
 980         {
 981                 state->prev_offset2 = state->prev_offset;
 982                 state->prev_line2 = state->prev_line;
 983                 state->prev_col2 = state->prev_col;
 984         }
 985
 986         static void restore_unget_state(struct token_state *state)
 987         {
 988                 state->prev_offset = state->prev_offset2;
 989                 state->prev_line = state->prev_line2;
 990                 state->prev_col = state->prev_col2;
 991         }
 992
 993 At the start of a token we don't want to be at the end of a code block
 994 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
 995 single character.  This will move into the next non-empty code block
 996 and leave the current pointer at the start of it.
 997
 998 This has to happen _after_ dealing with delayed tokens as some of them
 999 must appear in the previous node.  When we do this, we need to reset
1000 the data in the token.
1001
1002 ###### delayed tokens
1003         if (at_eon(state)) {
1004                 get_char(state);
1005                 unget_char(state);
1006                 tk.node = state->node;
1007                 if (state->node)
1008                         tk.txt.txt = state->node->code.txt + state->offset;
1009                 tk.line = state->line;
1010                 tk.col = state->col;
1011                 tk.txt.len = 0;
1012         }
1013
1014 ### Managing tokens
1015
1016 The current token is initialized to line up with the first character
1017 that we 'get' for each token.  When we have, or might have, a full
1018 token we can call `close_token` to set the `len` of the token
1019 appropriately.  This can safely be called multiple times.
1020
1021 Finally we occasionally (for single-line strings and block comments)
1022 need to reset to the beginning of the current token as we might have
1023 parsed too much already.  For that there is `reset_token`.
1024
1025 ###### one token
1026         tk.node = state->node;
1027         if (state->node)
1028                 tk.txt.txt = state->node->code.txt + state->offset;
1029         tk.line = state->line;
1030         tk.col = state->col;
1031         tk.txt.len = 0;
1032
1033 ###### internal functions
1034
1035         static void close_token(struct token_state *state,
1036                                 struct token *tk)
1037         {
1038                 tk->txt.len = (state->node->code.txt + state->offset)
1039                               - tk->txt.txt;
1040         }
1041
1042         static void reset_token(struct token_state *state, struct token *tok)
1043         {
1044                 state->prev_line = tok->line;
1045                 state->prev_col = tok->col;
1046                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1047                 unget_char(state);
1048                 tok->txt.len = 0;
1049         }
1050
1051
1052 Tokens make not cross into the next `code_node`, and some tokens can
1053 include the newline at the and of a `code_node`, we must be able to
1054 easily check if we have reached the end.  Equally we need to know if
1055 we are at the start of a node, as white space is treated a little
1056 differently there.
1057
1058 ###### internal functions
1059
1060         static int at_son(struct token_state *state)
1061         {
1062                 return state->offset == 0;
1063         }
1064
1065         static int at_eon(struct token_state *state)
1066         {
1067                 // at end-of-node ??
1068                 return state->node == NULL ||
1069                        state->offset >= state->node->code.len;
1070         }
1071
1072 ### Find a known word
1073
1074 As the known-word list is sorted we can use a simple binary search.
1075 Following the pattern established in "mdcode", we will use a `struct
1076 text` with start and length to represent the code fragment we are
1077 searching for.
1078
1079 ###### internal functions
1080         static int find_known(struct token_config *conf, struct text txt)
1081         {
1082                 int lo = 0;
1083                 int hi = conf->known_count;
1084
1085                 while (lo + 1 < hi) {
1086                         int mid = (lo + hi) / 2;
1087                         int cmp = strncmp(conf->words_marks[mid],
1088                                           txt.txt, txt.len);
1089                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1090                                 cmp = 1;
1091                         if (cmp <= 0)
1092                                 lo = mid;
1093                         else
1094                                 hi = mid;
1095                 }
1096                 if (strncmp(conf->words_marks[lo],
1097                            txt.txt, txt.len) == 0
1098                     && conf->words_marks[lo][txt.len] == 0)
1099                         return lo;
1100                 else
1101                         return -1;
1102         }
1103
1104 ### Bringing it all together
1105
1106 Now we have all the bits there is just one section missing:  combining
1107 all the token parsing code into one block.
1108
1109 The handling of delayed tokens (Newlines, INs, OUTs) must come
1110 first before we try getting another character.
1111
1112 Then we parse all the test, making sure that we check for known marks
1113 before strings and comments, but unknown marks after strings and comments.
1114
1115 This block of code will either return a token, or will choose to
1116 ignore one, in which case it will `continue` around to the top of the
1117 loop.
1118
1119 ###### one token
1120         ## delayed tokens
1121
1122         ch = get_char(state);
1123
1124         ## white space
1125         ## parse number
1126         ## parse word
1127         ## parse mark
1128
1129 ### Start and stop
1130
1131 As well as getting tokens, we need to be able to create the
1132 `token_state` to start with, and discard it later.
1133
1134 ###### includes
1135         #include <malloc.h>
1136
1137 ###### main functions
1138         struct token_state *token_open(struct code_node *code, struct
1139                                        token_config *conf)
1140         {
1141                 struct token_state *state = malloc(sizeof(*state));
1142                 memset(state, 0, sizeof(*state));
1143                 state->node = code;
1144                 state->line = code->line_no;
1145                 state->col = code->indent;
1146                 state->conf = conf;
1147                 do_strip(state);
1148                 return state;
1149         }
1150         void token_close(struct token_state *state)
1151         {
1152                 free(state);
1153         }
1154
1155 ###### exported functions
1156         struct token_state *token_open(struct code_node *code, struct
1157                                        token_config *conf);
1158         void token_close(struct token_state *state);
1159
1160 ### Trace tokens
1161
1162 Getting tokens is the main thing but it is also useful to be able to
1163 print out token information, particularly for tracing and testing.
1164
1165 Known tokens are printed verbatim.  Other tokens are printed as
1166 `type(content)` where content is truncated to a given number of characters.
1167
1168 The function for printing a truncated string (`text_dump`) is also exported
1169 so that it can be used to tracing processed strings too.
1170
1171 ###### includes
1172         #include <stdio.h>
1173
1174 ###### exported functions
1175         void token_trace(FILE *f, struct token tok, int max);
1176         void text_dump(FILE *f, struct text t, int max);
1177
1178 ###### main functions
1179
1180         void text_dump(FILE *f, struct text txt, int max)
1181         {
1182                 int i;
1183                 if (txt.len > max)
1184                         max -= 2;
1185                 else
1186                         max = txt.len;
1187                 for (i = 0; i < max; i++) {
1188                         char c = txt.txt[i];
1189                         if (c < ' ' || c > '~')
1190                                 fprintf(f, "\\x%02x", c & 0xff);
1191                         else if (c == '\\')
1192                                 fprintf(f, "\\\\");
1193                         else
1194                                 fprintf(f, "%c", c);
1195                 }
1196                 if (i < txt.len)
1197                         fprintf(f, "..");
1198         }
1199
1200         void token_trace(FILE *f, struct token tok, int max)
1201         {
1202                 static char *types[] = {
1203                         [TK_ident] = "ident",
1204                         [TK_mark] = "mark",
1205                         [TK_number] = "number",
1206                         [TK_string] = "string",
1207                         [TK_multi_string] = "mstring",
1208                         [TK_line_comment] = "lcomment",
1209                         [TK_block_comment] = "bcomment",
1210                         [TK_in] = "in",
1211                         [TK_out] = "out",
1212                         [TK_newline] = "newline",
1213                         [TK_eof] = "eof",
1214                         [TK_error] = "ERROR",
1215                         };
1216
1217                 switch (tok.num) {
1218                 default: /* known word or mark */
1219                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1220                         break;
1221                 case TK_in:
1222                 case TK_out:
1223                 case TK_newline:
1224                 case TK_eof:
1225                         /* No token text included */
1226                         fprintf(f, "%s()", types[tok.num]);
1227                         break;
1228                 case TK_ident:
1229                 case TK_mark:
1230                 case TK_number:
1231                 case TK_string:
1232                 case TK_multi_string:
1233                 case TK_line_comment:
1234                 case TK_block_comment:
1235                 case TK_error:
1236                         fprintf(f, "%s(", types[tok.num]);
1237                         text_dump(f, tok.txt, max);
1238                         fprintf(f, ")");
1239                         break;
1240                 }
1241         }
1242
1243 ### And there we have it
1244
1245 We now have all the library functions defined for reading and printing
1246 tokens.  Now we just need C files to store them, and a mk file to make them.
1247
1248 ###### File: scanner.h
1249         ## public types
1250         ## exported functions
1251
1252 ###### File: libscanner.c
1253         ## includes
1254         #include "scanner.h"
1255         ## private types
1256         ## internal functions
1257         ## main functions
1258
1259 ###### File: scanner.mk
1260
1261         CFLAGS += -Wall -g
1262         all ::
1263         scanner.mk scanner.h libscanner.c : scanner.mdc
1264                 ./md2c scanner.mdc
1265         all :: libscanner.o
1266         libscanner.o : libscanner.c
1267                 $(CC) $(CFLAGS) -c libscanner.c
1268
1269 ## Processing numbers
1270
1271 Converting a `TK_number` token to a numerical value is a slightly
1272 higher level task than lexical analysis, and slightly lower than
1273 grammar parsing, so put it here - as an index if you like.
1274
1275 Importantly it will be used by the same testing rig that is used for
1276 testing the token scanner.
1277
1278 The numeric value that we will convert all numbers into is the `mpq_t`
1279 from the GNU high precision number library "libgmp".
1280
1281 ###### number includes
1282         #include <gmp.h>
1283         #include "mdcode.h"
1284
1285 Firstly we need to be able to parse a string of digits in a given base
1286 and possibly with a decimal marker.  We store this in an `mpz_t`
1287 integer and report the number of digits after the decimal mark.
1288
1289 On error we return zero and ensure that the 'mpz_t' has been freed, or
1290 had never been initialised.
1291
1292 ###### number functions
1293
1294         static int parse_digits(mpz_t num, struct text tok, int base,
1295                                 int *placesp)
1296         {
1297                 /* Accept digits up to 'base', ignore '_' and
1298                  * ' ' if they appear between two legal digits,
1299                  * and if `placesp` is not NULL, allow a single
1300                  * '.' or ',' and report the number of digits
1301                  * beyond there.
1302                  * Return number of characters processed (p),
1303                  * or 0 if something illegal was found.
1304                  */
1305                 int p;
1306                 int decimal = -1; // digits after marker
1307                 enum {Digit, Space, Other} prev = Other;
1308                 int digits = 0;
1309
1310                 for (p = 0; p < tok.len; p++) {
1311                         int dig;
1312                         char c = tok.txt[p];
1313
1314                         if (c == '_' || c == ' ') {
1315                                 if (prev != Digit)
1316                                         goto bad;
1317                                 prev = Space;
1318                                 continue;
1319                         }
1320                         if (c == '.' || c == ',') {
1321                                 if (prev != Digit)
1322                                         goto bad;
1323                                 if (!placesp || decimal >= 0)
1324                                         return p-1;
1325                                 decimal = 0;
1326                                 prev = Other;
1327                                 continue;
1328                         }
1329                         if (isdigit(c))
1330                                 dig = c - '0';
1331                         else if (isupper(c))
1332                                 dig = 10 + c - 'A';
1333                         else if (islower(c))
1334                                 dig = 10 + c - 'a';
1335                         else
1336                                 dig = base;
1337                         if (dig >= base) {
1338                                 if (prev == Space)
1339                                         p--;
1340                                 break;
1341                         }
1342                         prev = Digit;
1343                         if (digits)
1344                                 mpz_mul_ui(num, num, base);
1345                         else
1346                                 mpz_init(num);
1347                         digits += 1;
1348                         mpz_add_ui(num, num, dig);
1349                         if (decimal >= 0)
1350                                 decimal++;
1351                 }
1352                 if (digits == 0)
1353                         return 0;
1354                 if (placesp) {
1355                         if (decimal >= 0)
1356                                 *placesp = decimal;
1357                         else
1358                                 *placesp = 0;
1359                 }
1360                 return p;
1361         bad:
1362                 if (digits)
1363                         mpz_clear(num);
1364                 return 0;
1365         }
1366
1367 ###### number includes
1368         #include <ctype.h>
1369
1370 To parse a full number we need to consider the optional base, the
1371 mantissa, and the optional exponent.  We will treat these one at a
1372 time.
1373
1374 The base is indicated by a letter after a leading zero, which must be
1375 followed by a base letter or a period.  The base also determines the
1376 character which will mark an exponent.
1377
1378 ###### number vars
1379         int base = 10;
1380         char expc = 'e';
1381
1382 ###### parse base
1383
1384         if (tok.txt[0] == '0' && tok.len > 1) {
1385                 int skip = 0;
1386                 switch(tok.txt[1]) {
1387                 case 'x':
1388                 case 'X':
1389                         base = 16;
1390                         skip = 2;
1391                         expc = 'p';
1392                         break;
1393                 case 'o':
1394                 case 'O':
1395                         base = 8;
1396                         skip = 2;
1397                         expc = 'p';
1398                         break;
1399                 case 'b':
1400                 case 'B':
1401                         base = 2;
1402                         skip = 2;
1403                         expc = 'p';
1404                         break;
1405                 case '0':
1406                 case '1':
1407                 case '2':
1408                 case '3':
1409                 case '4':
1410                 case '5':
1411                 case '6':
1412                 case '7':
1413                 case '8':
1414                 case '9':
1415                 case '_':
1416                 case ' ':
1417                         // another digit is not permitted
1418                         // after a zero.
1419                         return 0;
1420                 default:
1421                         // must be decimal marker or trailing
1422                         // letter, which are OK;
1423                         break;
1424                 }
1425                 tok.txt += skip;
1426                 tok.len -= skip;
1427         }
1428
1429 After the base is the mantissa, which may contain a decimal mark, so
1430 we need to record the number of places.  We won't impose the number of
1431 places until we have the exponent as well.
1432
1433 ###### number vars
1434         int places =0;
1435         mpz_t mant;
1436         int d;
1437
1438 ###### parse mantissa
1439
1440         d = parse_digits(mant, tok, base, &places);
1441         if (d == 0)
1442                 return 0;
1443         tok.txt += d;
1444         tok.len -= d;
1445         mpq_init(num);
1446         mpq_set_z(num, mant);
1447         mpz_clear(mant);
1448
1449 After the mantissa number may come an exponent which may be positive
1450 or negative.  We assume at this point that we have seen the exponent
1451 character `expc`.
1452
1453 ###### number vars
1454         long lexp = 0;
1455         mpz_t exp;
1456         int esign = 1;
1457
1458 ###### parse exponent
1459         if (tok.len > 1) {
1460                 if (tok.txt[0] == '+') {
1461                         tok.txt++;
1462                         tok.len--;
1463                 } else if (tok.txt[0] == '-') {
1464                         esign = -1;
1465                         tok.txt++;
1466                         tok.len--;
1467                 }
1468         }
1469         d = parse_digits(exp, tok, 10, NULL);
1470         if (d == 0) {
1471                 mpq_clear(num);
1472                 return 0;
1473         }
1474         if (!mpz_fits_slong_p(exp)) {
1475                 mpq_clear(num);
1476                 mpz_clear(exp);
1477                 return 0;
1478         }
1479         lexp = mpz_get_si(exp) * esign;
1480         mpz_clear(exp);
1481         tok.txt += d;
1482         tok.len -= d;
1483
1484
1485 Now that we have the mantissa and the exponent we can multiply them
1486 together, also allowing for the number of digits after the decimal
1487 mark.
1488
1489 For base 10, we simply subtract the decimal places from the exponent.
1490 For the other bases, as the exponent is alway based on 2, even for
1491 octal and hex, we need a bit more detail.
1492 We then recover the sign from the exponent, as division is quite
1493 different from multiplication.
1494
1495 ###### calc exponent
1496         switch (base) {
1497         case 10:
1498         case 2:
1499                 lexp -= places;
1500                 break;
1501         case 16:
1502                 lexp -= 4*places;
1503                 break;
1504         case 8:
1505                 lexp -= 3*places;
1506                 break;
1507         }
1508         if (lexp < 0) {
1509                 lexp = -lexp;
1510                 esign = -1;
1511         } else
1512                 esign = 1;
1513
1514 Imposing the exponent on the number is also very different for base 10
1515 than for the others.  For the binary shift `gmp` provides a simple
1516 function.  For base 10 we use something like Russian Peasant
1517 Multiplication.
1518
1519 ###### calc exponent
1520         if (expc == 'e') {
1521                 mpq_t tens;
1522                 mpq_init(tens);
1523                 mpq_set_ui(tens, 10, 1);
1524                 while (1) {
1525                         if (lexp & 1) {
1526                                 if (esign > 0)
1527                                         mpq_mul(num, num, tens);
1528                                 else
1529                                         mpq_div(num, num, tens);
1530                         }
1531                         lexp >>= 1;
1532                         if (lexp == 0)
1533                                 break;
1534                         mpq_mul(tens, tens, tens);
1535                 }
1536                 mpq_clear(tens);
1537         } else {
1538                 if (esign > 0)
1539                         mpq_mul_2exp(num, num, lexp);
1540                 else
1541                         mpq_div_2exp(num, num, lexp);
1542         }
1543
1544 Now we are ready to parse a number: the base, mantissa, and exponent.
1545 If all goes well we check for the possible trailing letters and
1546 return.  Return value is 1 for success and 0 for failure.
1547
1548
1549 ###### number functions
1550         int number_parse(mpq_t num, char tail[3], struct text tok)
1551         {
1552                 ## number vars
1553                 int i;
1554
1555                 ## parse base
1556                 ## parse mantissa
1557                 if (tok.len > 1 && (tok.txt[0] == expc ||
1558                                     tok.txt[0] == toupper(expc))) {
1559                         tok.txt++;
1560                         tok.len--;
1561                         ## parse exponent
1562                 }
1563                 ## calc exponent
1564
1565                 for (i = 0; i < 2; i++) {
1566                         if (tok.len <= i)
1567                                 break;
1568                         if (!isalpha(tok.txt[i]))
1569                                 goto err;
1570                         tail[i] = tok.txt[i];
1571                 }
1572                 tail[i] = 0;
1573                 if (i == tok.len)
1574                         return 1;
1575         err:
1576                 mpq_clear(num);
1577                 return 0;
1578         }
1579
1580 Number parsing goes in `libnumber.c`
1581
1582 ###### File: libnumber.c
1583
1584         #include <unistd.h>
1585         #include <stdlib.h>
1586
1587         ## number includes
1588         ## number functions
1589
1590 ###### File: number.h
1591         int number_parse(mpq_t num, char tail[3], struct text tok);
1592
1593 ###### File: scanner.mk
1594         all :: libnumber.o
1595         libnumber.o : libnumber.c
1596                 $(CC) $(CFLAGS) -c libnumber.c
1597
1598 ## Processing strings
1599
1600 Both `TK_string` and `TK_multi_string` require post-processing which
1601 can be one of two types: literal or with escapes processed.
1602 Even literal processing is non-trivial as the file may contain indents
1603 which need to be stripped.
1604
1605 Errors can only occur when processing escapes.  Any unrecognised
1606 character following the escape character will cause an error.
1607
1608 Processing escapes and striping indents can only make the string
1609 shorter, not longer, so we allocate a buffer which is the same size as
1610 the string and process into that.
1611
1612 To request escape processing, we pass the character we want to use for
1613 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1614
1615 ###### string main
1616         int string_parse(struct token *tok, char escape,
1617                          struct text *str, char tail[3])
1618         {
1619                 ## string vars
1620                 struct text t = tok->txt;
1621
1622                 str->txt = NULL;
1623                 ## strip tail
1624                 if (tok->num == TK_string) {
1625                         ## strip single
1626                 } else {
1627                         ## strip multi
1628                 }
1629                 str->txt = malloc(t.len);
1630                 str->len = 0;
1631
1632                 ## process string
1633                 return 1;
1634         err:
1635                 free(str->txt);
1636                 str->txt = NULL;
1637                 return 0;
1638         }
1639
1640 ### strip tail
1641
1642 The tail of the string can be 0, 1, or 2 letters
1643
1644         i = t.len;
1645         if (i >= 0 && isalpha(t.txt[i-1]))
1646                 i -= 1;
1647         if (i >= 0 && isalpha(t.txt[i-1]))
1648                 i -= 1;
1649         strncpy(tail, t.txt+i, t.len-i);
1650         tail[t.len-i] = 0;
1651         t.len = i;
1652
1653 ###### string vars
1654         int i;
1655
1656 ### strip single
1657
1658 Stripping the quote of a single-line string is trivial.
1659 The only part that is at all interesting is that quote character must
1660 be remembered.
1661
1662         quote = t.txt[0];
1663         if (t.txt[t.len-1] != quote)
1664                 goto err;
1665         t.txt += 1;
1666         t.len -= 2;
1667
1668 ###### string vars
1669         char quote;
1670
1671 ### strip multi
1672
1673 For a multi-line string we have a little more work to do.  We need to
1674 remove 3 quotes, not 1, and need to count the indent of the close
1675 quote as it will need to be stripped from all lines.
1676
1677         quote = t.txt[0];
1678         if (t.len < 7 ||
1679             t.txt[1] != quote || t.txt[2] != quote ||
1680             !is_newline(t.txt[3]))
1681                 goto err;
1682         t.txt += 4;
1683         t.len -= 4;
1684         i = t.len;
1685         if (i <= 0 || t.txt[i-1] != quote)
1686                 goto err;
1687         i -= 1;
1688         if (i <= 0 || t.txt[i-1] != quote)
1689                 goto err;
1690         i -= 1;
1691         if (i <= 0 || t.txt[i-1] != quote)
1692                 goto err;
1693         i -= 1;
1694         t.len = i;
1695         while (i > 0 && !is_newline(t.txt[i-1]))
1696                 i--;
1697         indent = 0;
1698         while (i < t.len) {
1699                 if (t.txt[i] == ' ')
1700                         indent += 1;
1701                 if (t.txt[i] == '\t')
1702                         indent = indent_tab(indent);
1703                 i++;
1704         }
1705
1706 ###### string vars
1707         int indent = 0;
1708
1709 ### process string
1710
1711 Now we just take one byte at a time. trans-ASCII unicode won't look
1712 like anything we are interested in so it will just be copied byte by
1713 byte.
1714
1715         cp = str->txt;
1716         at_sol = 1;
1717         for (i = 0; i < t.len; i++) {
1718                 char c;
1719                 if (at_sol) {
1720                         at_sol = 0;
1721                         ## strip indent
1722                         if (i >= t.len)
1723                                 break;
1724                 }
1725                 c = t.txt[i];
1726                 if (c != escape) {
1727                         *cp = c;
1728                         cp += 1;
1729                         if (is_newline(c))
1730                                 at_sol = 1;
1731                 } else if (i+1 >= t.len) {
1732                         // escape and end of string
1733                         goto err;
1734                 } else {
1735                         i += 1;
1736                         c = t.txt[i];
1737                         ## parse escape
1738                 }
1739         }
1740         str->len = cp - str->txt;
1741
1742 ###### string vars
1743         char *cp;
1744         int at_sol;
1745
1746 ### strip indent
1747
1748 Every time we find a start of line, we strip spaces and tabs until the
1749 required indent is found.
1750
1751         int skipped = 0;
1752         while (i < t.len && skipped < indent) {
1753                 c = t.txt[i];
1754                 if (c == ' ')
1755                         skipped += 1;
1756                 else if (c == '\t')
1757                         skipped = indent_tab(c);
1758                 else
1759                         break;
1760                 i+= 1;
1761         }
1762
1763 ### parse escape
1764         switch (c) {
1765         case 'n':
1766                 *cp++ = '\n'; break;
1767         case 'r':
1768                 *cp++ = '\r'; break;
1769         case 't':
1770                 *cp++ = '\t'; break;
1771         case 'b':
1772                 *cp++ = '\b'; break;
1773         case 'q':
1774                 *cp++ = quote; break;
1775         case 'f':
1776                 *cp++ = '\f'; break;
1777         case 'v':
1778                 *cp++ = '\v'; break;
1779         case 'a':
1780                 *cp++ = '\a'; break;
1781         case '0':
1782         case '1':
1783         case '2':
1784         case '3':
1785                 // 3 digit octal number
1786                 if (i+2 >= t.len)
1787                         goto err;
1788                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1789                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1790                         goto err;
1791                 n = (t.txt[i  ]-'0') * 64 +
1792                     (t.txt[i+1]-'0') *  8 +
1793                     (t.txt[i+2]-'0') *  1;
1794                 *cp++ = n;
1795                 i += 2;
1796                 break;
1797         case 'x':
1798                 // 2 hex digits
1799                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1800                 if (n < 0)
1801                         goto err;
1802                 *cp++ = n;
1803                 i += 2;
1804                 break;
1805         case 'u':
1806         case 'U':
1807                 // 4 or 8 hex digits for unicode
1808                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1809                 if (n < 0)
1810                         goto err;
1811                 memset(&pstate, 0, sizeof(pstate));
1812                 n = wcrtomb(cp, n, &pstate);
1813                 if (n <= 0)
1814                         goto err;
1815                 cp += n;
1816                 i += c == 'u' ? 4 : 8;
1817                 break;
1818         default:
1819                 if (c == escape)
1820                         *cp++ = c;
1821                 else if (is_newline(c))
1822                         at_sol = 1;
1823                 else
1824                         goto err;
1825         }
1826
1827 ###### string vars
1828         long n;
1829         mbstate_t pstate;
1830
1831 For `\x` `\u` and `\U` we need to collect a specific number of
1832 hexadecimal digits
1833
1834 ###### string functions
1835
1836         static long take_hex(int digits, char *cp, int l)
1837         {
1838                 long n = 0;
1839                 if (l < digits)
1840                         return -1;
1841                 while (digits) {
1842                         char  c = *cp;
1843                         int d;
1844                         if (!isxdigit(c))
1845                                 return -1;
1846                         if (isdigit(c))
1847                                 d = c - '0';
1848                         else if (isupper(c))
1849                                 d = 10 + c - 'A';
1850                         else
1851                                 d = 10 + c - 'a';
1852                         n = n * 16 + d;
1853                         digits--;
1854                         cp++;
1855                 }
1856                 return n;
1857         }
1858
1859 #### File: libstring.c
1860
1861 String parsing goes in `libstring.c`
1862
1863         #include <unistd.h>
1864         #include <stdlib.h>
1865         #include <stdio.h>
1866         #include <string.h>
1867         #include <ctype.h>
1868         #include <wchar.h>
1869         #include "mdcode.h"
1870         #include "scanner.h"
1871         ## string functions
1872         ## string main
1873
1874 ###### File: string.h
1875         int string_parse(struct token *tok, char escape,
1876                          struct text *str, char tail[3]);
1877
1878 ###### File: scanner.mk
1879         all :: libstring.o
1880         libstring.o : libstring.c
1881                 $(CC) $(CFLAGS) -c libstring.c
1882
1883
1884 ## Testing
1885
1886 As "untested code is buggy code" we need a program to easily test
1887 the scanner library.  This will simply parse a given file and report
1888 the tokens one per line.
1889
1890 ###### File: scanner.c
1891
1892         #include <unistd.h>
1893         #include <stdlib.h>
1894         #include <fcntl.h>
1895         #include <errno.h>
1896         #include <sys/mman.h>
1897         #include <string.h>
1898         #include <stdio.h>
1899         #include <gmp.h>
1900         #include <locale.h>
1901         #include "mdcode.h"
1902         #include "scanner.h"
1903         #include "number.h"
1904         #include "string.h"
1905
1906         static int errs;
1907         static void pr_err(char *msg)
1908         {
1909                 errs++;
1910                 fprintf(stderr, "%s\n", msg);
1911         }
1912
1913         int main(int argc, char *argv[])
1914         {
1915                 int fd;
1916                 int len;
1917                 char *file;
1918                 struct token_state *state;
1919                 const char *known[] = {
1920                         "==",
1921                         "else",
1922                         "if",
1923                         "then",
1924                         "while",
1925                         "{",
1926                         "}",
1927                 };
1928                 struct token_config conf = {
1929                         .word_start = "_$",
1930                         .word_cont = "",
1931                         .words_marks = known,
1932                         .number_chars = "., _+-",
1933                         .known_count = sizeof(known)/sizeof(known[0]),
1934                         .ignored = (0 << TK_line_comment)
1935                                   |(0 << TK_block_comment),
1936                 };
1937                 struct section *table, *s, *prev;
1938                 setlocale(LC_ALL,"");
1939                 if (argc != 2) {
1940                         fprintf(stderr, "Usage: scanner file\n");
1941                         exit(2);
1942                 }
1943                 fd = open(argv[1], O_RDONLY);
1944                 if (fd < 0) {
1945                         fprintf(stderr, "scanner: cannot open %s: %s\n",
1946                                 argv[1], strerror(errno));
1947                         exit(1);
1948                 }
1949                 len = lseek(fd, 0, 2);
1950                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1951                 table = code_extract(file, file+len, pr_err);
1952
1953                 for (s = table; s;
1954                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
1955                         printf("Tokenizing: %.*s\n", s->section.len,
1956                                 s->section.txt);
1957                         state = token_open(s->code, &conf);
1958                         while(1) {
1959                                 struct token tk = token_next(state);
1960                                 printf("%d:%d ", tk.line, tk.col);
1961                                 token_trace(stdout, tk, 20);
1962                                 if (tk.num == TK_number) {
1963                                         mpq_t num;
1964                                         char tail[3];
1965                                         if (number_parse(num, tail,tk.txt)) {
1966                                                 printf(" %s ", tail);
1967                                                 mpq_out_str(stdout, 10, num);
1968                                                 mpq_clear(num);
1969                                         } else
1970                                                 printf(" BAD NUMBER");
1971                                 }
1972                                 if (tk.num == TK_string ||
1973                                     tk.num == TK_multi_string) {
1974                                         char esc = '\\';
1975                                         struct text str;
1976                                         char tail[3];
1977                                         if (tk.txt.txt[0] == '`')
1978                                                 esc = 0;
1979                                         if (string_parse(&tk, esc,
1980                                                          &str, tail)) {
1981                                                 printf(" %s ", tail);
1982                                                 text_dump(stdout, str, 20);
1983                                                 free(str.txt);
1984                                         } else
1985                                                 printf(" BAD STRING");
1986                                 }
1987                                 printf("\n");
1988                                 if (tk.num == TK_error)
1989                                         errs = 1;
1990                                 if (tk.num == TK_eof)
1991                                         break;
1992                         }
1993                 }
1994                 exit(!!errs);
1995         }
1996 ###### File: scanner.mk
1997         scanner.c : scanner.mdc
1998                 ./md2c scanner.mdc
1999         all :: scanner
2000         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2001                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2002                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2003         scanner.o : scanner.c
2004                 $(CC) $(CFLAGS) -c scanner.c
2005