ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters are assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### includes
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;
  37                 struct code_node *node;
  38                 struct text       txt;
  39                 int               line, col;
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguish via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88
  89 The different tokens are numbers, words, marks, strings, comments,
  90 newlines, EOF, and indents, each of which is examined in detail below.
  91
  92 There are various cases where no token can be found in part of the
  93 input.  All of these will be reported as an `TK_error` token.
  94
  95 It is possible to declare a number of strings which form distinct
  96 tokens (rather than being grouped as e.g. 'word').  These are given
  97 token numbers from `TK_reserved` upwards.
  98
  99 ###### public types
 100         enum token_num {
 101                 TK_error,
 102                 ## token types
 103                 TK_reserved
 104         };
 105
 106 ### Numbers
 107
 108 Numbers are the messiest tokens to parse, primarily because they can
 109 contain characters that also have meaning outside of number and,
 110 particularly, immediately after numbers.
 111
 112 The obvious example is the '`-`' sign.  It can come inside a number for
 113 a negative exponent, or after a number as a subtraction operator.  To
 114 be sure we have parsed as best as possible we need to only allow the
 115 '`-`' inside a number if it is after an exponent character.  This can be
 116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 117 digit, so we don't allow '`-`' after just any `e`.
 118
 119 To make matters worse, our language designer has decided to experiment
 120 with allowing commas to be used as the decimal indicator, and spaces
 121 to be used to separate groups of digits in large numbers.  Both of
 122 these can reasonably be restricted to appear between two digits, so we
 123 have to add that condition to our tests.
 124
 125 So we cannot just treat numbers as starting with a digit and being
 126 followed by some set of characters.  We need more structure than that.
 127
 128 So:
 129
 130 - Numbers must start with a digit.
 131 - If the first digit is zero, the next character must be a base
 132   signifier (one of `xob`) or a decimal marker (`.` or `,`).
 133   In the first case the first `p` or `P` may be followed by a sign.
 134 - If the number doesn't start with `0` followed by one of `xob`, the
 135   first `e` may be followed by a sign.
 136 - Any digit or hex digit may be followed by a space or underscore
 137   providing that the subsequence character is also a (hex) digit.
 138   This rule will require an extra level of 'unget' to be
 139   supported when handling characters.
 140 - Otherwise any digits or ASCII letters are allowed.  We do not at
 141   this point check that the digits given are permitted by the base.
 142   That will happen when the token is converted to a number.
 143
 144 To allow easy configuration, the various non alphanumeric characters
 145 are only permitted if they are listed in a configuration parameter.
 146
 147 ###### token config parameters
 148         char *number_chars;
 149
 150 Note that numbers may not start with a period, so `.75` is not a
 151 number.  This is not the norm, but is not unheard of.  Excluding these
 152 numbers simplifies the rule at very little cost.
 153
 154 ###### token types
 155         TK_number,
 156
 157 If TK_number is ignored, digits will result in an error unless they
 158 are declared to be a start character for words.
 159
 160 ###### includes
 161
 162         #include <string.h>
 163
 164 ###### parse number
 165
 166         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 167                 int prev_special = 0;
 168                 int expect_p = 0;
 169                 int decimal_mark = 0;
 170                 if (ch == '0') {
 171                         wchar_t ch2 = get_char(state);
 172                         if (strchr("xobXOB", ch2) != NULL)
 173                                 expect_p = 1;
 174                         unget_char(state);
 175                 }
 176                 while (1) {
 177                         int sign_ok = 0;
 178                         switch(expect_p) {
 179                         case 0:
 180                                 if (ch == 'e')
 181                                         sign_ok = 1;
 182                                 break;
 183                         case 1:
 184                                 if (ch == 'p')
 185                                         sign_ok = 1;
 186                                 break;
 187                         }
 188                         save_unget_state(state);
 189                         ch = get_char(state);
 190                         if (iswalnum(ch)) {
 191                                 prev_special = 0;
 192                                 continue;
 193                         }
 194                         if (ch == '+' || ch == '-') {
 195                                 if (!sign_ok)
 196                                         break;
 197                                 expect_p = -1;
 198                         }
 199                         if (ch == '.' || ch == ',') {
 200                                 if (decimal_mark)
 201                                         break;
 202                                 decimal_mark = 1;
 203                         }
 204                         if (prev_special) {
 205                                 /* Don't allow that special char,
 206                                  * need two 'ungets'
 207                                  */
 208                                 restore_unget_state(state);
 209                                 break;
 210                         }
 211                         if (strchr(state->conf->number_chars, ch)) {
 212                                 prev_special = 1;
 213                                 continue;
 214                         }
 215                         /* non-number char */
 216                         break;
 217                 }
 218                 /* We seem to have a "number" token */
 219                 unget_char(state);
 220                 close_token(state, &tk);
 221                 tk.num = TK_number;
 222                 return tk;
 223         }
 224
 225 ### Words
 226 Words start with a "start" character followed by the longest
 227 sequence of "continue" characters.  The Unicode ID_START and
 228 ID_CONTINUE sets are always permitted, but other ASCII characters
 229 can be added to these sets.
 230
 231 ###### token config parameters
 232         char *word_start;
 233         char *word_cont;
 234
 235 ###### internal functions
 236         static int is_word_start(wchar_t ch, struct token_config *conf)
 237         {
 238                 return iswalpha(ch) ||
 239                        strchr(conf->word_start, ch) != NULL ||
 240                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 241         }
 242
 243         static int is_word_continue(wchar_t ch, struct token_config *conf)
 244         {
 245                 return iswalnum(ch) ||
 246                        strchr(conf->word_cont, ch) != NULL ||
 247                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 248         }
 249
 250 Words can be either known or unknown.  Known words are referred to as
 251 "reserved words" and get a unique token number.  Unknown words are
 252 "identifiers" and are syntactically a single token.
 253
 254 ###### token types
 255         TK_ident,
 256
 257 A list of known words must be provided.  This list is shared with the
 258 "marks" which are described next.  The list must be lexically sorted
 259 and the length of the list must be given (`known_count`).
 260 Tokens matching these known words are reported as the index of the
 261 list added to `TK_reserved`.
 262
 263 ###### token config parameters
 264         const char **words_marks;
 265         int known_count;
 266
 267 ###### parse word
 268
 269         if (is_word_start(ch, state->conf)) {
 270                 int n;
 271                 /* A word: identifier or reserved */
 272                 do
 273                         ch = get_char(state);
 274                 while (is_word_continue(ch, state->conf));
 275                 unget_char(state);
 276                 close_token(state, &tk);
 277                 tk.num = TK_ident;
 278                 if (ignored & (1<<TK_ident))
 279                         tk.num = TK_error;
 280                 n = find_known(state->conf, tk.txt);
 281                 if (n >= 0)
 282                         tk.num = TK_reserved + n;
 283                 return tk;
 284         }
 285
 286 ### Marks
 287
 288 Marks are generally one or more punctuation marks joined together.  It
 289 would be nice to use the term "symbol" for these, but that causes
 290 confusion in a subsequent discussion of the grammar, which has terminal
 291 symbols and non-terminal symbols which are conceptually quite
 292 different.  So strings of punctuation characters will be marks.
 293
 294 A "mark" consists of ASCII characters that are not white space, are not
 295 "start" characters for words, and are not digits.
 296 These will collectively be called mark characters.
 297
 298 ###### internal functions
 299         static int is_mark(wchar_t ch, struct token_config *conf)
 300         {
 301                 return ch > ' ' &&
 302                        ch < 0x7f &&
 303                        !iswalnum(ch) &&
 304                        strchr(conf->word_start, ch) == NULL;
 305         }
 306
 307 As with words, there can be known and unknown marks, though the rules
 308 are slightly different.
 309
 310 Two marks do not need to be separated by a non-mark characters.  This
 311 is different from words which do need to be separated by at least one
 312 non-continue character.
 313
 314 The scanner will normally prefer longer sequences of mark characters,
 315 but will more strongly prefer known marks over unknown marks.  So if
 316 it finds a known mark where adding one more character does not result
 317 in a known mark, it will return that first known mark.
 318
 319 If no known mark is found we will test against strings and comments
 320 below before giving up and assuming an unknown mark.
 321
 322 If an unknown mark contains a quote character or a comment marker, and
 323 that token is not being ignored, then we terminate the unknown mark
 324 before that quote or comment.  This ensure that an unknown mark
 325 immediately before a string is handled correctly.
 326
 327 If `TK_mark` is ignored, then unknown marks as returned as an error.
 328
 329 ###### token types
 330         TK_mark,
 331
 332 Known marks are included in the same list as the list of known words.
 333
 334 ###### parse mark
 335         tk.num = TK_error;
 336         while (is_mark(ch, state->conf)) {
 337                 int n;
 338                 wchar_t prev;
 339                 close_token(state, &tk);
 340                 n = find_known(state->conf, tk.txt);
 341                 if (n >= 0)
 342                         tk.num = TK_reserved + n;
 343                 else if (tk.num != TK_error) {
 344                         /* found a longest-known-mark */
 345                         unget_char(state);
 346                         close_token(state, &tk);
 347                         return tk;
 348                 }
 349                 prev = ch;
 350                 if (prev == '/')
 351                         save_unget_state(state);
 352                 ch = get_char(state);
 353                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
 354                         break;
 355                 if (!(ignored && (1<<TK_line_comment)) &&
 356                     prev == '/' && ch == '/') {
 357                         restore_unget_state(state);
 358                         break;
 359                 }
 360                 if (!(ignored && (1<<TK_block_comment)) &&
 361                     prev == '/' && ch == '*') {
 362                         restore_unget_state(state);
 363                         break;
 364                 }
 365         }
 366         unget_char(state);
 367         if (tk.num != TK_error)
 368                 return tk;
 369
 370 ###### unknown mark
 371         if (tk.txt.len) {
 372                 if (ignored & (1<<TK_mark))
 373                         tk.num = TK_error;
 374                 else
 375                         tk.num = TK_mark;
 376                 return tk;
 377         }
 378
 379 ### Strings
 380
 381 Strings start with one of single quote, double quote, or back quote
 382 and continue until a matching character on the same line.  Any of
 383 these characters can be included in the list of known marks and then
 384 they will not be used for identifying strings.
 385
 386 Immediately following the close quote one or two ASCII letters may
 387 appear.  These are somewhat like the arbitrary letters allowed in
 388 "Numbers" above.  They can be used by the language in various ways.
 389
 390 If 3 identical quote characters appear in a row and are
 391 followed by a newline, then this forms a multi-line string which
 392 continues until an identical triple quote appears on a line preceded
 393 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 394
 395 Multi-line strings may not extend beyond the end of the `code_node` in
 396 which they start.
 397
 398 Normal strings and multi-line strings are encoded as two different
 399 token types.
 400
 401 ###### token types
 402         TK_string,
 403         TK_multi_string,
 404
 405 ###### internal functions
 406         static int is_quote(wchar_t ch)
 407         {
 408                 return ch == '\'' || ch == '"' || ch == '`';
 409         }
 410
 411 #### Multi-line strings
 412
 413 The multi-line string is checked for first.  If they are being
 414 ignored, we fall through and treat a triple quote as an empty string
 415 followed by the start of a new string.
 416
 417 ###### parse string
 418         if (tk.txt.len == 3 &&
 419             !(ignored & (1 << TK_multi_string)) &&
 420             is_quote(tk.txt.txt[0]) &&
 421             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 422             is_newline(tk.txt.txt[3])) {
 423                 // triple quote
 424                 wchar_t first = tk.txt.txt[0];
 425                 int qseen = 0;
 426                 int at_sol = 1;
 427                 while (!at_eon(state) && qseen < 3) {
 428                         ch = get_char(state);
 429                         if (is_newline(ch)) {
 430                                 at_sol = 1;
 431                                 qseen = 0;
 432                         } else if (at_sol && ch == first) {
 433                                 qseen += 1;
 434                         } else if (ch != ' ' && ch != '\t') {
 435                                 at_sol = 0;
 436                                 qseen = 0;
 437                         }
 438                 }
 439                 if (qseen != 3) {
 440                         /* Hit end of node - error.
 441                          * unget so the newline is seen,
 442                          * but return rest of string as an error.
 443                          */
 444                         unget_char(state);
 445                         close_token(state, &tk);
 446                         tk.num = TK_error;
 447                         return tk;
 448                 }
 449                 /* 2 letters are allowed */
 450                 ch = get_char(state);
 451                 if (iswalpha(ch))
 452                         ch = get_char(state);
 453                 if (iswalpha(ch))
 454                         ch = get_char(state);
 455                 /* Now we must have a newline, but we don't return it
 456                  * whatever it is.*/
 457                 unget_char(state);
 458                 close_token(state, &tk);
 459                 tk.num = TK_multi_string;
 460                 if (!is_newline(ch))
 461                         tk.num = TK_error;
 462                 return tk;
 463         }
 464
 465 #### Single-line strings
 466
 467 The sequence of marks collected may be more than a single-line
 468 string, so we reset to the start and collect characters until
 469 we find a close quote or a newline.
 470
 471 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 472
 473 ###### parse string
 474         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 475             !(ignored & (1<<TK_string))) {
 476                 wchar_t first = tk.txt.txt[0];
 477                 reset_token(state, &tk);
 478                 get_char(state);
 479                 do
 480                         ch = get_char(state);
 481                 while (ch != first && !is_newline(ch));
 482                 tk.num = TK_string;
 483                 if (is_newline(ch)) {
 484                         unget_char(state);
 485                         tk.num = TK_error;
 486                 }
 487                 close_token(state, &tk);
 488                 return tk;
 489         }
 490
 491 ### Comments
 492
 493 Single line comments may start with '`//`' or '`#`' providing that these
 494 are not known marks.  They continue to the end of the line.
 495
 496 Block comments start with '`/*`' if this is not a known mark.  They
 497 continue to the first occurrence of '`*/`' and may not contain any
 498 occurrence of '`/*`'.
 499
 500 Block comments can be wholly within one line or can continue over
 501 multiple lines.  The multi-line version should be followed immediately
 502 by a newline.  The Linux kernel contains over 285000 multi-line
 503 comments are only 34 are followed by characters other than white space
 504 (which should be removed) or a backslash (only needed in macros).  So
 505 it would not suffer from this rule.
 506
 507 These two comment types are reported as two separate token types, and
 508 consequently can be ignored separately.  When ignored a comment is
 509 parsed and discarded.
 510
 511 ###### token types
 512         TK_line_comment,
 513         TK_block_comment,
 514
 515 ###### internal functions
 516         static int is_line_comment(struct text txt)
 517         {
 518                 return (txt.len >= 1 && txt.txt[0] == '#') ||
 519                        (txt.len >= 2 && txt.txt[0] == '/' &&
 520                                         txt.txt[1] == '/');
 521         }
 522
 523         static int is_block_comment(struct text txt)
 524         {
 525                 return txt.len >= 2 && txt.txt[0] == '/' &&
 526                        txt.txt[1] == '*';
 527         }
 528
 529 #### Single line comments
 530
 531 A single-line comment continues up to, but not including the newline.
 532
 533 ###### parse comment
 534
 535         if (is_line_comment(tk.txt)) {
 536                 while (!is_newline(ch))
 537                         ch = get_char(state);
 538                 unget_char(state);
 539                 close_token(state, &tk);
 540                 tk.num = TK_line_comment;
 541                 if (ignored & (1 << TK_line_comment))
 542                         continue;
 543                 return tk;
 544         }
 545
 546 #### Block comments
 547
 548 The token text collected so far could exceed the comment, so we need
 549 to reset it first.
 550
 551 If we find an embedded `/*` we reset to just before the '/' and report
 552 an error.  That way the next thing to be parsed will be the rest of
 553 the comment.  This requires a double unget, so we need to save/restore
 554 the unget state (explained later).
 555
 556 ###### parse comment
 557
 558         if (is_block_comment(tk.txt)) {
 559                 wchar_t prev;
 560                 int newlines = 0;
 561                 reset_token(state, &tk);
 562                 get_char(state);
 563                 get_char(state);
 564                 save_unget_state(state);
 565                 ch = get_char(state);
 566                 prev = 0;
 567                 while (!at_eon(state) &&
 568                        (prev != '/' || ch != '*') &&
 569                        (prev != '*' || ch != '/')) {
 570                         if (is_newline(ch))
 571                                 newlines = 1;
 572                         prev = ch;
 573                         save_unget_state(state);
 574                         ch = get_char(state);
 575                 }
 576                 close_token(state, &tk);
 577                 if (at_eon(state)) {
 578                         tk.num = TK_error;
 579                         return tk;
 580                 }
 581                 if (prev == '/') {
 582                         /* embedded.  Need to unget twice! */
 583                         restore_unget_state(state);
 584                         unget_char(state);
 585                         tk.num = TK_error;
 586                         return tk;
 587                 }
 588                 tk.num = TK_block_comment;
 589                 if (newlines && !(ignored & (1<<TK_newline))) {
 590                         /* next char must be newline */
 591                         ch = get_char(state);
 592                         unget_char(state);
 593                         if (!is_newline(ch))
 594                                 tk.num = TK_error;
 595                 }
 596                 if (tk.num == TK_error ||
 597                     !(ignored & (1 << TK_block_comment)))
 598                         return tk;
 599                 continue;
 600         }
 601
 602 ### Indents, Newlines, and White Space.
 603
 604 Normally white space is ignored.  However newlines can be important as
 605 can indents, which are either after a newline or at the start of a
 606 node (detected by `at_son()`);
 607
 608 ###### exported functions
 609         static inline int is_newline(wchar_t ch)
 610         {
 611                 return ch == '\n' || ch == '\f' || ch == '\v';
 612         }
 613
 614 ###### white space
 615         if (ch <= ' ' && !is_newline(ch)
 616             && ! at_son(state))
 617                 continue;
 618
 619 If a line starts with more white-space than the previous non-blank
 620 line - or if the first non-blank line in the document starts with any
 621 white-space - then an "IN" is reported at the start of the line.
 622
 623 Before the next non-blank line which starts with less white space, or
 624 at the latest at the end of the document, a matching "OUT" token
 625 is reported.  There will always be an exact match between "IN" and
 626 "OUT" tokens.
 627
 628 It is possible for "OUT" to be followed (almost) immediately by an
 629 "IN".  This happens if, for example, the indent of three consecutive
 630 lines are 0, 8, 4 spaces.  Before the second line we report an
 631 "IN".  Before the third line we must report an "OUT", as 4 is less
 632 than 8, then also an Ident as 4 is greater than 0.
 633
 634 ###### token types
 635         TK_in,
 636         TK_out,
 637
 638 For the purpose of measuring the length of white space, a tab adds at
 639 least one space, and rounds up to a multiple of 8.
 640
 641 ###### exported functions
 642         static inline int indent_tab(int indent)
 643         {
 644                 return (indent|7)+1;
 645         }
 646
 647 We need to track the current levels of indent.  This requires some
 648 sort of stack as indent levels are pushed on and popped off.  In
 649 practice this stack is unlikely to often exceed 5 so we will used a
 650 fixed stack of 20 indent levels.  More than this will be silently
 651 ignored.
 652
 653 ###### state fields
 654         int     indent_level;
 655         int     indent_sizes[20];
 656
 657 #### Newlines
 658
 659 Newlines can optionally be reported.  Newlines within a block comment
 660 or a multi-line string are not reported separately, but each of these
 661 must be followed immediately by a newline so these constructs cannot
 662 hide the fact that a newline was present.
 663
 664 When indents are being reported, the Newline which would normally be
 665 reported immediately before the "IN" is delayed until after the
 666 matching "OUT".  This makes an indented section act like a
 667 continuation of the previous line to some extent.
 668
 669 A blank line would normally be reported simply as two consecutive Newline
 670 tokens.  However if the subsequent line is indented (and indents are being
 671 reported) then the right thing to do is less obvious as Newlines should be
 672 delayed - but how many Newlines?
 673
 674 The approach we will take is to report the extra Newlines immediately after
 675 the IN token, so the blank line is treated as though it were an indented
 676 blank line.
 677
 678 ###### token types
 679         TK_newline,
 680
 681 If we find a newline or white space at the start of a block, we keep
 682 collecting spaces, tabs, and newlines until we find some real text.
 683 Then depending on the indent we generate some number of tokens.  These
 684 will be a sequence of "Newline OUT" pairs representing a decrease
 685 in indent, then either a Newline or an IN depending on whether the
 686 next line is indented, then zero or more Newlines representing all the
 687 blank lines that have been skipped.
 688
 689 When a Newline leads to the next block of code there is a question of
 690 whether the various Newline and OUT/IN tokens should appear to
 691 pbelong to the earlier or later block.  This is addressed by processing
 692 the tokens in two stages based on the relative indent levels of the
 693 two blocks (each block has a base indent to which the actual indents
 694 are added).
 695
 696 Any "Newline OUT" pairs needed to reduce the current indent to the
 697 maximum of the base indents of the old and new blocks are generated
 698 against the old block.  Then if the next block does not have an
 699 increased indent, one more "Newline" is generated.
 700
 701 If further "Newline OUT" pairs are needed to get to the indent
 702 level of the 'next' block, they are generated against that block,
 703 though the first Newline is suppressed (it having already been
 704 generated).
 705
 706 Finally the Newline or IN for the first line of the new block is
 707 generated, unless the Newline needs to be suppressed because it
 708 appeared at the end of the previous block.
 709
 710 This means that a block may start with an OUT or an IN, but
 711 will only start with a Newline if it actually starts with a blank
 712 line.
 713
 714 We will need to represent in the `token_state` where in this sequence
 715 of delayed tokens we are.  As `state.col` records the target indent we
 716 don't need to record how many OUTs or INs are needed.  We do
 717 need to record the number of blank lines, and which of Newline and
 718 OUT is needed next in the initial sequence of pairs.
 719
 720 For this we store one more than the number of blank lines as
 721 `delayed_lines` and a flag for `out_next`.
 722
 723 ###### state fields
 724         int check_indent;
 725         int delayed_lines;
 726         int out_next;
 727
 728 Generating these tokens involve two separate pieces of code.
 729
 730 Firstly we need to recognise white space and count the indents and
 731 newlines.  These are recorded in the above state fields.
 732
 733 Separately we need, on each call to `token_next`, we need to check if
 734 there are some delayed tokens and if so we need to advance the state
 735 information and return one token.
 736
 737 ###### white space
 738         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 739                 int newlines = 0;
 740                 int was_son = at_son(state);
 741                 if (ignored & (1<<TK_in)) {
 742                         if (!is_newline(ch))
 743                                 continue;
 744                         if (ignored & (1<<TK_newline))
 745                                 continue;
 746                         tk.num = TK_newline;
 747                         close_token(state, &tk);
 748                         return tk;
 749                 }
 750                 // Indents are needed, so check all white space.
 751                 while (ch <= ' ' && !at_eon(state)) {
 752                         if (is_newline(ch))
 753                                 newlines += 1;
 754                         ch = get_char(state);
 755                 }
 756                 if (at_eon(state)) {
 757                         newlines += 1;
 758                         if (state->node->next &&
 759                             state->node->next->indent > state->node->indent)
 760                                 state->col = state->node->next->indent;
 761                         else
 762                                 state->col = state->node->indent;
 763                 } else
 764                         unget_char(state);
 765                 state->delayed_lines = newlines;
 766                 state->out_next = was_son;
 767                 state->check_indent = 1;
 768                 continue;
 769         }
 770
 771
 772 ###### delayed tokens
 773
 774         if (state->check_indent || state->delayed_lines) {
 775                 if (state->col < state->indent_sizes[state->indent_level]) {
 776                         if (!state->out_next &&
 777                             !(ignored & (1<<TK_newline))) {
 778                                 state->out_next = 1;
 779                                 tk.num = TK_newline;
 780                                 return tk;
 781                         }
 782                         state->indent_level -= 1;
 783                         state->out_next = 0;
 784                         tk.num = TK_out;
 785                         return tk;
 786                 }
 787                 if (state->col > state->indent_sizes[state->indent_level] &&
 788                     state->indent_level < sizeof(state->indent_sizes)-1) {
 789                         state->indent_level += 1;
 790                         state->indent_sizes[state->indent_level] = state->col;
 791                         state->delayed_lines -= 1;
 792                         tk.num = TK_in;
 793                         return tk;
 794                 }
 795                 state->check_indent = 0;
 796                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 797                         tk.num = TK_newline;
 798                         state->delayed_lines -= 1;
 799                         return tk;
 800                 }
 801                 state->delayed_lines = 0;
 802                 continue;
 803         }
 804
 805 ### End of File
 806
 807 After the last newline in the file has been processed, a special
 808 end-of-file token will be returned.  any further attempts to get more
 809 tokens will continue to return the same end-of-file token.
 810
 811 ###### token types
 812         TK_eof,
 813
 814
 815 ###### white space
 816         if (ch == WEOF) {
 817                 tk.num = TK_eof;
 818                 return tk;
 819         }
 820
 821 ### Unknown Marks, or errors.
 822
 823 We have now handled all the possible known mark-like tokens.
 824 If the token we have is not empty and `TK_mark` is allowed,
 825 we have an unknown mark, otherwise this must be an error.
 826
 827 ###### unknown mark
 828         /* one unknown character */
 829         close_token(state, &tk);
 830         tk.num = TK_error;
 831         return tk;
 832
 833 ## Tools For The Task
 834
 835 You may have noticed that are few gaps we left in the above -
 836 functions used without first defining them.  Doing so above would have
 837 broken the flow.
 838
 839 ### Character by character
 840
 841 As we walk through the various `code_node`s we need to process whole
 842 Unicode codepoints, and keep track of which line and column we are on.
 843 We will assume for now that any printing character uses one column,
 844 though that is not true in general.
 845
 846 As the text in a `code_node` may include an indent that identifies it as
 847 being code, we need to be careful to strip that.  The `code_node` has
 848 a flag that tells us whether or not we need to strip.
 849
 850 ###### includes
 851         #include <memory.h>
 852
 853 ###### state fields
 854         struct code_node *node;
 855         int    offset;
 856         int    line;
 857         int    col;
 858
 859 ###### internal functions
 860
 861         static void do_strip(struct token_state *state)
 862         {
 863                 if (state->node->needs_strip) {
 864                         int n = 4;
 865                         while (n && state->node->code.txt[state->offset] == ' ') {
 866                                 state->offset += 1;
 867                                 n -= 1;
 868                         }
 869                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
 870                                 state->offset += 1;
 871                                 n -= 4;
 872                         }
 873                 }
 874         }
 875
 876         static wint_t get_char(struct token_state *state)
 877         {
 878                 wchar_t next;
 879                 size_t n;
 880                 mbstate_t mbstate;
 881
 882                 if (state->node == NULL)
 883                         return WEOF;
 884                 if (state->node->code.len <= state->offset) {
 885                         do
 886                                 state->node = state->node->next;
 887                         while (state->node && state->node->code.txt == NULL);
 888                         state->offset = 0;
 889                         if (state->node == NULL)
 890                                 return WEOF;
 891                         do_strip(state);
 892                         state->line = state->node->line_no;
 893                         state->col = state->node->indent;
 894                 }
 895
 896                 ## before get_char
 897
 898                 memset(&mbstate, 0, sizeof(mbstate));
 899
 900                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 901                             state->node->code.len - state->offset,
 902                             &mbstate);
 903                 if (n == -2 || n == 0) {
 904                         /* Not enough bytes - not really possible */
 905                         next = '\n';
 906                         state->offset = state->node->code.len;
 907                 } else if (n == -1) {
 908                         /* error */
 909                         state->offset += 1;
 910                         next = 0x7f; // an illegal character
 911                 } else
 912                         state->offset += n;
 913
 914                 if (next >= ' ') {
 915                         state->col += 1;
 916                 } else if (is_newline(next)) {
 917                         state->line += 1;
 918                         state->col = state->node->indent;
 919                         do_strip(state);
 920                 } else if (next == '\t') {
 921                         state->col = indent_tab(state->col);
 922                 }
 923                 return next;
 924         }
 925
 926 We will sometimes want to "unget" the last character as it needs to be
 927 considered again as part of the next token.  So we need to store a
 928 'previous' version of all metadata.
 929
 930 ###### state fields
 931         int    prev_offset;
 932         int    prev_line;
 933         int    prev_col;
 934
 935 ###### before get_char
 936         state->prev_offset = state->offset;
 937         state->prev_line   = state->line;
 938         state->prev_col    = state->col;
 939
 940 ###### internal functions
 941
 942         static void unget_char(struct token_state *state)
 943         {
 944                 if (state->node) {
 945                         state->offset = state->prev_offset;
 946                         state->line   = state->prev_line;
 947                         state->col    = state->prev_col;
 948                 }
 949         }
 950
 951 We occasionally need a double-unget, particularly for numbers and
 952 block comments.  We don't impose this cost on all scanning, but
 953 require those code sections that need it to call `save_unget_state`
 954 before each `get_char`, and then `restore_unget_state` when a
 955 double-unget is needed.
 956
 957 ###### state fields
 958         int     prev_offset2;
 959         int     prev_line2;
 960         int     prev_col2;
 961
 962 ###### internal functions
 963         static void save_unget_state(struct token_state *state)
 964         {
 965                 state->prev_offset2 = state->prev_offset;
 966                 state->prev_line2 = state->prev_line;
 967                 state->prev_col2 = state->prev_col;
 968         }
 969
 970         static void restore_unget_state(struct token_state *state)
 971         {
 972                 state->prev_offset = state->prev_offset2;
 973                 state->prev_line = state->prev_line2;
 974                 state->prev_col = state->prev_col2;
 975         }
 976
 977 At the start of a token we don't want to be at the end of a code block
 978 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
 979 single character.  This will move into the next non-empty code block
 980 and leave the current pointer at the start of it.
 981
 982 This has to happen _after_ dealing with delayed tokens as some of them
 983 must appear in the previous node.  When we do this, we need to reset
 984 the data in the token.
 985
 986 ###### delayed tokens
 987         if (at_eon(state)) {
 988                 get_char(state);
 989                 unget_char(state);
 990                 tk.node = state->node;
 991                 if (state->node)
 992                         tk.txt.txt = state->node->code.txt + state->offset;
 993                 tk.line = state->line;
 994                 tk.col = state->col;
 995                 tk.txt.len = 0;
 996         }
 997
 998 ### Managing tokens
 999
1000 The current token is initialized to line up with the first character
1001 that we 'get' for each token.  When we have, or might have, a full
1002 token we can call `close_token` to set the `len` of the token
1003 appropriately.  This can safely be called multiple times.
1004
1005 Finally we occasionally (for single-line strings and block comments)
1006 need to reset to the beginning of the current token as we might have
1007 parsed too much already.  For that there is `reset_token`.
1008
1009 ###### one token
1010         tk.node = state->node;
1011         if (state->node)
1012                 tk.txt.txt = state->node->code.txt + state->offset;
1013         tk.line = state->line;
1014         tk.col = state->col;
1015         tk.txt.len = 0;
1016
1017 ###### internal functions
1018
1019         static void close_token(struct token_state *state,
1020                                 struct token *tk)
1021         {
1022                 tk->txt.len = (state->node->code.txt + state->offset)
1023                               - tk->txt.txt;
1024         }
1025
1026         static void reset_token(struct token_state *state, struct token *tok)
1027         {
1028                 state->prev_line = tok->line;
1029                 state->prev_col = tok->col;
1030                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1031                 unget_char(state);
1032                 tok->txt.len = 0;
1033         }
1034
1035
1036 Tokens make not cross into the next `code_node`, and some tokens can
1037 include the newline at the and of a `code_node`, we must be able to
1038 easily check if we have reached the end.  Equally we need to know if
1039 we are at the start of a node, as white space is treated a little
1040 differently there.
1041
1042 ###### internal functions
1043
1044         static int at_son(struct token_state *state)
1045         {
1046                 return state->offset == 0;
1047         }
1048
1049         static int at_eon(struct token_state *state)
1050         {
1051                 // at end-of-node ??
1052                 return state->node == NULL ||
1053                        state->offset >= state->node->code.len;
1054         }
1055
1056 ### Find a known word
1057
1058 As the known-word list is sorted we can use a simple binary search.
1059 Following the pattern established in "mdcode", we will use a `struct
1060 text` with start and length to represent the code fragment we are
1061 searching for.
1062
1063 ###### internal functions
1064         static int find_known(struct token_config *conf, struct text txt)
1065         {
1066                 int lo = 0;
1067                 int hi = conf->known_count;
1068
1069                 while (lo + 1 < hi) {
1070                         int mid = (lo + hi) / 2;
1071                         int cmp = strncmp(conf->words_marks[mid],
1072                                           txt.txt, txt.len);
1073                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1074                                 cmp = 1;
1075                         if (cmp <= 0)
1076                                 lo = mid;
1077                         else
1078                                 hi = mid;
1079                 }
1080                 if (strncmp(conf->words_marks[lo],
1081                            txt.txt, txt.len) == 0
1082                     && conf->words_marks[lo][txt.len] == 0)
1083                         return lo;
1084                 else
1085                         return -1;
1086         }
1087
1088 ### Bringing it all together
1089
1090 Now we have all the bits there is just one section missing:  combining
1091 all the token parsing code into one block.
1092
1093 The handling of delayed tokens (Newlines, INs, OUTs) must come
1094 first before we try getting another character.
1095
1096 Then we parse all the test, making sure that we check for known marks
1097 before strings and comments, but unknown marks after strings and comments.
1098
1099 This block of code will either return a token, or will choose to
1100 ignore one, in which case it will `continue` around to the top of the
1101 loop.
1102
1103 ###### one token
1104         ## delayed tokens
1105
1106         ch = get_char(state);
1107
1108         ## white space
1109         ## parse number
1110         ## parse word
1111         ## parse mark
1112         ## parse string
1113         ## parse comment
1114         ## unknown mark
1115
1116 ### Start and stop
1117
1118 As well as getting tokens, we need to be able to create the
1119 `token_state` to start with, and discard it later.
1120
1121 ###### includes
1122         #include <malloc.h>
1123
1124 ###### main functions
1125         struct token_state *token_open(struct code_node *code, struct
1126                                        token_config *conf)
1127         {
1128                 struct token_state *state = malloc(sizeof(*state));
1129                 memset(state, 0, sizeof(*state));
1130                 state->node = code;
1131                 state->line = code->line_no;
1132                 state->col = code->indent;
1133                 state->conf = conf;
1134                 do_strip(state);
1135                 return state;
1136         }
1137         void token_close(struct token_state *state)
1138         {
1139                 free(state);
1140         }
1141
1142 ###### exported functions
1143         struct token_state *token_open(struct code_node *code, struct
1144                                        token_config *conf);
1145         void token_close(struct token_state *state);
1146
1147 ### Trace tokens
1148
1149 Getting tokens is the main thing but it is also useful to be able to
1150 print out token information, particularly for tracing and testing.
1151
1152 Known tokens are printed verbatim.  Other tokens are printed as
1153 `type(content)` where content is truncated to a given number of characters.
1154
1155 The function for printing a truncated string (`text_dump`) is also exported
1156 so that it can be used to tracing processed strings too.
1157
1158 ###### includes
1159         #include <stdio.h>
1160
1161 ###### exported functions
1162         void token_trace(FILE *f, struct token tok, int max);
1163         void text_dump(FILE *f, struct text t, int max);
1164
1165 ###### main functions
1166
1167         void text_dump(FILE *f, struct text txt, int max)
1168         {
1169                 int i;
1170                 if (txt.len > max)
1171                         max -= 2;
1172                 else
1173                         max = txt.len;
1174                 for (i = 0; i < max; i++) {
1175                         char c = txt.txt[i];
1176                         if (c < ' ' || c > '~')
1177                                 fprintf(f, "\\x%02x", c & 0xff);
1178                         else if (c == '\\')
1179                                 fprintf(f, "\\\\");
1180                         else
1181                                 fprintf(f, "%c", c);
1182                 }
1183                 if (i < txt.len)
1184                         fprintf(f, "..");
1185         }
1186
1187         void token_trace(FILE *f, struct token tok, int max)
1188         {
1189                 static char *types[] = {
1190                         [TK_ident] = "ident",
1191                         [TK_mark] = "mark",
1192                         [TK_number] = "number",
1193                         [TK_string] = "string",
1194                         [TK_multi_string] = "mstring",
1195                         [TK_line_comment] = "lcomment",
1196                         [TK_block_comment] = "bcomment",
1197                         [TK_in] = "in",
1198                         [TK_out] = "out",
1199                         [TK_newline] = "newline",
1200                         [TK_eof] = "eof",
1201                         [TK_error] = "ERROR",
1202                         };
1203
1204                 switch (tok.num) {
1205                 default: /* known word or mark */
1206                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1207                         break;
1208                 case TK_in:
1209                 case TK_out:
1210                 case TK_newline:
1211                 case TK_eof:
1212                         /* No token text included */
1213                         fprintf(f, "%s()", types[tok.num]);
1214                         break;
1215                 case TK_ident:
1216                 case TK_mark:
1217                 case TK_number:
1218                 case TK_string:
1219                 case TK_multi_string:
1220                 case TK_line_comment:
1221                 case TK_block_comment:
1222                 case TK_error:
1223                         fprintf(f, "%s(", types[tok.num]);
1224                         text_dump(f, tok.txt, max);
1225                         fprintf(f, ")");
1226                         break;
1227                 }
1228         }
1229
1230 ### And there we have it
1231
1232 We now have all the library functions defined for reading and printing
1233 tokens.  Now we just need C files to store them, and a mk file to make them.
1234
1235 ###### File: scanner.h
1236         ## public types
1237         ## exported functions
1238
1239 ###### File: libscanner.c
1240         ## includes
1241         #include "scanner.h"
1242         ## private types
1243         ## internal functions
1244         ## main functions
1245
1246 ###### File: scanner.mk
1247
1248         CFLAGS += -Wall -g
1249         all ::
1250         scanner.mk scanner.h libscanner.c : scanner.mdc
1251                 ./md2c scanner.mdc
1252         all :: libscanner.o
1253         libscanner.o : libscanner.c
1254                 $(CC) $(CFLAGS) -c libscanner.c
1255
1256 ## Processing numbers
1257
1258 Converting a `TK_number` token to a numerical value is a slightly
1259 higher level task than lexical analysis, and slightly lower than
1260 grammar parsing, so put it here - as an index if you like.
1261
1262 Importantly it will be used by the same testing rig that is used for
1263 testing the token scanner.
1264
1265 The numeric value that we will convert all numbers into is the `mpq_t`
1266 from the GNU high precision number library "libgmp".
1267
1268 ###### number includes
1269         #include <gmp.h>
1270         #include "mdcode.h"
1271
1272 Firstly we need to be able to parse a string of digits in a given base
1273 and possibly with a decimal marker.  We store this in an `mpz_t`
1274 integer and report the number of digits after the decimal mark.
1275
1276 On error we return zero and ensure that the 'mpz_t' has been freed, or
1277 had never been initialised.
1278
1279 ###### number functions
1280
1281         static int parse_digits(mpz_t num, struct text tok, int base,
1282                                 int *placesp)
1283         {
1284                 /* Accept digits up to 'base', ignore '_' and
1285                  * ' ' if they appear between two legal digits,
1286                  * and if `placesp` is not NULL, allow a single
1287                  * '.' or ',' and report the number of digits
1288                  * beyond there.
1289                  * Return number of characters processed (p),
1290                  * or 0 if something illegal was found.
1291                  */
1292                 int p;
1293                 int decimal = -1; // digits after marker
1294                 enum {Digit, Space, Other} prev = Other;
1295                 int digits = 0;
1296
1297                 for (p = 0; p < tok.len; p++) {
1298                         int dig;
1299                         char c = tok.txt[p];
1300
1301                         if (c == '_' || c == ' ') {
1302                                 if (prev != Digit)
1303                                         goto bad;
1304                                 prev = Space;
1305                                 continue;
1306                         }
1307                         if (c == '.' || c == ',') {
1308                                 if (prev != Digit)
1309                                         goto bad;
1310                                 if (!placesp || decimal >= 0)
1311                                         return p-1;
1312                                 decimal = 0;
1313                                 prev = Other;
1314                                 continue;
1315                         }
1316                         if (isdigit(c))
1317                                 dig = c - '0';
1318                         else if (isupper(c))
1319                                 dig = 10 + c - 'A';
1320                         else if (islower(c))
1321                                 dig = 10 + c - 'a';
1322                         else
1323                                 dig = base;
1324                         if (dig >= base) {
1325                                 if (prev == Space)
1326                                         p--;
1327                                 break;
1328                         }
1329                         prev = Digit;
1330                         if (digits)
1331                                 mpz_mul_ui(num, num, base);
1332                         else
1333                                 mpz_init(num);
1334                         digits += 1;
1335                         mpz_add_ui(num, num, dig);
1336                         if (decimal >= 0)
1337                                 decimal++;
1338                 }
1339                 if (digits == 0)
1340                         return 0;
1341                 if (placesp) {
1342                         if (decimal >= 0)
1343                                 *placesp = decimal;
1344                         else
1345                                 *placesp = 0;
1346                 }
1347                 return p;
1348         bad:
1349                 if (digits)
1350                         mpz_clear(num);
1351                 return 0;
1352         }
1353
1354 ###### number includes
1355         #include <ctype.h>
1356
1357 To parse a full number we need to consider the optional base, the
1358 mantissa, and the optional exponent.  We will treat these one at a
1359 time.
1360
1361 The base is indicated by a letter after a leading zero, which must be
1362 followed by a base letter or a period.  The base also determines the
1363 character which will mark an exponent.
1364
1365 ###### number vars
1366         int base = 10;
1367         char expc = 'e';
1368
1369 ###### parse base
1370
1371         if (tok.txt[0] == '0' && tok.len > 1) {
1372                 int skip = 0;
1373                 switch(tok.txt[1]) {
1374                 case 'x':
1375                 case 'X':
1376                         base = 16;
1377                         skip = 2;
1378                         expc = 'p';
1379                         break;
1380                 case 'o':
1381                 case 'O':
1382                         base = 8;
1383                         skip = 2;
1384                         expc = 'p';
1385                         break;
1386                 case 'b':
1387                 case 'B':
1388                         base = 2;
1389                         skip = 2;
1390                         expc = 'p';
1391                         break;
1392                 case '0':
1393                 case '1':
1394                 case '2':
1395                 case '3':
1396                 case '4':
1397                 case '5':
1398                 case '6':
1399                 case '7':
1400                 case '8':
1401                 case '9':
1402                 case '_':
1403                 case ' ':
1404                         // another digit is not permitted
1405                         // after a zero.
1406                         return 0;
1407                 default:
1408                         // must be decimal marker or trailing
1409                         // letter, which are OK;
1410                         break;
1411                 }
1412                 tok.txt += skip;
1413                 tok.len -= skip;
1414         }
1415
1416 After the base is the mantissa, which may contain a decimal mark, so
1417 we need to record the number of places.  We won't impose the number of
1418 places until we have the exponent as well.
1419
1420 ###### number vars
1421         int places =0;
1422         mpz_t mant;
1423         int d;
1424
1425 ###### parse mantissa
1426
1427         d = parse_digits(mant, tok, base, &places);
1428         if (d == 0)
1429                 return 0;
1430         tok.txt += d;
1431         tok.len -= d;
1432         mpq_init(num);
1433         mpq_set_z(num, mant);
1434         mpz_clear(mant);
1435
1436 After the mantissa number may come an exponent which may be positive
1437 or negative.  We assume at this point that we have seen the exponent
1438 character `expc`.
1439
1440 ###### number vars
1441         long lexp = 0;
1442         mpz_t exp;
1443         int esign = 1;
1444
1445 ###### parse exponent
1446         if (tok.len > 1) {
1447                 if (tok.txt[0] == '+') {
1448                         tok.txt++;
1449                         tok.len--;
1450                 } else if (tok.txt[0] == '-') {
1451                         esign = -1;
1452                         tok.txt++;
1453                         tok.len--;
1454                 }
1455         }
1456         d = parse_digits(exp, tok, 10, NULL);
1457         if (d == 0) {
1458                 mpq_clear(num);
1459                 return 0;
1460         }
1461         if (!mpz_fits_slong_p(exp)) {
1462                 mpq_clear(num);
1463                 mpz_clear(exp);
1464                 return 0;
1465         }
1466         lexp = mpz_get_si(exp) * esign;
1467         mpz_clear(exp);
1468         tok.txt += d;
1469         tok.len -= d;
1470
1471
1472 Now that we have the mantissa and the exponent we can multiply them
1473 together, also allowing for the number of digits after the decimal
1474 mark.
1475
1476 For base 10, we simply subtract the decimal places from the exponent.
1477 For the other bases, as the exponent is alway based on 2, even for
1478 octal and hex, we need a bit more detail.
1479 We then recover the sign from the exponent, as division is quite
1480 different from multiplication.
1481
1482 ###### calc exponent
1483         switch (base) {
1484         case 10:
1485         case 2:
1486                 lexp -= places;
1487                 break;
1488         case 16:
1489                 lexp -= 4*places;
1490                 break;
1491         case 8:
1492                 lexp -= 3*places;
1493                 break;
1494         }
1495         if (lexp < 0) {
1496                 lexp = -lexp;
1497                 esign = -1;
1498         } else
1499                 esign = 1;
1500
1501 Imposing the exponent on the number is also very different for base 10
1502 than for the others.  For the binary shift `gmp` provides a simple
1503 function.  For base 10 we use something like Russian Peasant
1504 Multiplication.
1505
1506 ###### calc exponent
1507         if (expc == 'e') {
1508                 mpq_t tens;
1509                 mpq_init(tens);
1510                 mpq_set_ui(tens, 10, 1);
1511                 while (1) {
1512                         if (lexp & 1) {
1513                                 if (esign > 0)
1514                                         mpq_mul(num, num, tens);
1515                                 else
1516                                         mpq_div(num, num, tens);
1517                         }
1518                         lexp >>= 1;
1519                         if (lexp == 0)
1520                                 break;
1521                         mpq_mul(tens, tens, tens);
1522                 }
1523                 mpq_clear(tens);
1524         } else {
1525                 if (esign > 0)
1526                         mpq_mul_2exp(num, num, lexp);
1527                 else
1528                         mpq_div_2exp(num, num, lexp);
1529         }
1530
1531 Now we are ready to parse a number: the base, mantissa, and exponent.
1532 If all goes well we check for the possible trailing letters and
1533 return.  Return value is 1 for success and 0 for failure.
1534
1535
1536 ###### number functions
1537         int number_parse(mpq_t num, char tail[3], struct text tok)
1538         {
1539                 ## number vars
1540                 int i;
1541
1542                 ## parse base
1543                 ## parse mantissa
1544                 if (tok.len > 1 && (tok.txt[0] == expc ||
1545                                     tok.txt[0] == toupper(expc))) {
1546                         tok.txt++;
1547                         tok.len--;
1548                         ## parse exponent
1549                 }
1550                 ## calc exponent
1551
1552                 for (i = 0; i < 2; i++) {
1553                         if (tok.len <= i)
1554                                 break;
1555                         if (!isalpha(tok.txt[i]))
1556                                 goto err;
1557                         tail[i] = tok.txt[i];
1558                 }
1559                 tail[i] = 0;
1560                 if (i == tok.len)
1561                         return 1;
1562         err:
1563                 mpq_clear(num);
1564                 return 0;
1565         }
1566
1567 Number parsing goes in `libnumber.c`
1568
1569 ###### File: libnumber.c
1570
1571         #include <unistd.h>
1572         #include <stdlib.h>
1573
1574         ## number includes
1575         ## number functions
1576
1577 ###### File: number.h
1578         int number_parse(mpq_t num, char tail[3], struct text tok);
1579
1580 ###### File: scanner.mk
1581         all :: libnumber.o
1582         libnumber.o : libnumber.c
1583                 $(CC) $(CFLAGS) -c libnumber.c
1584
1585 ## Processing strings
1586
1587 Both `TK_string` and `TK_multi_string` require post-processing which
1588 can be one of two types: literal or with escapes processed.
1589 Even literal processing is non-trivial as the file may contain indents
1590 which need to be stripped.
1591
1592 Errors can only occur when processing escapes.  Any unrecognised
1593 character following the escape character will cause an error.
1594
1595 Processing escapes and striping indents can only make the string
1596 shorter, not longer, so we allocate a buffer which is the same size as
1597 the string and process into that.
1598
1599 To request escape processing, we pass the character we want to use for
1600 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1601
1602 ###### string main
1603         int string_parse(struct token *tok, char escape,
1604                          struct text *str, char tail[3])
1605         {
1606                 ## string vars
1607                 struct text t = tok->txt;
1608
1609                 str->txt = NULL;
1610                 ## strip tail
1611                 if (tok->num == TK_string) {
1612                         ## strip single
1613                 } else {
1614                         ## strip multi
1615                 }
1616                 str->txt = malloc(t.len);
1617                 str->len = 0;
1618
1619                 ## process string
1620                 return 1;
1621         err:
1622                 free(str->txt);
1623                 str->txt = NULL;
1624                 return 0;
1625         }
1626
1627 ### strip tail
1628
1629 The tail of the string can be 0, 1, or 2 letters
1630
1631         i = t.len;
1632         if (i >= 0 && isalpha(t.txt[i-1]))
1633                 i -= 1;
1634         if (i >= 0 && isalpha(t.txt[i-1]))
1635                 i -= 1;
1636         strncpy(tail, t.txt+i, t.len-i);
1637         tail[t.len-i] = 0;
1638         t.len = i;
1639
1640 ###### string vars
1641         int i;
1642
1643 ### strip single
1644
1645 Stripping the quote of a single-line string is trivial.
1646 The only part that is at all interesting is that quote character must
1647 be remembered.
1648
1649         quote = t.txt[0];
1650         if (t.txt[t.len-1] != quote)
1651                 goto err;
1652         t.txt += 1;
1653         t.len -= 2;
1654
1655 ###### string vars
1656         char quote;
1657
1658 ### strip multi
1659
1660 For a multi-line string we have a little more work to do.  We need to
1661 remove 3 quotes, not 1, and need to count the indent of the close
1662 quote as it will need to be stripped from all lines.
1663
1664         quote = t.txt[0];
1665         if (t.len < 7 ||
1666             t.txt[1] != quote || t.txt[2] != quote ||
1667             !is_newline(t.txt[3]))
1668                 goto err;
1669         t.txt += 4;
1670         t.len -= 4;
1671         i = t.len;
1672         if (i <= 0 || t.txt[i-1] != quote)
1673                 goto err;
1674         i -= 1;
1675         if (i <= 0 || t.txt[i-1] != quote)
1676                 goto err;
1677         i -= 1;
1678         if (i <= 0 || t.txt[i-1] != quote)
1679                 goto err;
1680         i -= 1;
1681         t.len = i;
1682         while (i > 0 && !is_newline(t.txt[i-1]))
1683                 i--;
1684         indent = 0;
1685         while (i < t.len) {
1686                 if (t.txt[i] == ' ')
1687                         indent += 1;
1688                 if (t.txt[i] == '\t')
1689                         indent = indent_tab(indent);
1690                 i++;
1691         }
1692
1693 ###### string vars
1694         int indent = 0;
1695
1696 ### process string
1697
1698 Now we just take one byte at a time. trans-ASCII unicode won't look
1699 like anything we are interested in so it will just be copied byte by
1700 byte.
1701
1702         cp = str->txt;
1703         at_sol = 1;
1704         for (i = 0; i < t.len; i++) {
1705                 char c;
1706                 if (at_sol) {
1707                         at_sol = 0;
1708                         ## strip indent
1709                         if (i >= t.len)
1710                                 break;
1711                 }
1712                 c = t.txt[i];
1713                 if (c != escape) {
1714                         *cp = c;
1715                         cp += 1;
1716                         if (is_newline(c))
1717                                 at_sol = 1;
1718                 } else if (i+1 >= t.len) {
1719                         // escape and end of string
1720                         goto err;
1721                 } else {
1722                         i += 1;
1723                         c = t.txt[i];
1724                         ## parse escape
1725                 }
1726         }
1727         str->len = cp - str->txt;
1728
1729 ###### string vars
1730         char *cp;
1731         int at_sol;
1732
1733 ### strip indent
1734
1735 Every time we find a start of line, we strip spaces and tabs until the
1736 required indent is found.
1737
1738         int skipped = 0;
1739         while (i < t.len && skipped < indent) {
1740                 c = t.txt[i];
1741                 if (c == ' ')
1742                         skipped += 1;
1743                 else if (c == '\t')
1744                         skipped = indent_tab(c);
1745                 else
1746                         break;
1747                 i+= 1;
1748         }
1749
1750 ### parse escape
1751         switch (c) {
1752         case 'n':
1753                 *cp++ = '\n'; break;
1754         case 'r':
1755                 *cp++ = '\r'; break;
1756         case 't':
1757                 *cp++ = '\t'; break;
1758         case 'b':
1759                 *cp++ = '\b'; break;
1760         case 'q':
1761                 *cp++ = quote; break;
1762         case 'f':
1763                 *cp++ = '\f'; break;
1764         case 'v':
1765                 *cp++ = '\v'; break;
1766         case 'a':
1767                 *cp++ = '\a'; break;
1768         case '0':
1769         case '1':
1770         case '2':
1771         case '3':
1772                 // 3 digit octal number
1773                 if (i+2 >= t.len)
1774                         goto err;
1775                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1776                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1777                         goto err;
1778                 n = (t.txt[i  ]-'0') * 64 +
1779                     (t.txt[i+1]-'0') *  8 +
1780                     (t.txt[i+2]-'0') *  1;
1781                 *cp++ = n;
1782                 i += 2;
1783                 break;
1784         case 'x':
1785                 // 2 hex digits
1786                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1787                 if (n < 0)
1788                         goto err;
1789                 *cp++ = n;
1790                 i += 2;
1791                 break;
1792         case 'u':
1793         case 'U':
1794                 // 4 or 8 hex digits for unicode
1795                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1796                 if (n < 0)
1797                         goto err;
1798                 memset(&pstate, 0, sizeof(pstate));
1799                 n = wcrtomb(cp, n, &pstate);
1800                 if (n <= 0)
1801                         goto err;
1802                 cp += n;
1803                 i += c == 'u' ? 4 : 8;
1804                 break;
1805         default:
1806                 if (c == escape)
1807                         *cp++ = c;
1808                 else if (is_newline(c))
1809                         at_sol = 1;
1810                 else
1811                         goto err;
1812         }
1813
1814 ###### string vars
1815         long n;
1816         mbstate_t pstate;
1817
1818 For `\x` `\u` and `\U` we need to collect a specific number of
1819 hexadecimal digits
1820
1821 ###### string functions
1822
1823         static long take_hex(int digits, char *cp, int l)
1824         {
1825                 long n = 0;
1826                 if (l < digits)
1827                         return -1;
1828                 while (digits) {
1829                         char  c = *cp;
1830                         int d;
1831                         if (!isxdigit(c))
1832                                 return -1;
1833                         if (isdigit(c))
1834                                 d = c - '0';
1835                         else if (isupper(c))
1836                                 d = 10 + c - 'A';
1837                         else
1838                                 d = 10 + c - 'a';
1839                         n = n * 16 + d;
1840                         digits--;
1841                         cp++;
1842                 }
1843                 return n;
1844         }
1845
1846 #### File: libstring.c
1847
1848 String parsing goes in `libstring.c`
1849
1850         #include <unistd.h>
1851         #include <stdlib.h>
1852         #include <stdio.h>
1853         #include <string.h>
1854         #include <ctype.h>
1855         #include <wchar.h>
1856         #include "mdcode.h"
1857         #include "scanner.h"
1858         ## string functions
1859         ## string main
1860
1861 ###### File: string.h
1862         int string_parse(struct token *tok, char escape,
1863                          struct text *str, char tail[3]);
1864
1865 ###### File: scanner.mk
1866         all :: libstring.o
1867         libstring.o : libstring.c
1868                 $(CC) $(CFLAGS) -c libstring.c
1869
1870
1871 ## Testing
1872
1873 As "untested code is buggy code" we need a program to easily test
1874 the scanner library.  This will simply parse a given file and report
1875 the tokens one per line.
1876
1877 ###### File: scanner.c
1878
1879         #include <unistd.h>
1880         #include <stdlib.h>
1881         #include <fcntl.h>
1882         #include <errno.h>
1883         #include <sys/mman.h>
1884         #include <string.h>
1885         #include <stdio.h>
1886         #include <gmp.h>
1887         #include <locale.h>
1888         #include "mdcode.h"
1889         #include "scanner.h"
1890         #include "number.h"
1891         #include "string.h"
1892
1893         static int errs;
1894         static void pr_err(char *msg)
1895         {
1896                 errs++;
1897                 fprintf(stderr, "%s\n", msg);
1898         }
1899
1900         int main(int argc, char *argv[])
1901         {
1902                 int fd;
1903                 int len;
1904                 char *file;
1905                 struct token_state *state;
1906                 const char *known[] = {
1907                         "==",
1908                         "else",
1909                         "if",
1910                         "then",
1911                         "while",
1912                         "{",
1913                         "}",
1914                 };
1915                 struct token_config conf = {
1916                         .word_start = "_$",
1917                         .word_cont = "",
1918                         .words_marks = known,
1919                         .number_chars = "., _+-",
1920                         .known_count = sizeof(known)/sizeof(known[0]),
1921                         .ignored = (0 << TK_line_comment)
1922                                   |(0 << TK_block_comment),
1923                 };
1924                 struct section *table, *s, *prev;
1925                 setlocale(LC_ALL,"");
1926                 if (argc != 2) {
1927                         fprintf(stderr, "Usage: scanner file\n");
1928                         exit(2);
1929                 }
1930                 fd = open(argv[1], O_RDONLY);
1931                 if (fd < 0) {
1932                         fprintf(stderr, "scanner: cannot open %s: %s\n",
1933                                 argv[1], strerror(errno));
1934                         exit(1);
1935                 }
1936                 len = lseek(fd, 0, 2);
1937                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1938                 table = code_extract(file, file+len, pr_err);
1939
1940                 for (s = table; s;
1941                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
1942                         printf("Tokenizing: %.*s\n", s->section.len,
1943                                 s->section.txt);
1944                         state = token_open(s->code, &conf);
1945                         while(1) {
1946                                 struct token tk = token_next(state);
1947                                 printf("%d:%d ", tk.line, tk.col);
1948                                 token_trace(stdout, tk, 20);
1949                                 if (tk.num == TK_number) {
1950                                         mpq_t num;
1951                                         char tail[3];
1952                                         if (number_parse(num, tail,tk.txt)) {
1953                                                 printf(" %s ", tail);
1954                                                 mpq_out_str(stdout, 10, num);
1955                                                 mpq_clear(num);
1956                                         } else
1957                                                 printf(" BAD NUMBER");
1958                                 }
1959                                 if (tk.num == TK_string ||
1960                                     tk.num == TK_multi_string) {
1961                                         char esc = '\\';
1962                                         struct text str;
1963                                         char tail[3];
1964                                         if (tk.txt.txt[0] == '`')
1965                                                 esc = 0;
1966                                         if (string_parse(&tk, esc,
1967                                                          &str, tail)) {
1968                                                 printf(" %s ", tail);
1969                                                 text_dump(stdout, str, 20);
1970                                                 free(str.txt);
1971                                         } else
1972                                                 printf(" BAD STRING");
1973                                 }
1974                                 printf("\n");
1975                                 if (tk.num == TK_error)
1976                                         errs = 1;
1977                                 if (tk.num == TK_eof)
1978                                         break;
1979                         }
1980                 }
1981                 exit(!!errs);
1982         }
1983 ###### File: scanner.mk
1984         scanner.c : scanner.mdc
1985                 ./md2c scanner.mdc
1986         all :: scanner
1987         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
1988                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
1989                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
1990         scanner.o : scanner.c
1991                 $(CC) $(CFLAGS) -c scanner.c
1992