ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters are assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;
  37                 struct code_node *node;
  38                 struct text       txt;
  39                 int               line, col;
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguish via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88
  89 The different tokens are numbers, words, marks, strings, comments,
  90 newlines, EOF, and indents, each of which is examined in detail below.
  91
  92 There are various cases where no token can be found in part of the
  93 input.  All of these will be reported as an `TK_error` token.
  94
  95 It is possible to declare a number of strings which form distinct
  96 tokens (rather than being grouped as e.g. 'word').  These are given
  97 token numbers from `TK_reserved` upwards.
  98
  99 ###### public types
 100         enum token_num {
 101                 TK_error,
 102                 ## token types
 103                 TK_reserved
 104         };
 105
 106 ### Numbers
 107
 108 Numbers are the messiest tokens to parse, primarily because they can
 109 contain characters that also have meaning outside of number and,
 110 particularly, immediately after numbers.
 111
 112 The obvious example is the '`-`' sign.  It can come inside a number for
 113 a negative exponent, or after a number as a subtraction operator.  To
 114 be sure we have parsed as best as possible we need to only allow the
 115 '`-`' inside a number if it is after an exponent character.  This can be
 116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 117 digit, so we don't allow '`-`' after just any `e`.
 118
 119 To make matters worse, our language designer has decided to experiment
 120 with allowing commas to be used as the decimal indicator, and spaces
 121 to be used to separate groups of digits in large numbers.  Both of
 122 these can reasonably be restricted to appear between two digits, so we
 123 have to add that condition to our tests.
 124
 125 So we cannot just treat numbers as starting with a digit and being
 126 followed by some set of characters.  We need more structure than that.
 127
 128 So:
 129
 130 - Numbers must start with a digit.
 131 - If the first digit is zero, the next character must be a base
 132   signifier (one of `xob`) or a decimal marker (`.` or `,`).
 133   In the first case the first `p` or `P` may be followed by a sign.
 134 - If the number doesn't start with `0` followed by one of `xob`, the
 135   first `e` may be followed by a sign.
 136 - Any digit or hex digit may be followed by a space or underscore
 137   providing that the subsequence character is also a (hex) digit.
 138   This rule will require an extra level of 'unget' to be
 139   supported when handling characters.
 140 - Otherwise any digits or ASCII letters are allowed.  We do not at
 141   this point check that the digits given are permitted by the base.
 142   That will happen when the token is converted to a number.
 143
 144 To allow easy configuration, the various non alphanumeric characters
 145 are only permitted if they are listed in a configuration parameter.
 146
 147 ###### token config parameters
 148         char *number_chars;
 149
 150 Note that numbers may not start with a period, so `.75` is not a
 151 number.  This is not the norm, but is not unheard of.  Excluding these
 152 numbers simplifies the rule at very little cost.
 153
 154 ###### token types
 155         TK_number,
 156
 157 If TK_number is ignored, digits will result in an error unless they
 158 are declared to be a start character for words.
 159
 160 ###### includes
 161
 162         #include <string.h>
 163
 164 ###### parse number
 165
 166         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 167                 int prev_special = 0;
 168                 int expect_p = 0;
 169                 int decimal_mark = 0;
 170                 if (ch == '0') {
 171                         wchar_t ch2 = get_char(state);
 172                         if (strchr("xobXOB", ch2) != NULL)
 173                                 expect_p = 1;
 174                         unget_char(state);
 175                 }
 176                 while (1) {
 177                         int sign_ok = 0;
 178                         switch(expect_p) {
 179                         case 0:
 180                                 if (ch == 'e' || ch == 'E')
 181                                         sign_ok = 1;
 182                                 break;
 183                         case 1:
 184                                 if (ch == 'p' || ch == 'P')
 185                                         sign_ok = 1;
 186                                 break;
 187                         }
 188                         save_unget_state(state);
 189                         ch = get_char(state);
 190                         if (iswalnum(ch)) {
 191                                 prev_special = 0;
 192                                 continue;
 193                         }
 194                         if (ch == '+' || ch == '-') {
 195                                 if (!sign_ok)
 196                                         break;
 197                                 expect_p = -1;
 198                         }
 199                         if (ch == '.' || ch == ',') {
 200                                 if (decimal_mark)
 201                                         break;
 202                                 decimal_mark = 1;
 203                         }
 204                         if (prev_special) {
 205                                 /* Don't allow that special char,
 206                                  * need two 'ungets'
 207                                  */
 208                                 restore_unget_state(state);
 209                                 break;
 210                         }
 211                         if (strchr(state->conf->number_chars, ch)) {
 212                                 prev_special = 1;
 213                                 continue;
 214                         }
 215                         /* non-number char */
 216                         break;
 217                 }
 218                 /* We seem to have a "number" token */
 219                 unget_char(state);
 220                 close_token(state, &tk);
 221                 tk.num = TK_number;
 222                 return tk;
 223         }
 224
 225 ### Words
 226 Words start with a "start" character followed by the longest
 227 sequence of "continue" characters.  The Unicode ID_START and
 228 ID_CONTINUE sets are always permitted, but other ASCII characters
 229 can be added to these sets.
 230
 231 ###### token config parameters
 232         char *word_start;
 233         char *word_cont;
 234
 235 ###### internal functions
 236         static int is_word_start(wchar_t ch, struct token_config *conf)
 237         {
 238                 return iswalpha(ch) ||
 239                        strchr(conf->word_start, ch) != NULL ||
 240                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 241         }
 242
 243         static int is_word_continue(wchar_t ch, struct token_config *conf)
 244         {
 245                 return iswalnum(ch) ||
 246                        strchr(conf->word_cont, ch) != NULL ||
 247                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 248         }
 249
 250 Words can be either known or unknown.  Known words are referred to as
 251 "reserved words" and get a unique token number.  Unknown words are
 252 "identifiers" and are syntactically a single token.
 253
 254 ###### token types
 255         TK_ident,
 256
 257 A list of known words must be provided.  This list is shared with the
 258 "marks" which are described next.  The list must be lexically sorted
 259 and the length of the list must be given (`known_count`).
 260 Tokens matching these known words are reported as the index of the
 261 list added to `TK_reserved`.
 262
 263 ###### token config parameters
 264         const char **words_marks;
 265         int known_count;
 266
 267 ###### parse word
 268
 269         if (is_word_start(ch, state->conf)) {
 270                 int n;
 271                 /* A word: identifier or reserved */
 272                 do
 273                         ch = get_char(state);
 274                 while (is_word_continue(ch, state->conf));
 275                 unget_char(state);
 276                 close_token(state, &tk);
 277                 tk.num = TK_ident;
 278                 if (ignored & (1<<TK_ident))
 279                         tk.num = TK_error;
 280                 n = find_known(state->conf, tk.txt);
 281                 if (n >= 0)
 282                         tk.num = TK_reserved + n;
 283                 return tk;
 284         }
 285
 286 ### Marks
 287
 288 Marks are generally one or more punctuation marks joined together.  It
 289 would be nice to use the term "symbol" for these, but that causes
 290 confusion in a subsequent discussion of the grammar, which has terminal
 291 symbols and non-terminal symbols which are conceptually quite
 292 different.  So strings of punctuation characters will be marks.
 293
 294 A "mark" consists of ASCII characters that are not white space, are not
 295 "start" characters for words, and are not digits.
 296 These will collectively be called mark characters.
 297
 298 ###### internal functions
 299         static int is_mark(wchar_t ch, struct token_config *conf)
 300         {
 301                 return ch > ' ' &&
 302                        ch < 0x7f &&
 303                        !iswalnum(ch) &&
 304                        strchr(conf->word_start, ch) == NULL;
 305         }
 306
 307 As with words, there can be known and unknown marks, though the rules
 308 are slightly different.
 309
 310 Two marks do not need to be separated by a non-mark characters.  This
 311 is different from words which do need to be separated by at least one
 312 non-continue character.
 313
 314 The scanner will normally prefer longer sequences of mark characters,
 315 but will more strongly prefer known marks over unknown marks.  So if
 316 it finds a known mark where adding one more character does not result
 317 in a known mark, it will return that first known mark.
 318
 319 If no known mark is found we will test against strings and comments
 320 below before giving up and assuming an unknown mark.
 321
 322 If an unknown mark contains a quote character or a comment marker, and
 323 that token is not being ignored, then we terminate the unknown mark
 324 before that quote or comment.  This ensure that an unknown mark
 325 immediately before a string is handled correctly.
 326
 327 If `TK_mark` is ignored, then unknown marks as returned as an error.
 328
 329 ###### token types
 330         TK_mark,
 331
 332 Known marks are included in the same list as the list of known words.
 333
 334 ###### parse mark
 335         tk.num = TK_error;
 336         while (is_mark(ch, state->conf)) {
 337                 int n;
 338                 wchar_t prev;
 339                 close_token(state, &tk);
 340                 n = find_known(state->conf, tk.txt);
 341                 if (n >= 0)
 342                         tk.num = TK_reserved + n;
 343                 else if (tk.num != TK_error) {
 344                         /* found a longest-known-mark */
 345                         unget_char(state);
 346                         close_token(state, &tk);
 347                         return tk;
 348                 }
 349                 prev = ch;
 350                 save_unget_state(state);
 351                 ch = get_char(state);
 352                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
 353                         break;
 354                 if (prev == '#')
 355                         break;
 356                 if (prev == '/' && ch == '/' && tk.txt.len > 1) {
 357                         restore_unget_state(state);
 358                         break;
 359                 }
 360                 if (prev == '/' && ch == '*' && tk.txt.len > 1) {
 361                         restore_unget_state(state);
 362                         break;
 363                 }
 364         }
 365         unget_char(state);
 366         if (tk.num != TK_error) {
 367                 close_token(state, &tk);
 368                 return tk;
 369         }
 370
 371 If we don't find a known mark, we will check for strings and comments
 372 before assuming that we have an unknown mark
 373
 374 ###### parse mark
 375         ## parse string
 376         ## parse comment
 377         ## unknown mark
 378
 379 ###### unknown mark
 380         if (tk.txt.len) {
 381                 if (ignored & (1<<TK_mark))
 382                         tk.num = TK_error;
 383                 else
 384                         tk.num = TK_mark;
 385                 return tk;
 386         }
 387
 388 ### Strings
 389
 390 Strings start with one of single quote, double quote, or back quote
 391 and continue until a matching character on the same line.  Any of
 392 these characters can be included in the list of known marks and then
 393 they will not be used for identifying strings.
 394
 395 Immediately following the close quote one or two ASCII letters may
 396 appear.  These are somewhat like the arbitrary letters allowed in
 397 "Numbers" above.  They can be used by the language in various ways.
 398
 399 If 3 identical quote characters appear in a row and are
 400 followed by a newline, then this forms a multi-line string which
 401 continues until an identical triple quote appears on a line preceded
 402 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 403
 404 Multi-line strings may not extend beyond the end of the `code_node` in
 405 which they start.
 406
 407 Normal strings and multi-line strings are encoded as two different
 408 token types.
 409
 410 ###### token types
 411         TK_string,
 412         TK_multi_string,
 413
 414 ###### internal functions
 415         static int is_quote(wchar_t ch)
 416         {
 417                 return ch == '\'' || ch == '"' || ch == '`';
 418         }
 419
 420 #### Multi-line strings
 421
 422 The multi-line string is checked for first.  If they are being
 423 ignored, we fall through and treat a triple quote as an empty string
 424 followed by the start of a new string.
 425
 426 ###### parse string
 427         if (tk.txt.len == 3 &&
 428             !(ignored & (1 << TK_multi_string)) &&
 429             is_quote(tk.txt.txt[0]) &&
 430             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 431             is_newline(tk.txt.txt[3])) {
 432                 // triple quote
 433                 wchar_t first = tk.txt.txt[0];
 434                 int qseen = 0;
 435                 int at_sol = 1;
 436                 while (!at_eon(state) && qseen < 3) {
 437                         ch = get_char(state);
 438                         if (is_newline(ch)) {
 439                                 at_sol = 1;
 440                                 qseen = 0;
 441                         } else if (at_sol && ch == first) {
 442                                 qseen += 1;
 443                         } else if (ch != ' ' && ch != '\t') {
 444                                 at_sol = 0;
 445                                 qseen = 0;
 446                         }
 447                 }
 448                 if (qseen != 3) {
 449                         /* Hit end of node - error.
 450                          * unget so the newline is seen,
 451                          * but return rest of string as an error.
 452                          */
 453                         if (is_newline(ch))
 454                                 unget_char(state);
 455                         close_token(state, &tk);
 456                         tk.num = TK_error;
 457                         return tk;
 458                 }
 459                 /* 2 letters are allowed */
 460                 ch = get_char(state);
 461                 if (iswalpha(ch))
 462                         ch = get_char(state);
 463                 if (iswalpha(ch))
 464                         ch = get_char(state);
 465                 /* Now we must have a newline, but we don't return it
 466                  * whatever it is.*/
 467                 unget_char(state);
 468                 close_token(state, &tk);
 469                 tk.num = TK_multi_string;
 470                 if (!is_newline(ch))
 471                         tk.num = TK_error;
 472                 return tk;
 473         }
 474
 475 #### Single-line strings
 476
 477 The sequence of marks collected may be more than a single-line
 478 string, so we reset to the start and collect characters until
 479 we find a close quote or a newline.
 480
 481 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 482
 483 ###### parse string
 484         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 485             !(ignored & (1<<TK_string))) {
 486                 wchar_t first = tk.txt.txt[0];
 487                 reset_token(state, &tk);
 488                 ch = get_char(state);
 489                 tk.num = TK_error;
 490                 while (!at_eon(state) && !is_newline(ch)) {
 491                         ch = get_char(state);
 492                         if (ch == first) {
 493                                 tk.num = TK_string;
 494                                 break;
 495                         }
 496                         if (is_newline(ch)) {
 497                                 unget_char(state);
 498                                 break;
 499                         }
 500                 }
 501                 close_token(state, &tk);
 502                 return tk;
 503         }
 504
 505 ### Comments
 506
 507 Single line comments may start with '`//`' or '`#`' providing that these
 508 are not known marks.  They continue to the end of the line.
 509
 510 Block comments start with '`/*`' if this is not a known mark.  They
 511 continue to the first occurrence of '`*/`' and may not contain any
 512 occurrence of '`/*`'.
 513
 514 Block comments can be wholly within one line or can continue over
 515 multiple lines.  The multi-line version should be followed immediately
 516 by a newline.  The Linux kernel contains over 285000 multi-line
 517 comments are only 34 are followed by characters other than white space
 518 (which should be removed) or a backslash (only needed in macros).  So
 519 it would not suffer from this rule.
 520
 521 These two comment types are reported as two separate token types, and
 522 consequently can be ignored separately.  When ignored a comment is
 523 still parsed, but is discarded.
 524
 525 ###### token types
 526         TK_line_comment,
 527         TK_block_comment,
 528
 529 ###### internal functions
 530         static int is_line_comment(struct text txt)
 531         {
 532                 return (txt.len >= 1 && txt.txt[0] == '#') ||
 533                        (txt.len >= 2 && txt.txt[0] == '/' &&
 534                                         txt.txt[1] == '/');
 535         }
 536
 537         static int is_block_comment(struct text txt)
 538         {
 539                 return txt.len >= 2 && txt.txt[0] == '/' &&
 540                        txt.txt[1] == '*';
 541         }
 542
 543 #### Single line comments
 544
 545 A single-line comment continues up to, but not including the newline
 546 or end of node.
 547
 548 ###### parse comment
 549
 550         if (is_line_comment(tk.txt)) {
 551                 while (!is_newline(ch) && !at_eon(state))
 552                         ch = get_char(state);
 553                 if (is_newline(ch))
 554                         unget_char(state);
 555                 close_token(state, &tk);
 556                 tk.num = TK_line_comment;
 557                 if (ignored & (1 << TK_line_comment))
 558                         continue;
 559                 return tk;
 560         }
 561
 562 #### Block comments
 563
 564 The token text collected so far could exceed the comment, so we need
 565 to reset it first.
 566
 567 If we find an embedded `/*` we reset to just before the '/' and report
 568 an error.  That way the next thing to be parsed will be the rest of
 569 the comment.  This requires a double unget, so we need to save/restore
 570 the unget state (explained later).
 571
 572 ###### parse comment
 573
 574         if (is_block_comment(tk.txt)) {
 575                 wchar_t prev;
 576                 int newlines = 0;
 577                 reset_token(state, &tk);
 578                 get_char(state);
 579                 get_char(state);
 580                 save_unget_state(state);
 581                 ch = get_char(state);
 582                 prev = 0;
 583                 while (!at_eon(state) &&
 584                        (prev != '/' || ch != '*') &&
 585                        (prev != '*' || ch != '/')) {
 586                         if (is_newline(ch))
 587                                 newlines = 1;
 588                         prev = ch;
 589                         save_unget_state(state);
 590                         ch = get_char(state);
 591                 }
 592                 close_token(state, &tk);
 593                 if (at_eon(state)) {
 594                         tk.num = TK_error;
 595                         return tk;
 596                 }
 597                 if (prev == '/') {
 598                         /* embedded.  Need to unget twice! */
 599                         restore_unget_state(state);
 600                         unget_char(state);
 601                         tk.num = TK_error;
 602                         return tk;
 603                 }
 604                 tk.num = TK_block_comment;
 605                 if (newlines && !(ignored & (1<<TK_newline))) {
 606                         /* next char must be newline */
 607                         ch = get_char(state);
 608                         unget_char(state);
 609                         if (!is_newline(ch))
 610                                 tk.num = TK_error;
 611                 }
 612                 if (tk.num == TK_error ||
 613                     !(ignored & (1 << TK_block_comment)))
 614                         return tk;
 615                 continue;
 616         }
 617
 618 ### Indents, Newlines, and White Space.
 619
 620 Normally white space is ignored.  However newlines can be important as
 621 can indents, which are either after a newline or at the start of a
 622 node (detected by `at_son()`);
 623
 624 ###### exported functions
 625         static inline int is_newline(wchar_t ch)
 626         {
 627                 return ch == '\n' || ch == '\f' || ch == '\v';
 628         }
 629
 630 ###### white space
 631         if (ch <= ' ' && !is_newline(ch)
 632             && ! at_son(state))
 633                 continue;
 634
 635 If a line starts with more white-space than the previous non-blank
 636 line - or if the first non-blank line in the document starts with any
 637 white-space - then an "IN" is reported at the start of the line.
 638
 639 Before the next non-blank line which starts with less white space, or
 640 at the latest at the end of the document, a matching "OUT" token
 641 is reported.  There will always be an exact match between "IN" and
 642 "OUT" tokens.
 643
 644 It is possible for "OUT" to be followed (almost) immediately by an
 645 "IN".  This happens if, for example, the indent of three consecutive
 646 lines are 0, 8, 4 spaces.  Before the second line we report an
 647 "IN".  Before the third line we must report an "OUT", as 4 is less
 648 than 8, then also an Ident as 4 is greater than 0.
 649
 650 ###### token types
 651         TK_in,
 652         TK_out,
 653
 654 For the purpose of measuring the length of white space, a tab adds at
 655 least one space, and rounds up to a multiple of 8.
 656
 657 ###### exported functions
 658         static inline int indent_tab(int indent)
 659         {
 660                 return (indent|7)+1;
 661         }
 662
 663 We need to track the current levels of indent.  This requires some
 664 sort of stack as indent levels are pushed on and popped off.  In
 665 practice this stack is unlikely to often exceed 5 so we will used a
 666 fixed stack of 20 indent levels.  More than this will be silently
 667 ignored.
 668
 669 ###### state fields
 670         int     indent_level;
 671         int     indent_sizes[20];
 672
 673 #### Newlines
 674
 675 Newlines can optionally be reported.  Newlines within a block comment
 676 or a multi-line string are not reported separately, but each of these
 677 must be followed immediately by a newline so these constructs cannot
 678 hide the fact that a newline was present.
 679
 680 When indents are being reported, the Newline which would normally be
 681 reported immediately before the "IN" is delayed until after the
 682 matching "OUT".  This makes an indented section act like a
 683 continuation of the previous line to some extent.
 684
 685 A blank line would normally be reported simply as two consecutive Newline
 686 tokens.  However if the subsequent line is indented (and indents are being
 687 reported) then the right thing to do is less obvious as Newlines should be
 688 delayed - but how many Newlines?
 689
 690 The approach we will take is to report the extra Newlines immediately after
 691 the IN token, so the blank line is treated as though it were an indented
 692 blank line.
 693
 694 ###### token types
 695         TK_newline,
 696
 697 If we find a newline or white space at the start of a block, we keep
 698 collecting spaces, tabs, and newlines until we find some real text.
 699 Then depending on the indent we generate some number of tokens.  These
 700 will be a sequence of "Newline OUT" pairs representing a decrease
 701 in indent, then either a Newline or an IN depending on whether the
 702 next line is indented, then zero or more Newlines representing all the
 703 blank lines that have been skipped.
 704
 705 When a Newline leads to the next block of code there is a question of
 706 whether the various Newline and OUT/IN tokens should appear to
 707 pbelong to the earlier or later block.  This is addressed by processing
 708 the tokens in two stages based on the relative indent levels of the
 709 two blocks (each block has a base indent to which the actual indents
 710 are added).
 711
 712 Any "Newline OUT" pairs needed to reduce the current indent to the
 713 maximum of the base indents of the old and new blocks are generated
 714 against the old block.  Then if the next block does not have an
 715 increased indent, one more "Newline" is generated.
 716
 717 If further "Newline OUT" pairs are needed to get to the indent
 718 level of the 'next' block, they are generated against that block,
 719 though the first Newline is suppressed (it having already been
 720 generated).
 721
 722 Finally the Newline or IN for the first line of the new block is
 723 generated, unless the Newline needs to be suppressed because it
 724 appeared at the end of the previous block.
 725
 726 This means that a block may start with an OUT or an IN, but
 727 will only start with a Newline if it actually starts with a blank
 728 line.
 729
 730 We will need to represent in the `token_state` where in this sequence
 731 of delayed tokens we are.  As `state.col` records the target indent we
 732 don't need to record how many OUTs or INs are needed.  We do
 733 need to record the number of blank lines, and which of Newline and
 734 OUT is needed next in the initial sequence of pairs.
 735
 736 For this we store one more than the number of blank lines as
 737 `delayed_lines` and a flag for `out_next`.
 738
 739 ###### state fields
 740         int check_indent;
 741         int delayed_lines;
 742         int out_next;
 743
 744 Generating these tokens involve two separate pieces of code.
 745
 746 Firstly we need to recognise white space and count the indents and
 747 newlines.  These are recorded in the above state fields.
 748
 749 Separately we need, on each call to `token_next`, we need to check if
 750 there are some delayed tokens and if so we need to advance the state
 751 information and return one token.
 752
 753 ###### white space
 754         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 755                 int newlines = 0;
 756                 int was_son = at_son(state);
 757                 if (ignored & (1<<TK_in)) {
 758                         if (!is_newline(ch))
 759                                 continue;
 760                         if (ignored & (1<<TK_newline))
 761                                 continue;
 762                         tk.num = TK_newline;
 763                         close_token(state, &tk);
 764                         return tk;
 765                 }
 766                 // Indents are needed, so check all white space.
 767                 while (ch <= ' ' && !at_eon(state)) {
 768                         if (is_newline(ch))
 769                                 newlines += 1;
 770                         ch = get_char(state);
 771                 }
 772                 if (at_eon(state)) {
 773                         newlines += 1;
 774                         if (state->node->next &&
 775                             state->node->next->indent > state->node->indent)
 776                                 state->col = state->node->next->indent;
 777                         else
 778                                 state->col = state->node->indent;
 779                 } else
 780                         unget_char(state);
 781                 state->delayed_lines = newlines;
 782                 state->out_next = was_son;
 783                 state->check_indent = 1;
 784                 continue;
 785         }
 786
 787
 788 ###### delayed tokens
 789
 790         if (state->check_indent || state->delayed_lines) {
 791                 if (state->col < state->indent_sizes[state->indent_level]) {
 792                         if (!state->out_next &&
 793                             !(ignored & (1<<TK_newline))) {
 794                                 state->out_next = 1;
 795                                 tk.num = TK_newline;
 796                                 return tk;
 797                         }
 798                         state->indent_level -= 1;
 799                         state->out_next = 0;
 800                         tk.num = TK_out;
 801                         return tk;
 802                 }
 803                 if (state->col > state->indent_sizes[state->indent_level] &&
 804                     state->indent_level < sizeof(state->indent_sizes)-1) {
 805                         state->indent_level += 1;
 806                         state->indent_sizes[state->indent_level] = state->col;
 807                         state->delayed_lines -= 1;
 808                         tk.num = TK_in;
 809                         return tk;
 810                 }
 811                 state->check_indent = 0;
 812                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 813                         tk.num = TK_newline;
 814                         state->delayed_lines -= 1;
 815                         return tk;
 816                 }
 817                 state->delayed_lines = 0;
 818                 continue;
 819         }
 820
 821 ### End of File
 822
 823 After the last newline in the file has been processed, a special
 824 end-of-file token will be returned.  any further attempts to get more
 825 tokens will continue to return the same end-of-file token.
 826
 827 ###### token types
 828         TK_eof,
 829
 830
 831 ###### white space
 832         if (ch == WEOF) {
 833                 if (state->col) {
 834                         state->col = 0;
 835                         state->check_indent = 1;
 836                         continue;
 837                 }
 838                 tk.num = TK_eof;
 839                 return tk;
 840         }
 841
 842 ### Unknown Marks, or errors.
 843
 844 We have now handled all the possible known mark-like tokens.
 845 If the token we have is not empty and `TK_mark` is allowed,
 846 we have an unknown mark, otherwise this must be an error.
 847
 848 ###### unknown mark
 849         /* one unknown character */
 850         close_token(state, &tk);
 851         tk.num = TK_error;
 852         return tk;
 853
 854 ## Tools For The Task
 855
 856 You may have noticed that are few gaps we left in the above -
 857 functions used without first defining them.  Doing so above would have
 858 broken the flow.
 859
 860 ### Character by character
 861
 862 As we walk through the various `code_node`s we need to process whole
 863 Unicode codepoints, and keep track of which line and column we are on.
 864 We will assume for now that any printing character uses one column,
 865 though that is not true in general.
 866
 867 As the text in a `code_node` may include an indent that identifies it as
 868 being code, we need to be careful to strip that.  The `code_node` has
 869 a flag that tells us whether or not we need to strip.
 870
 871 ###### includes
 872         #include <memory.h>
 873
 874 ###### state fields
 875         struct code_node *node;
 876         int    offset;
 877         int    line;
 878         int    col;
 879
 880 ###### internal functions
 881
 882         static void do_strip(struct token_state *state)
 883         {
 884                 if (state->node->needs_strip) {
 885                         int n = 4;
 886                         while (n && state->node->code.txt[state->offset] == ' ') {
 887                                 state->offset += 1;
 888                                 n -= 1;
 889                         }
 890                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
 891                                 state->offset += 1;
 892                                 n -= 4;
 893                         }
 894                 }
 895         }
 896
 897         static wint_t get_char(struct token_state *state)
 898         {
 899                 wchar_t next;
 900                 size_t n;
 901                 mbstate_t mbstate;
 902
 903                 if (state->node == NULL)
 904                         return WEOF;
 905                 if (state->node->code.len <= state->offset) {
 906                         do
 907                                 state->node = state->node->next;
 908                         while (state->node && state->node->code.txt == NULL);
 909                         state->offset = 0;
 910                         if (state->node == NULL)
 911                                 return WEOF;
 912                         do_strip(state);
 913                         state->line = state->node->line_no;
 914                         state->col = state->node->indent;
 915                 }
 916
 917                 ## before get_char
 918
 919                 memset(&mbstate, 0, sizeof(mbstate));
 920
 921                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 922                             state->node->code.len - state->offset,
 923                             &mbstate);
 924                 if (n == -2 || n == 0) {
 925                         /* Not enough bytes - not really possible */
 926                         next = '\n';
 927                         state->offset = state->node->code.len;
 928                 } else if (n == -1) {
 929                         /* error */
 930                         state->offset += 1;
 931                         next = 0x7f; // an illegal character
 932                 } else
 933                         state->offset += n;
 934
 935                 if (next >= ' ') {
 936                         state->col += 1;
 937                 } else if (is_newline(next)) {
 938                         state->line += 1;
 939                         state->col = state->node->indent;
 940                         do_strip(state);
 941                 } else if (next == '\t') {
 942                         state->col = indent_tab(state->col);
 943                 }
 944                 return next;
 945         }
 946
 947 We will sometimes want to "unget" the last character as it needs to be
 948 considered again as part of the next token.  So we need to store a
 949 'previous' version of all metadata.
 950
 951 ###### state fields
 952         int    prev_offset;
 953         int    prev_line;
 954         int    prev_col;
 955
 956 ###### before get_char
 957         state->prev_offset = state->offset;
 958         state->prev_line   = state->line;
 959         state->prev_col    = state->col;
 960
 961 ###### internal functions
 962
 963         static void unget_char(struct token_state *state)
 964         {
 965                 if (state->node) {
 966                         state->offset = state->prev_offset;
 967                         state->line   = state->prev_line;
 968                         state->col    = state->prev_col;
 969                 }
 970         }
 971
 972 We occasionally need a double-unget, particularly for numbers and
 973 block comments.  We don't impose this cost on all scanning, but
 974 require those code sections that need it to call `save_unget_state`
 975 before each `get_char`, and then `restore_unget_state` when a
 976 double-unget is needed.
 977
 978 ###### state fields
 979         int     prev_offset2;
 980         int     prev_line2;
 981         int     prev_col2;
 982
 983 ###### internal functions
 984         static void save_unget_state(struct token_state *state)
 985         {
 986                 state->prev_offset2 = state->prev_offset;
 987                 state->prev_line2 = state->prev_line;
 988                 state->prev_col2 = state->prev_col;
 989         }
 990
 991         static void restore_unget_state(struct token_state *state)
 992         {
 993                 state->prev_offset = state->prev_offset2;
 994                 state->prev_line = state->prev_line2;
 995                 state->prev_col = state->prev_col2;
 996         }
 997
 998 At the start of a token we don't want to be at the end of a code block
 999 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
1000 single character.  This will move into the next non-empty code block
1001 and leave the current pointer at the start of it.
1002
1003 This has to happen _after_ dealing with delayed tokens as some of them
1004 must appear in the previous node.  When we do this, we need to reset
1005 the data in the token.
1006
1007 ###### delayed tokens
1008         if (at_eon(state)) {
1009                 get_char(state);
1010                 unget_char(state);
1011                 tk.node = state->node;
1012                 if (state->node)
1013                         tk.txt.txt = state->node->code.txt + state->offset;
1014                 tk.line = state->line;
1015                 tk.col = state->col;
1016                 tk.txt.len = 0;
1017         }
1018
1019 ### Managing tokens
1020
1021 The current token is initialized to line up with the first character
1022 that we 'get' for each token.  When we have, or might have, a full
1023 token we can call `close_token` to set the `len` of the token
1024 appropriately.  This can safely be called multiple times.
1025
1026 Finally we occasionally (for single-line strings and block comments)
1027 need to reset to the beginning of the current token as we might have
1028 parsed too much already.  For that there is `reset_token`.
1029
1030 ###### one token
1031         tk.node = state->node;
1032         if (state->node)
1033                 tk.txt.txt = state->node->code.txt + state->offset;
1034         tk.line = state->line;
1035         tk.col = state->col;
1036         tk.txt.len = 0;
1037
1038 ###### internal functions
1039
1040         static void close_token(struct token_state *state,
1041                                 struct token *tk)
1042         {
1043                 tk->txt.len = (state->node->code.txt + state->offset)
1044                               - tk->txt.txt;
1045         }
1046
1047         static void reset_token(struct token_state *state, struct token *tok)
1048         {
1049                 state->prev_line = tok->line;
1050                 state->prev_col = tok->col;
1051                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1052                 unget_char(state);
1053                 tok->txt.len = 0;
1054         }
1055
1056
1057 Tokens make not cross into the next `code_node`, and some tokens can
1058 include the newline at the and of a `code_node`, we must be able to
1059 easily check if we have reached the end.  Equally we need to know if
1060 we are at the start of a node, as white space is treated a little
1061 differently there.
1062
1063 ###### internal functions
1064
1065         static int at_son(struct token_state *state)
1066         {
1067                 return state->offset == 0;
1068         }
1069
1070         static int at_eon(struct token_state *state)
1071         {
1072                 // at end-of-node ??
1073                 return state->node == NULL ||
1074                        state->offset >= state->node->code.len;
1075         }
1076
1077 ### Find a known word
1078
1079 As the known-word list is sorted we can use a simple binary search.
1080 Following the pattern established in "mdcode", we will use a `struct
1081 text` with start and length to represent the code fragment we are
1082 searching for.
1083
1084 ###### internal functions
1085         static int find_known(struct token_config *conf, struct text txt)
1086         {
1087                 int lo = 0;
1088                 int hi = conf->known_count;
1089
1090                 while (lo + 1 < hi) {
1091                         int mid = (lo + hi) / 2;
1092                         int cmp = strncmp(conf->words_marks[mid],
1093                                           txt.txt, txt.len);
1094                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1095                                 cmp = 1;
1096                         if (cmp <= 0)
1097                                 lo = mid;
1098                         else
1099                                 hi = mid;
1100                 }
1101                 if (strncmp(conf->words_marks[lo],
1102                            txt.txt, txt.len) == 0
1103                     && conf->words_marks[lo][txt.len] == 0)
1104                         return lo;
1105                 else
1106                         return -1;
1107         }
1108
1109 ### Bringing it all together
1110
1111 Now we have all the bits there is just one section missing:  combining
1112 all the token parsing code into one block.
1113
1114 The handling of delayed tokens (Newlines, INs, OUTs) must come
1115 first before we try getting another character.
1116
1117 Then we parse all the test, making sure that we check for known marks
1118 before strings and comments, but unknown marks after strings and comments.
1119
1120 This block of code will either return a token, or will choose to
1121 ignore one, in which case it will `continue` around to the top of the
1122 loop.
1123
1124 ###### one token
1125         ## delayed tokens
1126
1127         ch = get_char(state);
1128
1129         ## white space
1130         ## parse number
1131         ## parse word
1132         ## parse mark
1133
1134 ### Start and stop
1135
1136 As well as getting tokens, we need to be able to create the
1137 `token_state` to start with, and discard it later.
1138
1139 ###### includes
1140         #include <malloc.h>
1141
1142 ###### main functions
1143         struct token_state *token_open(struct code_node *code, struct
1144                                        token_config *conf)
1145         {
1146                 struct token_state *state = malloc(sizeof(*state));
1147                 memset(state, 0, sizeof(*state));
1148                 state->node = code;
1149                 state->line = code->line_no;
1150                 state->col = code->indent;
1151                 state->conf = conf;
1152                 do_strip(state);
1153                 return state;
1154         }
1155         void token_close(struct token_state *state)
1156         {
1157                 free(state);
1158         }
1159
1160 ###### exported functions
1161         struct token_state *token_open(struct code_node *code, struct
1162                                        token_config *conf);
1163         void token_close(struct token_state *state);
1164
1165 ### Trace tokens
1166
1167 Getting tokens is the main thing but it is also useful to be able to
1168 print out token information, particularly for tracing and testing.
1169
1170 Known tokens are printed verbatim.  Other tokens are printed as
1171 `type(content)` where content is truncated to a given number of characters.
1172
1173 The function for printing a truncated string (`text_dump`) is also exported
1174 so that it can be used to tracing processed strings too.
1175
1176 ###### includes
1177         #include <stdio.h>
1178
1179 ###### exported functions
1180         void token_trace(FILE *f, struct token tok, int max);
1181         void text_dump(FILE *f, struct text t, int max);
1182
1183 ###### main functions
1184
1185         void text_dump(FILE *f, struct text txt, int max)
1186         {
1187                 int i;
1188                 if (txt.len > max)
1189                         max -= 2;
1190                 else
1191                         max = txt.len;
1192                 for (i = 0; i < max; i++) {
1193                         char c = txt.txt[i];
1194                         if (c < ' ' || c > '~')
1195                                 fprintf(f, "\\x%02x", c & 0xff);
1196                         else if (c == '\\')
1197                                 fprintf(f, "\\\\");
1198                         else
1199                                 fprintf(f, "%c", c);
1200                 }
1201                 if (i < txt.len)
1202                         fprintf(f, "..");
1203         }
1204
1205         void token_trace(FILE *f, struct token tok, int max)
1206         {
1207                 static char *types[] = {
1208                         [TK_ident] = "ident",
1209                         [TK_mark] = "mark",
1210                         [TK_number] = "number",
1211                         [TK_string] = "string",
1212                         [TK_multi_string] = "mstring",
1213                         [TK_line_comment] = "lcomment",
1214                         [TK_block_comment] = "bcomment",
1215                         [TK_in] = "in",
1216                         [TK_out] = "out",
1217                         [TK_newline] = "newline",
1218                         [TK_eof] = "eof",
1219                         [TK_error] = "ERROR",
1220                         };
1221
1222                 switch (tok.num) {
1223                 default: /* known word or mark */
1224                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1225                         break;
1226                 case TK_in:
1227                 case TK_out:
1228                 case TK_newline:
1229                 case TK_eof:
1230                         /* No token text included */
1231                         fprintf(f, "%s()", types[tok.num]);
1232                         break;
1233                 case TK_ident:
1234                 case TK_mark:
1235                 case TK_number:
1236                 case TK_string:
1237                 case TK_multi_string:
1238                 case TK_line_comment:
1239                 case TK_block_comment:
1240                 case TK_error:
1241                         fprintf(f, "%s(", types[tok.num]);
1242                         text_dump(f, tok.txt, max);
1243                         fprintf(f, ")");
1244                         break;
1245                 }
1246         }
1247
1248 ### And there we have it
1249
1250 We now have all the library functions defined for reading and printing
1251 tokens.  Now we just need C files to store them, and a mk file to make them.
1252
1253 ###### File: scanner.h
1254         ## public types
1255         ## exported functions
1256
1257 ###### File: libscanner.c
1258         ## includes
1259         #include "scanner.h"
1260         ## private types
1261         ## internal functions
1262         ## main functions
1263
1264 ###### File: scanner.mk
1265
1266         CFLAGS += -Wall -g
1267         all ::
1268         scanner.mk scanner.h libscanner.c : scanner.mdc
1269                 ./md2c scanner.mdc
1270         all :: libscanner.o
1271         libscanner.o : libscanner.c
1272                 $(CC) $(CFLAGS) -c libscanner.c
1273
1274 ## Processing numbers
1275
1276 Converting a `TK_number` token to a numerical value is a slightly
1277 higher level task than lexical analysis, and slightly lower than
1278 grammar parsing, so put it here - as an index if you like.
1279
1280 Importantly it will be used by the same testing rig that is used for
1281 testing the token scanner.
1282
1283 The numeric value that we will convert all numbers into is the `mpq_t`
1284 from the GNU high precision number library "libgmp".
1285
1286 ###### number includes
1287         #include <gmp.h>
1288         #include "mdcode.h"
1289
1290 Firstly we need to be able to parse a string of digits in a given base
1291 and possibly with a decimal marker.  We store this in an `mpz_t`
1292 integer and report the number of digits after the decimal mark.
1293
1294 On error we return zero and ensure that the 'mpz_t' has been freed, or
1295 had never been initialised.
1296
1297 ###### number functions
1298
1299         static int parse_digits(mpz_t num, struct text tok, int base,
1300                                 int *placesp)
1301         {
1302                 /* Accept digits up to 'base', ignore '_' and
1303                  * ' ' if they appear between two legal digits,
1304                  * and if `placesp` is not NULL, allow a single
1305                  * '.' or ',' and report the number of digits
1306                  * beyond there.
1307                  * Return number of characters processed (p),
1308                  * or 0 if something illegal was found.
1309                  */
1310                 int p;
1311                 int decimal = -1; // digits after marker
1312                 enum {Digit, Space, Other} prev = Other;
1313                 int digits = 0;
1314
1315                 for (p = 0; p < tok.len; p++) {
1316                         int dig;
1317                         char c = tok.txt[p];
1318
1319                         if (c == '_' || c == ' ') {
1320                                 if (prev != Digit)
1321                                         goto bad;
1322                                 prev = Space;
1323                                 continue;
1324                         }
1325                         if (c == '.' || c == ',') {
1326                                 if (prev != Digit)
1327                                         goto bad;
1328                                 if (!placesp || decimal >= 0)
1329                                         return p-1;
1330                                 decimal = 0;
1331                                 prev = Other;
1332                                 continue;
1333                         }
1334                         if (isdigit(c))
1335                                 dig = c - '0';
1336                         else if (isupper(c))
1337                                 dig = 10 + c - 'A';
1338                         else if (islower(c))
1339                                 dig = 10 + c - 'a';
1340                         else
1341                                 dig = base;
1342                         if (dig >= base) {
1343                                 if (prev == Space)
1344                                         p--;
1345                                 break;
1346                         }
1347                         prev = Digit;
1348                         if (digits)
1349                                 mpz_mul_ui(num, num, base);
1350                         else
1351                                 mpz_init(num);
1352                         digits += 1;
1353                         mpz_add_ui(num, num, dig);
1354                         if (decimal >= 0)
1355                                 decimal++;
1356                 }
1357                 if (digits == 0)
1358                         return 0;
1359                 if (placesp) {
1360                         if (decimal >= 0)
1361                                 *placesp = decimal;
1362                         else
1363                                 *placesp = 0;
1364                 }
1365                 return p;
1366         bad:
1367                 if (digits)
1368                         mpz_clear(num);
1369                 return 0;
1370         }
1371
1372 ###### number includes
1373         #include <ctype.h>
1374
1375 To parse a full number we need to consider the optional base, the
1376 mantissa, and the optional exponent.  We will treat these one at a
1377 time.
1378
1379 The base is indicated by a letter after a leading zero, which must be
1380 followed by a base letter or a period.  The base also determines the
1381 character which will mark an exponent.
1382
1383 ###### number vars
1384         int base = 10;
1385         char expc = 'e';
1386
1387 ###### parse base
1388
1389         if (tok.txt[0] == '0' && tok.len > 1) {
1390                 int skip = 0;
1391                 switch(tok.txt[1]) {
1392                 case 'x':
1393                 case 'X':
1394                         base = 16;
1395                         skip = 2;
1396                         expc = 'p';
1397                         break;
1398                 case 'o':
1399                 case 'O':
1400                         base = 8;
1401                         skip = 2;
1402                         expc = 'p';
1403                         break;
1404                 case 'b':
1405                 case 'B':
1406                         base = 2;
1407                         skip = 2;
1408                         expc = 'p';
1409                         break;
1410                 case '0':
1411                 case '1':
1412                 case '2':
1413                 case '3':
1414                 case '4':
1415                 case '5':
1416                 case '6':
1417                 case '7':
1418                 case '8':
1419                 case '9':
1420                 case '_':
1421                 case ' ':
1422                         // another digit is not permitted
1423                         // after a zero.
1424                         return 0;
1425                 default:
1426                         // must be decimal marker or trailing
1427                         // letter, which are OK;
1428                         break;
1429                 }
1430                 tok.txt += skip;
1431                 tok.len -= skip;
1432         }
1433
1434 After the base is the mantissa, which may contain a decimal mark, so
1435 we need to record the number of places.  We won't impose the number of
1436 places until we have the exponent as well.
1437
1438 ###### number vars
1439         int places =0;
1440         mpz_t mant;
1441         int d;
1442
1443 ###### parse mantissa
1444
1445         d = parse_digits(mant, tok, base, &places);
1446         if (d == 0)
1447                 return 0;
1448         tok.txt += d;
1449         tok.len -= d;
1450         mpq_init(num);
1451         mpq_set_z(num, mant);
1452         mpz_clear(mant);
1453
1454 After the mantissa number may come an exponent which may be positive
1455 or negative.  We assume at this point that we have seen the exponent
1456 character `expc`.
1457
1458 ###### number vars
1459         long lexp = 0;
1460         mpz_t exp;
1461         int esign = 1;
1462
1463 ###### parse exponent
1464         if (tok.len > 1) {
1465                 if (tok.txt[0] == '+') {
1466                         tok.txt++;
1467                         tok.len--;
1468                 } else if (tok.txt[0] == '-') {
1469                         esign = -1;
1470                         tok.txt++;
1471                         tok.len--;
1472                 }
1473         }
1474         d = parse_digits(exp, tok, 10, NULL);
1475         if (d == 0) {
1476                 mpq_clear(num);
1477                 return 0;
1478         }
1479         if (!mpz_fits_slong_p(exp)) {
1480                 mpq_clear(num);
1481                 mpz_clear(exp);
1482                 return 0;
1483         }
1484         lexp = mpz_get_si(exp) * esign;
1485         mpz_clear(exp);
1486         tok.txt += d;
1487         tok.len -= d;
1488
1489
1490 Now that we have the mantissa and the exponent we can multiply them
1491 together, also allowing for the number of digits after the decimal
1492 mark.
1493
1494 For base 10, we simply subtract the decimal places from the exponent.
1495 For the other bases, as the exponent is alway based on 2, even for
1496 octal and hex, we need a bit more detail.
1497 We then recover the sign from the exponent, as division is quite
1498 different from multiplication.
1499
1500 ###### calc exponent
1501         switch (base) {
1502         case 10:
1503         case 2:
1504                 lexp -= places;
1505                 break;
1506         case 16:
1507                 lexp -= 4*places;
1508                 break;
1509         case 8:
1510                 lexp -= 3*places;
1511                 break;
1512         }
1513         if (lexp < 0) {
1514                 lexp = -lexp;
1515                 esign = -1;
1516         } else
1517                 esign = 1;
1518
1519 Imposing the exponent on the number is also very different for base 10
1520 than for the others.  For the binary shift `gmp` provides a simple
1521 function.  For base 10 we use something like Russian Peasant
1522 Multiplication.
1523
1524 ###### calc exponent
1525         if (expc == 'e') {
1526                 mpq_t tens;
1527                 mpq_init(tens);
1528                 mpq_set_ui(tens, 10, 1);
1529                 while (1) {
1530                         if (lexp & 1) {
1531                                 if (esign > 0)
1532                                         mpq_mul(num, num, tens);
1533                                 else
1534                                         mpq_div(num, num, tens);
1535                         }
1536                         lexp >>= 1;
1537                         if (lexp == 0)
1538                                 break;
1539                         mpq_mul(tens, tens, tens);
1540                 }
1541                 mpq_clear(tens);
1542         } else {
1543                 if (esign > 0)
1544                         mpq_mul_2exp(num, num, lexp);
1545                 else
1546                         mpq_div_2exp(num, num, lexp);
1547         }
1548
1549 Now we are ready to parse a number: the base, mantissa, and exponent.
1550 If all goes well we check for the possible trailing letters and
1551 return.  Return value is 1 for success and 0 for failure.
1552
1553
1554 ###### number functions
1555         int number_parse(mpq_t num, char tail[3], struct text tok)
1556         {
1557                 ## number vars
1558                 int i;
1559
1560                 ## parse base
1561                 ## parse mantissa
1562                 if (tok.len > 1 && (tok.txt[0] == expc ||
1563                                     tok.txt[0] == toupper(expc))) {
1564                         tok.txt++;
1565                         tok.len--;
1566                         ## parse exponent
1567                 }
1568                 ## calc exponent
1569
1570                 for (i = 0; i < 2; i++) {
1571                         if (tok.len <= i)
1572                                 break;
1573                         if (!isalpha(tok.txt[i]))
1574                                 goto err;
1575                         tail[i] = tok.txt[i];
1576                 }
1577                 tail[i] = 0;
1578                 if (i == tok.len)
1579                         return 1;
1580         err:
1581                 mpq_clear(num);
1582                 return 0;
1583         }
1584
1585 Number parsing goes in `libnumber.c`
1586
1587 ###### File: libnumber.c
1588
1589         #include <unistd.h>
1590         #include <stdlib.h>
1591
1592         ## number includes
1593         ## number functions
1594
1595 ###### File: number.h
1596         int number_parse(mpq_t num, char tail[3], struct text tok);
1597
1598 ###### File: scanner.mk
1599         all :: libnumber.o
1600         libnumber.o : libnumber.c
1601                 $(CC) $(CFLAGS) -c libnumber.c
1602
1603 ## Processing strings
1604
1605 Both `TK_string` and `TK_multi_string` require post-processing which
1606 can be one of two types: literal or with escapes processed.
1607 Even literal processing is non-trivial as the file may contain indents
1608 which need to be stripped.
1609
1610 Errors can only occur when processing escapes.  Any unrecognised
1611 character following the escape character will cause an error.
1612
1613 Processing escapes and striping indents can only make the string
1614 shorter, not longer, so we allocate a buffer which is the same size as
1615 the string and process into that.
1616
1617 To request escape processing, we pass the character we want to use for
1618 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1619
1620 ###### string main
1621         int string_parse(struct token *tok, char escape,
1622                          struct text *str, char tail[3])
1623         {
1624                 ## string vars
1625                 struct text t = tok->txt;
1626
1627                 str->txt = NULL;
1628                 ## strip tail
1629                 if (tok->num == TK_string) {
1630                         ## strip single
1631                 } else {
1632                         ## strip multi
1633                 }
1634                 str->txt = malloc(t.len);
1635                 str->len = 0;
1636
1637                 ## process string
1638                 return 1;
1639         err:
1640                 free(str->txt);
1641                 str->txt = NULL;
1642                 return 0;
1643         }
1644
1645 ### strip tail
1646
1647 The tail of the string can be 0, 1, or 2 letters
1648
1649         i = t.len;
1650         if (i >= 0 && isalpha(t.txt[i-1]))
1651                 i -= 1;
1652         if (i >= 0 && isalpha(t.txt[i-1]))
1653                 i -= 1;
1654         strncpy(tail, t.txt+i, t.len-i);
1655         tail[t.len-i] = 0;
1656         t.len = i;
1657
1658 ###### string vars
1659         int i;
1660
1661 ### strip single
1662
1663 Stripping the quote of a single-line string is trivial.
1664 The only part that is at all interesting is that quote character must
1665 be remembered.
1666
1667         quote = t.txt[0];
1668         if (t.txt[t.len-1] != quote)
1669                 goto err;
1670         t.txt += 1;
1671         t.len -= 2;
1672
1673 ###### string vars
1674         char quote;
1675
1676 ### strip multi
1677
1678 For a multi-line string we have a little more work to do.  We need to
1679 remove 3 quotes, not 1, and need to count the indent of the close
1680 quote as it will need to be stripped from all lines.
1681
1682         quote = t.txt[0];
1683         if (t.len < 7 ||
1684             t.txt[1] != quote || t.txt[2] != quote ||
1685             !is_newline(t.txt[3]))
1686                 goto err;
1687         t.txt += 4;
1688         t.len -= 4;
1689         i = t.len;
1690         if (i <= 0 || t.txt[i-1] != quote)
1691                 goto err;
1692         i -= 1;
1693         if (i <= 0 || t.txt[i-1] != quote)
1694                 goto err;
1695         i -= 1;
1696         if (i <= 0 || t.txt[i-1] != quote)
1697                 goto err;
1698         i -= 1;
1699         t.len = i;
1700         while (i > 0 && !is_newline(t.txt[i-1]))
1701                 i--;
1702         indent = 0;
1703         while (i < t.len) {
1704                 if (t.txt[i] == ' ')
1705                         indent += 1;
1706                 if (t.txt[i] == '\t')
1707                         indent = indent_tab(indent);
1708                 i++;
1709         }
1710
1711 ###### string vars
1712         int indent = 0;
1713
1714 ### process string
1715
1716 Now we just take one byte at a time. trans-ASCII unicode won't look
1717 like anything we are interested in so it will just be copied byte by
1718 byte.
1719
1720         cp = str->txt;
1721         at_sol = 1;
1722         for (i = 0; i < t.len; i++) {
1723                 char c;
1724                 if (at_sol) {
1725                         at_sol = 0;
1726                         ## strip indent
1727                         if (i >= t.len)
1728                                 break;
1729                 }
1730                 c = t.txt[i];
1731                 if (c != escape) {
1732                         *cp = c;
1733                         cp += 1;
1734                         if (is_newline(c))
1735                                 at_sol = 1;
1736                 } else if (i+1 >= t.len) {
1737                         // escape and end of string
1738                         goto err;
1739                 } else {
1740                         i += 1;
1741                         c = t.txt[i];
1742                         ## parse escape
1743                 }
1744         }
1745         str->len = cp - str->txt;
1746
1747 ###### string vars
1748         char *cp;
1749         int at_sol;
1750
1751 ### strip indent
1752
1753 Every time we find a start of line, we strip spaces and tabs until the
1754 required indent is found.
1755
1756         int skipped = 0;
1757         while (i < t.len && skipped < indent) {
1758                 c = t.txt[i];
1759                 if (c == ' ')
1760                         skipped += 1;
1761                 else if (c == '\t')
1762                         skipped = indent_tab(c);
1763                 else
1764                         break;
1765                 i+= 1;
1766         }
1767
1768 ### parse escape
1769         switch (c) {
1770         case 'n':
1771                 *cp++ = '\n'; break;
1772         case 'r':
1773                 *cp++ = '\r'; break;
1774         case 't':
1775                 *cp++ = '\t'; break;
1776         case 'b':
1777                 *cp++ = '\b'; break;
1778         case 'q':
1779                 *cp++ = quote; break;
1780         case 'f':
1781                 *cp++ = '\f'; break;
1782         case 'v':
1783                 *cp++ = '\v'; break;
1784         case 'a':
1785                 *cp++ = '\a'; break;
1786         case '0':
1787         case '1':
1788         case '2':
1789         case '3':
1790                 // 3 digit octal number
1791                 if (i+2 >= t.len)
1792                         goto err;
1793                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1794                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1795                         goto err;
1796                 n = (t.txt[i  ]-'0') * 64 +
1797                     (t.txt[i+1]-'0') *  8 +
1798                     (t.txt[i+2]-'0') *  1;
1799                 *cp++ = n;
1800                 i += 2;
1801                 break;
1802         case 'x':
1803                 // 2 hex digits
1804                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1805                 if (n < 0)
1806                         goto err;
1807                 *cp++ = n;
1808                 i += 2;
1809                 break;
1810         case 'u':
1811         case 'U':
1812                 // 4 or 8 hex digits for unicode
1813                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1814                 if (n < 0)
1815                         goto err;
1816                 memset(&pstate, 0, sizeof(pstate));
1817                 n = wcrtomb(cp, n, &pstate);
1818                 if (n <= 0)
1819                         goto err;
1820                 cp += n;
1821                 i += c == 'u' ? 4 : 8;
1822                 break;
1823         default:
1824                 if (c == escape)
1825                         *cp++ = c;
1826                 else if (is_newline(c))
1827                         at_sol = 1;
1828                 else
1829                         goto err;
1830         }
1831
1832 ###### string vars
1833         long n;
1834         mbstate_t pstate;
1835
1836 For `\x` `\u` and `\U` we need to collect a specific number of
1837 hexadecimal digits
1838
1839 ###### string functions
1840
1841         static long take_hex(int digits, char *cp, int l)
1842         {
1843                 long n = 0;
1844                 if (l < digits)
1845                         return -1;
1846                 while (digits) {
1847                         char  c = *cp;
1848                         int d;
1849                         if (!isxdigit(c))
1850                                 return -1;
1851                         if (isdigit(c))
1852                                 d = c - '0';
1853                         else if (isupper(c))
1854                                 d = 10 + c - 'A';
1855                         else
1856                                 d = 10 + c - 'a';
1857                         n = n * 16 + d;
1858                         digits--;
1859                         cp++;
1860                 }
1861                 return n;
1862         }
1863
1864 #### File: libstring.c
1865
1866 String parsing goes in `libstring.c`
1867
1868         #include <unistd.h>
1869         #include <stdlib.h>
1870         #include <stdio.h>
1871         #include <string.h>
1872         #include <ctype.h>
1873         #include <wchar.h>
1874         #include "mdcode.h"
1875         #include "scanner.h"
1876         ## string functions
1877         ## string main
1878
1879 ###### File: string.h
1880         int string_parse(struct token *tok, char escape,
1881                          struct text *str, char tail[3]);
1882
1883 ###### File: scanner.mk
1884         all :: libstring.o
1885         libstring.o : libstring.c
1886                 $(CC) $(CFLAGS) -c libstring.c
1887
1888
1889 ## Testing
1890
1891 As "untested code is buggy code" we need a program to easily test
1892 the scanner library.  This will simply parse a given file and report
1893 the tokens one per line.
1894
1895 ###### File: scanner.c
1896
1897         #include <unistd.h>
1898         #include <stdlib.h>
1899         #include <fcntl.h>
1900         #include <errno.h>
1901         #include <sys/mman.h>
1902         #include <string.h>
1903         #include <stdio.h>
1904         #include <gmp.h>
1905         #include <locale.h>
1906         #include "mdcode.h"
1907         #include "scanner.h"
1908         #include "number.h"
1909         #include "string.h"
1910
1911         static int errs;
1912         static void pr_err(char *msg)
1913         {
1914                 errs++;
1915                 fprintf(stderr, "%s\n", msg);
1916         }
1917
1918         int main(int argc, char *argv[])
1919         {
1920                 int fd;
1921                 int len;
1922                 char *file;
1923                 struct token_state *state;
1924                 const char *known[] = {
1925                         "==",
1926                         "else",
1927                         "if",
1928                         "then",
1929                         "while",
1930                         "{",
1931                         "}",
1932                 };
1933                 struct token_config conf = {
1934                         .word_start = "_$",
1935                         .word_cont = "",
1936                         .words_marks = known,
1937                         .number_chars = "., _+-",
1938                         .known_count = sizeof(known)/sizeof(known[0]),
1939                         .ignored = (0 << TK_line_comment)
1940                                   |(0 << TK_block_comment),
1941                 };
1942                 struct section *table, *s, *prev;
1943                 setlocale(LC_ALL,"");
1944                 if (argc != 2) {
1945                         fprintf(stderr, "Usage: scanner file\n");
1946                         exit(2);
1947                 }
1948                 fd = open(argv[1], O_RDONLY);
1949                 if (fd < 0) {
1950                         fprintf(stderr, "scanner: cannot open %s: %s\n",
1951                                 argv[1], strerror(errno));
1952                         exit(1);
1953                 }
1954                 len = lseek(fd, 0, 2);
1955                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1956                 table = code_extract(file, file+len, pr_err);
1957
1958                 for (s = table; s;
1959                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
1960                         printf("Tokenizing: %.*s\n", s->section.len,
1961                                 s->section.txt);
1962                         state = token_open(s->code, &conf);
1963                         while(1) {
1964                                 struct token tk = token_next(state);
1965                                 printf("%d:%d ", tk.line, tk.col);
1966                                 token_trace(stdout, tk, 20);
1967                                 if (tk.num == TK_number) {
1968                                         mpq_t num;
1969                                         char tail[3];
1970                                         if (number_parse(num, tail,tk.txt)) {
1971                                                 printf(" %s ", tail);
1972                                                 mpq_out_str(stdout, 10, num);
1973                                                 mpq_clear(num);
1974                                         } else
1975                                                 printf(" BAD NUMBER");
1976                                 }
1977                                 if (tk.num == TK_string ||
1978                                     tk.num == TK_multi_string) {
1979                                         char esc = '\\';
1980                                         struct text str;
1981                                         char tail[3];
1982                                         if (tk.txt.txt[0] == '`')
1983                                                 esc = 0;
1984                                         if (string_parse(&tk, esc,
1985                                                          &str, tail)) {
1986                                                 printf(" %s ", tail);
1987                                                 text_dump(stdout, str, 20);
1988                                                 free(str.txt);
1989                                         } else
1990                                                 printf(" BAD STRING");
1991                                 }
1992                                 printf("\n");
1993                                 if (tk.num == TK_error)
1994                                         errs = 1;
1995                                 if (tk.num == TK_eof)
1996                                         break;
1997                         }
1998                 }
1999                 exit(!!errs);
2000         }
2001 ###### File: scanner.mk
2002         scanner.c : scanner.mdc
2003                 ./md2c scanner.mdc
2004         all :: scanner
2005         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2006                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2007                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2008         scanner.o : scanner.c
2009                 $(CC) $(CFLAGS) -c scanner.c
2010