ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters is assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {                          /* one token, as returned by token_next() */
  36                 int               num;          /* numeric identifier of the token */
  37                 struct code_node *node;         /* code block the token was found in */
  38                 struct text       txt;          /* start and length of the token text ("mdcode" struct text) */
  39                 int               line, col;    /* line and character within that block */
  40         };
  41         struct token_state;                     /* only declared here; defined under "private types" */
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)  /* main interface: returns the next token */
  53         {
  54                 ## token_next init
  55                 while (1) {                     /* spliced fragments either return a token or `continue` */
  56                         wint_t ch;              /* character currently being examined */
  57                         struct token tk;        /* token under construction */
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There is a fixed set of token types, though particular tokens within
  70 those types can be distinguished via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored token types: bit (1 << TK_xxx) set means TK_xxx is ignored.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88
  89 The different tokens are numbers, words, marks, strings, comments,
  90 newlines, EOF, and indents, each of which is examined in detail below.
  91
  92 There are various cases where no token can be found in part of the
  93 input.  All of these will be reported as a `TK_error` token.
  94
  95 It is possible to declare a number of strings which form distinct
  96 tokens (rather than being grouped as e.g. 'word').  These are given
  97 token numbers from `TK_reserved` upwards.
  98
  99 ###### public types
 100         enum token_num {
 101                 TK_error,       /* reported where no token can be found in part of the input */
 102                 ## token types
 103                 TK_reserved     /* declared known strings are numbered from here upwards */
 104         };
 105
 106 ### Numbers
 107
 108 Numbers are the messiest tokens to parse, primarily because they can
 109 contain characters that also have meaning outside of numbers and,
 110 particularly, immediately after numbers.
 111
 112 The obvious example is the '`-`' sign.  It can come inside a number for
 113 a negative exponent, or after a number as a subtraction operator.  To
 114 be sure we have parsed as best as possible we need to only allow the
 115 '`-`' inside a number if it is after an exponent character.  This can be
 116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 117 digit, so we don't allow '`-`' after just any `e`.
 118
 119 To make matters worse, our language designer has decided to experiment
 120 with allowing commas to be used as the decimal indicator, and spaces
 121 to be used to separate groups of digits in large numbers.  Both of
 122 these can reasonably be restricted to appear between two digits, so we
 123 have to add that condition to our tests.
 124
 125 So we cannot just treat numbers as starting with a digit and being
 126 followed by some set of characters.  We need more structure than that.
 127
 128 So:
 129
 130 - Numbers must start with a digit.
 131 - If the first digit is zero, the next character must be a base
 132   signifier (one of `xob`) or a decimal marker (`.` or `,`).
 133   In the first case the first `p` or `P` may be followed by a sign.
 134 - If the number doesn't start with `0` followed by one of `xob`, the
 135   first `e` may be followed by a sign.
 136 - Any digit or hex digit may be followed by a space or underscore
 137   providing that the subsequent character is also a (hex) digit.
 138   This rule will require an extra level of 'unget' to be
 139   supported when handling characters.
 140 - Otherwise any digits or ASCII letters are allowed.  We do not at
 141   this point check that the digits given are permitted by the base.
 142   That will happen when the token is converted to a number.
 143
 144 To allow easy configuration, the various non alphanumeric characters
 145 are only permitted if they are listed in a configuration parameter.
 146
 147 ###### token config parameters
 148         char *number_chars;
 149
 150 Note that numbers may not start with a period, so `.75` is not a
 151 number.  This is not the norm, but is not unheard of.  Excluding these
 152 numbers simplifies the rule at very little cost.
 153
 154 ###### token types
 155         TK_number,
 156
 157 If TK_number is ignored, digits will result in an error unless they
 158 are declared to be a start character for words.
 159
 160 ###### includes
 161
 162         #include <string.h>
 163
 164 ###### parse number
 165
 166         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {     /* a number starts with a digit, unless numbers are ignored */
 167                 int prev_special = 0;   /* previous char was a non-alphanumeric taken from number_chars */
 168                 int expect_p = 0;       /* 0: 'e' may take a sign, 1: 'p' may, -1: a sign was already used */
 169                 int decimal_mark = 0;   /* a '.' or ',' has already been seen */
 170                 if (ch == '0') {
 171                         wchar_t ch2 = get_char(state);  /* peek for a base signifier */
 172                         if (ch2 > 0 && ch2 < 0x80 && strchr("xobXOB", ch2) != NULL)
 173                                 expect_p = 1;   /* ASCII guard above: strchr() narrows to char and matches NUL */
 174                         unget_char(state);
 175                 }
 176                 while (1) {
 177                         int sign_ok = 0;        /* may the next char be '+' or '-'? */
 178                         switch(expect_p) {
 179                         case 0:
 180                                 if (ch == 'e' || ch == 'E')
 181                                         sign_ok = 1;
 182                                 break;
 183                         case 1:
 184                                 if (ch == 'p' || ch == 'P')
 185                                         sign_ok = 1;
 186                                 break;
 187                         }
 188                         save_unget_state(state);        /* so two characters can be given back below */
 189                         ch = get_char(state);
 190                         if (iswalnum(ch)) {
 191                                 prev_special = 0;
 192                                 continue;
 193                         }
 194                         if (ch == '+' || ch == '-') {
 195                                 if (!sign_ok)
 196                                         break;
 197                                 expect_p = -1;  /* only one sign per number */
 198                         }
 199                         if (ch == '.' || ch == ',') {
 200                                 if (decimal_mark)
 201                                         break;
 202                                 decimal_mark = 1;
 203                         }
 204                         if (prev_special) {
 205                                 /* Two specials in a row: don't allow the
 206                                  * earlier one either, need two 'ungets'
 207                                  */
 208                                 restore_unget_state(state);
 209                                 break;
 210                         }
 211                         if (ch > 0 && ch < 0x80 && strchr(state->conf->number_chars, ch)) {
 212                                 prev_special = 1;       /* ASCII guard above: strchr() narrows to char and matches NUL */
 213                                 continue;
 214                         }
 215                         /* non-number char */
 216                         break;
 217                 }
 218                 /* We seem to have a "number" token */
 219                 unget_char(state);
 220                 close_token(state, &tk);
 221                 tk.num = TK_number;
 222                 return tk;
 223         }
 224
 225 ### Words
 226 Words start with a "start" character followed by the longest
 227 sequence of "continue" characters.  The Unicode ID_START and
 228 ID_CONTINUE sets are always permitted, but other ASCII characters
 229 can be added to these sets.
 230
 231 ###### token config parameters
 232         char *word_start;
 233         char *word_cont;
 234
 235 ###### internal functions
 236         static int is_word_start(wchar_t ch, struct token_config *conf)  /* non-zero if ch may begin a word */
 237         {
 238                 return iswalpha(ch) ||     /* strchr() narrows ch to char and matches the NUL terminator, so only ask it about ASCII */
 239                        (ch > 0 && ch < 0x80 && strchr(conf->word_start, ch) != NULL) ||
 240                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 241         }
 242
 243         static int is_word_continue(wchar_t ch, struct token_config *conf)  /* non-zero if ch may continue a word */
 244         {
 245                 return iswalnum(ch) ||     /* strchr() narrows ch to char and matches the NUL terminator, so only ask it about ASCII */
 246                        (ch > 0 && ch < 0x80 && strchr(conf->word_cont, ch) != NULL) ||
 247                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 248         }
 249
 250 Words can be either known or unknown.  Known words are referred to as
 251 "reserved words" and get a unique token number.  Unknown words are
 252 "identifiers" and are syntactically a single token.
 253
 254 ###### token types
 255         TK_ident,
 256
 257 A list of known words must be provided.  This list is shared with the
 258 "marks" which are described next.  The list must be lexically sorted
 259 and the length of the list must be given (`known_count`).
 260 Tokens matching these known words are reported as the index of the
 261 list added to `TK_reserved`.
 262
 263 If identifiers are ignored, then any word which is not listed as a
 264 known word results in an error.
 265
 266 ###### token config parameters
 267         const char **words_marks;
 268         int known_count;
 269
 270 ###### parse word
 271
 272         if (is_word_start(ch, state->conf)) {
 273                 int n;  /* result of find_known(); >= 0 means a known word */
 274                 /* A word: identifier or reserved.  Take the longest run of continue characters. */
 275                 do
 276                         ch = get_char(state);
 277                 while (is_word_continue(ch, state->conf));
 278                 unget_char(state);      /* give back the first non-continue character */
 279                 close_token(state, &tk);
 280                 tk.num = TK_ident;
 281                 if (ignored & (1<<TK_ident))
 282                         tk.num = TK_error;      /* unknown words are errors when identifiers are ignored */
 283                 n = find_known(state->conf, tk.txt);
 284                 if (n >= 0)
 285                         tk.num = TK_reserved + n;       /* a known word overrides TK_ident / TK_error */
 286                 return tk;
 287         }
 288
 289 ### Marks
 290
 291 Marks are generally one or more punctuation marks joined together.  It
 292 would be nice to use the term "symbol" for these, but that causes
 293 confusion in a subsequent discussion of the grammar, which has terminal
 294 symbols and non-terminal symbols which are conceptually quite
 295 different.  So strings of punctuation characters will be marks.
 296
 297 A "mark" consists of ASCII characters that are not white space, are not
 298 "start" characters for words, and are not digits.
 299 These will collectively be called mark characters.
 300
 301 ###### internal functions
 302         static int is_mark(wchar_t ch, struct token_config *conf)  /* non-zero if ch is a mark character */
 303         {
 304                 return ch > ' ' &&              /* above space ... */
 305                        ch < 0x7f &&             /* ... and below DEL, so strchr() below only ever sees ASCII */
 306                        !iswalnum(ch) &&         /* not a letter or digit */
 307                        strchr(conf->word_start, ch) == NULL;    /* not configured as a word start */
 308         }
 309
 310 As with words, there can be known and unknown marks, though the rules
 311 are slightly different.
 312
 313 Two marks do not need to be separated by a non-mark character.  This
 314 is different from words which do need to be separated by at least one
 315 non-continue character.
 316
 317 The scanner will normally prefer longer sequences of mark characters,
 318 but will more strongly prefer known marks over unknown marks.  So if
 319 it finds a known mark where adding one more character does not result
 320 in a known mark, it will return that first known mark.
 321
 322 If no known mark is found we will test against strings and comments
 323 below before giving up and assuming an unknown mark.
 324
 325 If an unknown mark contains a quote character or a comment marker, and
 326 that token is not being ignored, then we terminate the unknown mark
 327 before that quote or comment.  This ensures that an unknown mark
 328 immediately before a string is handled correctly.
 329
 330 If the first character of a comment marker (i.e. '/') is a known mark,
 331 the above rules would suggest that the start of a comment would be
 332 parsed as that mark, which is not what is wanted.  So the introductory
 333 sequences for a comment ("//" and "/*") are treated as
 334 partially-known.  They prevent the leading "/" from being a mark by
 335 itself, but do not actually constitute a stand-alone mark.
 336
 337 If `TK_mark` is ignored, then unknown marks are returned as errors.
 338
 339 ###### token types
 340         TK_mark,
 341
 342 Known marks are included in the same list as the list of known words.
 343
 344 ###### parse mark
 345         tk.num = TK_error;      /* TK_error here means "no known mark found yet" */
 346         while (is_mark(ch, state->conf)) {
 347                 int n;          /* result of find_known(); >= 0 means the text so far is a known mark */
 348                 wchar_t prev;   /* the mark character before the one just read */
 349                 close_token(state, &tk);
 350                 n = find_known(state->conf, tk.txt);
 351                 if (n >= 0)
 352                         tk.num = TK_reserved + n;
 353                 else if (tk.num != TK_error) {
 354                         /* found a longest-known-mark (one more char is not
 355                          * known), still need to check for comments
 356                          */
 357                         if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
 358                             (ch == '/' || ch == '*')) {
 359                                 /* Yes, this is a comment, not a '/' */
 360                                 restore_unget_state(state);
 361                                 tk.num = TK_error;
 362                                 break;
 363                         }
 364                         unget_char(state);      /* drop the char that broke the known mark */
 365                         close_token(state, &tk);
 366                         return tk;
 367                 }
 368                 prev = ch;
 369                 save_unget_state(state);
 370                 ch = get_char(state);
 371                 if (!(ignored & (1<<TK_string)) && is_quote(ch))       /* bit test: a quote ends the mark unless strings are ignored */
 372                         break;
 373                 if (prev == '#' && n < 0)
 374                         /* '#' is not a known mark, so assume it is a comment */
 375                         break;
 376                 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {    /* "//" with '/' not a known mark */
 377                         close_token(state, &tk);
 378                         restore_unget_state(state);
 379                         break;
 380                 }
 381                 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {    /* same for the block comment opener */
 382                         close_token(state, &tk);
 383                         restore_unget_state(state);
 384                         break;
 385                 }
 386         }
 387         unget_char(state);
 388         if (tk.num != TK_error) {       /* a known mark was found */
 389                 close_token(state, &tk);
 390                 return tk;
 391         }
 392
 393 If we don't find a known mark, we will check for strings and comments
 394 before assuming that we have an unknown mark
 395
 396 ###### parse mark
 397         ## parse string
 398         ## parse comment
 399         ## unknown mark
 400
 401 ###### unknown mark
 402         if (tk.txt.len) {       /* some mark characters were collected but match nothing known */
 403                 if (ignored & (1<<TK_mark))
 404                         tk.num = TK_error;      /* unknown marks are errors when TK_mark is ignored */
 405                 else
 406                         tk.num = TK_mark;
 407                 return tk;
 408         }
 409
 410 ### Strings
 411
 412 Strings start with one of single quote, double quote, or back quote
 413 and continue until a matching character on the same line.  Any of
 414 these characters can be included in the list of known marks and then
 415 they will not be used for identifying strings.
 416
 417 Immediately following the close quote, one or two ASCII letters may
 418 appear.  These are somewhat like the arbitrary letters allowed in
 419 "Numbers" above.  They can be used by the language in various ways.
 420
 421 If 3 identical quote characters appear in a row and are
 422 followed by a newline, then this forms a multi-line string which
 423 continues until an identical triple quote appears on a line preceded
 424 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 425
 426 Multi-line strings may not extend beyond the end of the `code_node` in
 427 which they start.
 428
 429 Normal strings and multi-line strings are encoded as two different
 430 token types.
 431
 432 ###### token types
 433         TK_string,
 434         TK_multi_string,
 435
 436 ###### internal functions
 437         static int is_quote(wchar_t ch)  /* non-zero if ch can open a string */
 438         {
 439                 return ch == '\'' || ch == '"' || ch == '`';    /* single quote, double quote, or back quote */
 440         }
 441
 442 #### Multi-line strings
 443
 444 The multi-line string is checked for first.  If they are being
 445 ignored, we fall through and treat a triple quote as an empty string
 446 followed by the start of a new string.
 447
 448 ###### parse string
 449         if (tk.txt.len == 3 &&                                  /* exactly three mark characters collected */
 450             !(ignored & (1 << TK_multi_string)) &&
 451             is_quote(tk.txt.txt[0]) &&
 452             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&         /* all three are the same character */
 453             is_newline(tk.txt.txt[3])) {                        /* NOTE(review): index 3 is one past the token text - confirm it is always readable */
 454                 // triple quote followed by a newline: a multi-line string
 455                 wchar_t first = tk.txt.txt[0];  /* the quote character that must close the string */
 456                 int qseen = 0;                  /* closing quote characters counted on the current line */
 457                 int at_sol = 1;                 /* nothing but spaces, tabs and quotes seen on this line so far */
 458                 while (!at_eon(state) && qseen < 3) {
 459                         ch = get_char(state);
 460                         if (is_newline(ch)) {
 461                                 at_sol = 1;
 462                                 qseen = 0;
 463                         } else if (at_sol && ch == first) {
 464                                 qseen += 1;
 465                         } else if (ch != ' ' && ch != '\t') {   /* spaces and tabs leave both counters unchanged */
 466                                 at_sol = 0;
 467                                 qseen = 0;
 468                         }
 469                 }
 470                 if (qseen != 3) {
 471                         /* Hit end of node - error.
 472                          * unget so the newline is seen,
 473                          * but return rest of string as an error.
 474                          */
 475                         if (is_newline(ch))
 476                                 unget_char(state);
 477                         close_token(state, &tk);
 478                         tk.num = TK_error;
 479                         return tk;
 480                 }
 481                 /* 2 letters are allowed after the closing quotes */
 482                 ch = get_char(state);
 483                 if (iswalpha(ch))
 484                         ch = get_char(state);
 485                 if (iswalpha(ch))
 486                         ch = get_char(state);
 487                 /* Now we must have a newline, but we don't return it
 488                  * whatever it is.*/
 489                 unget_char(state);
 490                 close_token(state, &tk);
 491                 tk.num = TK_multi_string;
 492                 if (!is_newline(ch))
 493                         tk.num = TK_error;      /* something other than a newline followed the suffix */
 494                 return tk;
 495         }
 496
 497 #### Single-line strings
 498
 499 The sequence of marks collected may be more than a single-line
 500 string, so we reset to the start and collect characters until
 501 we find a close quote or a newline.
 502
 503 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 504
 505 ###### parse string
 506         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&            /* collected marks begin with a quote ... */
 507             !(ignored & (1<<TK_string))) {                      /* ... and strings are not ignored */
 508                 wchar_t first = tk.txt.txt[0];  /* the quote character that must close the string */
 509                 reset_token(state, &tk);        /* the marks collected may extend past the string */
 510                 ch = get_char(state);           /* presumably re-reads the opening quote after the reset */
 511                 tk.num = TK_error;              /* stays an error unless a closing quote is found */
 512                 while (!at_eon(state) && !is_newline(ch)) {
 513                         ch = get_char(state);
 514                         if (ch == first) {
 515                                 tk.num = TK_string;
 516                                 break;
 517                         }
 518                         if (is_newline(ch)) {
 519                                 unget_char(state);      /* the newline is not part of the token */
 520                                 break;
 521                         }
 522                 }
 523                 close_token(state, &tk);
 524                 return tk;
 525         }
 526
 527 ### Comments
 528
 529 Single line comments may start with '`//`' or '`#`' providing that these
 530 are not known marks.  They continue to the end of the line.
 531
 532 Block comments start with '`/*`' if this is not a known mark.  They
 533 continue to the first occurrence of '`*/`' and may not contain any
 534 occurrence of '`/*`'.
 535
 536 Block comments can be wholly within one line or can continue over
 537 multiple lines.  The multi-line version should be followed immediately
 538 by a newline.  The Linux kernel contains over 285000 multi-line
 539 comments and only 34 are followed by characters other than white space
 540 (which should be removed) or a backslash (only needed in macros).  So
 541 it would not suffer from this rule.
 542
 543 These two comment types are reported as two separate token types, and
 544 consequently can be ignored separately.  When ignored a comment is
 545 still parsed, but is discarded.
 546
 547 ###### token types
 548         TK_line_comment,
 549         TK_block_comment,
 550
 551 ###### internal functions
 552         static int is_line_comment(struct text txt)  /* non-zero if txt starts with '#' or "//" */
 553         {
 554                 return (txt.len >= 1 && txt.txt[0] == '#') ||   /* lengths are checked before each index is read */
 555                        (txt.len >= 2 && txt.txt[0] == '/' &&
 556                                         txt.txt[1] == '/');
 557         }
 558
 559         static int is_block_comment(struct text txt)  /* non-zero if txt starts with the block comment opener */
 560         {
 561                 return txt.len >= 2 && txt.txt[0] == '/' &&     /* length is checked before the second char is read */
 562                        txt.txt[1] == '*';
 563         }
 564
 565 #### Single line comments
 566
 567 A single-line comment continues up to, but not including the newline
 568 or end of node.
 569
 570 ###### parse comment
 571
 572         if (is_line_comment(tk.txt)) {
 573                 while (!is_newline(ch) && !at_eon(state))       /* run to the newline or the end of the node */
 574                         ch = get_char(state);
 575                 if (is_newline(ch))
 576                         unget_char(state);      /* the newline is not part of the comment */
 577                 close_token(state, &tk);
 578                 tk.num = TK_line_comment;
 579                 if (ignored & (1 << TK_line_comment))
 580                         continue;               /* parsed but discarded: go on to the next token */
 581                 return tk;
 582         }
 583
 584 #### Block comments
 585
 586 The token text collected so far could exceed the comment, so we need
 587 to reset it first.
 588
 589 If we find an embedded `/*` we reset to just before the '/' and report
 590 an error.  That way the next thing to be parsed will be the rest of
 591 the comment.  This requires a double unget, so we need to save/restore
 592 the unget state (explained later).
 593
 594 ###### parse comment
 595
 596         if (is_block_comment(tk.txt)) {
 597                 wchar_t prev;           /* character read just before ch */
 598                 int newlines = 0;       /* set once the comment is seen to contain a newline */
 599                 reset_token(state, &tk);        /* the text collected so far could exceed the comment */
 600                 get_char(state);        /* presumably re-reads the two opener characters */
 601                 get_char(state);
 602                 save_unget_state(state);
 603                 ch = get_char(state);
 604                 prev = 0;
 605                 while (!at_eon(state) &&
 606                        (prev != '/' || ch != '*') &&    /* stop at an embedded opener ... */
 607                        (prev != '*' || ch != '/')) {    /* ... or at the terminator */
 608                         if (is_newline(ch))
 609                                 newlines = 1;
 610                         prev = ch;
 611                         save_unget_state(state);
 612                         ch = get_char(state);
 613                 }
 614                 close_token(state, &tk);
 615                 if (at_eon(state)) {    /* NOTE(review): also true if the terminator ends exactly at end of node? confirm at_eon() semantics */
 616                         tk.num = TK_error;
 617                         return tk;
 618                 }
 619                 if (prev == '/') {
 620                         /* embedded opener.  Need to unget twice! */
 621                         restore_unget_state(state);
 622                         unget_char(state);
 623                         tk.num = TK_error;
 624                         return tk;
 625                 }
 626                 tk.num = TK_block_comment;
 627                 if (newlines && !(ignored & (1<<TK_newline))) {
 628                         /* multi-line comment: next char must be newline */
 629                         ch = get_char(state);
 630                         unget_char(state);
 631                         if (!is_newline(ch))
 632                                 tk.num = TK_error;
 633                 }
 634                 if (tk.num == TK_error ||
 635                     !(ignored & (1 << TK_block_comment)))
 636                         return tk;      /* errors are returned even when block comments are ignored */
 637                 continue;               /* ignored comment: discard it and scan the next token */
 638         }
 639
 640 ### Indents, Newlines, and White Space.
 641
 642 Normally white space is ignored.  However newlines can be important as
 643 can indents, which are either after a newline or at the start of a
 644 node (detected by `at_son()`).
 645
 646 ###### exported functions
 647         static inline int is_newline(wchar_t ch)  /* non-zero if ch ends a line */
 648         {
 649                 return ch == '\n' || ch == '\f' || ch == '\v';  /* line feed, form feed, or vertical tab */
 650         }
 651
 652 ###### white space
 653         if (ch <= ' ' && !is_newline(ch)        /* white space other than a newline ... */
 654             && ! at_son(state))                 /* ... and not at the start of a node */
 655                 continue;                       /* is ignored: go on to the next character */
 656
 657 If a line starts with more white-space than the previous non-blank
 658 line - or if the first non-blank line in the document starts with any
 659 white-space - then an "IN" is reported at the start of the line.
 660
 661 Before the next non-blank line which starts with less white space, or
 662 at the latest at the end of the document, a matching "OUT" token
 663 is reported.  There will always be an exact match between "IN" and
 664 "OUT" tokens.
 665
 666 It is possible for "OUT" to be followed (almost) immediately by an
 667 "IN".  This happens if, for example, the indent of three consecutive
 668 lines are 0, 8, 4 spaces.  Before the second line we report an
 669 "IN".  Before the third line we must report an "OUT", as 4 is less
 670 than 8, then also an "IN" as 4 is greater than 0.
 671
 672 ###### token types
 673         TK_in,
 674         TK_out,
 675
 676 For the purpose of measuring the length of white space, a tab adds at
 677 least one space, and rounds up to a multiple of 8.
 678
 679 ###### exported functions
 680         static inline int indent_tab(int indent)  /* indent after a tab is applied at column `indent` */
 681         {
 682                 return (indent|7)+1;    /* set the low three bits, then add one: the next multiple of 8 above a non-negative indent */
 683         }
 684
 685 We need to track the current levels of indent.  This requires some
 686 sort of stack as indent levels are pushed on and popped off.  In
 687 practice this stack is unlikely to often exceed 5 so we will use a
 688 fixed stack of 20 indent levels.  More than this will be silently
 689 ignored.
 690
 691 ###### state fields
 692         int     indent_level;
 693         int     indent_sizes[20];
 694
 695 #### Newlines
 696
 697 Newlines can optionally be reported.  Newlines within a block comment
 698 or a multi-line string are not reported separately, but each of these
 699 must be followed immediately by a newline so these constructs cannot
 700 hide the fact that a newline was present.
 701
 702 When indents are being reported, the Newline which would normally be
 703 reported immediately before the "IN" is delayed until after the
 704 matching "OUT".  This makes an indented section act like a
 705 continuation of the previous line to some extent.
 706
 707 A blank line would normally be reported simply as two consecutive Newline
 708 tokens.  However if the subsequent line is indented (and indents are being
 709 reported) then the right thing to do is less obvious as Newlines should be
 710 delayed - but how many Newlines?
 711
 712 The approach we will take is to report the extra Newlines immediately after
 713 the IN token, so the blank line is treated as though it were an indented
 714 blank line.
 715
 716 ###### token types
 717         TK_newline,
 718
 719 If we find a newline or white space at the start of a block, we keep
 720 collecting spaces, tabs, and newlines until we find some real text.
 721 Then depending on the indent we generate some number of tokens.  These
 722 will be a sequence of "Newline OUT" pairs representing a decrease
 723 in indent, then either a Newline or an IN depending on whether the
 724 next line is indented, then zero or more Newlines representing all the
 725 blank lines that have been skipped.
 726
 727 When a Newline leads to the next block of code there is a question of
 728 whether the various Newline and OUT/IN tokens should appear to
 729 belong to the earlier or later block.  This is addressed by processing
 730 the tokens in two stages based on the relative indent levels of the
 731 two blocks (each block has a base indent to which the actual indents
 732 are added).
 733
 734 Any "Newline OUT" pairs needed to reduce the current indent to the
 735 maximum of the base indents of the old and new blocks are generated
 736 against the old block.  Then if the next block does not have an
 737 increased indent, one more "Newline" is generated.
 738
 739 If further "Newline OUT" pairs are needed to get to the indent
 740 level of the 'next' block, they are generated against that block,
 741 though the first Newline is suppressed (it having already been
 742 generated).
 743
 744 Finally the Newline or IN for the first line of the new block is
 745 generated, unless the Newline needs to be suppressed because it
 746 appeared at the end of the previous block.
 747
 748 This means that a block may start with an OUT or an IN, but
 749 will only start with a Newline if it actually starts with a blank
 750 line.
 751
 752 We will need to represent in the `token_state` where in this sequence
 753 of delayed tokens we are.  As `state.col` records the target indent we
 754 don't need to record how many OUTs or INs are needed.  We do
 755 need to record the number of blank lines, and which of Newline and
 756 OUT is needed next in the initial sequence of pairs.
 757
 758 For this we store one more than the number of blank lines as
 759 `delayed_lines` and a flag for `out_next`.
 760
 761 ###### state fields
 762         int check_indent;
 763         int delayed_lines;
 764         int out_next;
 765
 766 Generating these tokens involves two separate pieces of code.
 767
 768 Firstly we need to recognise white space and count the indents and
 769 newlines.  These are recorded in the above state fields.
 770
 771 Separately, on each call to `token_next`, we need to check if
 772 there are some delayed tokens and if so we need to advance the state
 773 information and return one token.
 774
 775 ###### white space
 776         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 777                 int newlines = 0;
 778                 int was_son = at_son(state);
 779                 if (ignored & (1<<TK_in)) {
                             // Indents are being ignored: plain white space is
                             // skipped, and a newline is only reported when
                             // TK_newline is not ignored as well.
 780                         if (!is_newline(ch))
 781                                 continue;
 782                         if (ignored & (1<<TK_newline))
 783                                 continue;
 784                         tk.num = TK_newline;
 785                         close_token(state, &tk);
 786                         return tk;
 787                 }
 788                 // Indents are needed, so check all white space.
 789                 while (ch <= ' ' && !at_eon(state)) {
 790                         if (is_newline(ch))
 791                                 newlines += 1;
 792                         ch = get_char(state);
 793                 }
 794                 if (at_eon(state)) {
                             // Reached the end of the node: count one more
                             // newline and take the target indent from the next
                             // node's base indent if that is larger, otherwise
                             // from this node's base indent.
 795                         newlines += 1;
 796                         if (state->node->next &&
 797                             state->node->next->indent > state->node->indent)
 798                                 state->col = state->node->next->indent;
 799                         else
 800                                 state->col = state->node->indent;
 801                 } else
                             // Stopped on a non-space character: put it back.
 802                         unget_char(state);
                     // Record what was seen; the "delayed tokens" code turns
                     // this into Newline/IN/OUT tokens on later passes.
 803                 state->delayed_lines = newlines;
 804                 state->out_next = was_son;
 805                 state->check_indent = 1;
 806                 continue;
 807         }
 808
 809
 810 ###### delayed tokens
 811
 812         if (state->check_indent || state->delayed_lines) {
                     // state->col holds the target indent; compare it with the
                     // indent on top of the stack and emit one token per pass.
 813                 if (state->col < state->indent_sizes[state->indent_level]) {
                             // Indent decreased: alternate Newline and OUT
                             // (Newline is skipped when it is being ignored).
 814                         if (!state->out_next &&
 815                             !(ignored & (1<<TK_newline))) {
 816                                 state->out_next = 1;
 817                                 tk.num = TK_newline;
 818                                 return tk;
 819                         }
 820                         state->indent_level -= 1;
 821                         state->out_next = 0;
 822                         tk.num = TK_out;
 823                         return tk;
 824                 }
                     // Indent increased: push the new level, provided the stack
                     // has room.  The limit must be the number of elements in
                     // indent_sizes[], not its size in bytes.
 825                 if (state->col > state->indent_sizes[state->indent_level] &&
 826                     state->indent_level < (int)(sizeof(state->indent_sizes) /
                                                 sizeof(state->indent_sizes[0])) - 1) {
 827                         state->indent_level += 1;
 828                         state->indent_sizes[state->indent_level] = state->col;
                             // The IN takes the place of one pending Newline.
 829                         state->delayed_lines -= 1;
 830                         tk.num = TK_in;
 831                         return tk;
 832                 }
                     // Indent now matches: only pending Newlines remain.
 833                 state->check_indent = 0;
 834                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 835                         tk.num = TK_newline;
 836                         state->delayed_lines -= 1;
 837                         return tk;
 838                 }
 839                 state->delayed_lines = 0;
 840                 continue;
 841         }
 842
 843 ### End of File
 844
 845 After the last newline in the file has been processed, a special
 846 end-of-file token will be returned.  Any further attempts to get more
 847 tokens will continue to return the same end-of-file token.
 848
 849 ###### token types
 850         TK_eof,
 851
 852
 853 ###### white space
 854         if (ch == WEOF) {
                     // If the column is non-zero, reset it to zero and go round
                     // again so the delayed-token code can compare the indent
                     // before TK_eof is reported.
 855                 if (state->col) {
 856                         state->col = 0;
 857                         state->check_indent = 1;
 858                         continue;
 859                 }
 860                 tk.num = TK_eof;
 861                 return tk;
 862         }
 863
 864 ### Unknown Marks, or errors.
 865
 866 We have now handled all the possible known mark-like tokens.
 867 If the token we have is not empty and `TK_mark` is allowed,
 868 we have an unknown mark, otherwise this must be an error.
 869
 870 ###### unknown mark
 871         /* one unknown character: close the token here and report it as an error */
 872         close_token(state, &tk);
 873         tk.num = TK_error;
 874         return tk;
 875
 876 ## Tools For The Task
 877
 878 You may have noticed that there are a few gaps we left in the above -
 879 functions used without first defining them.  Doing so above would have
 880 broken the flow.
 881
 882 ### Character by character
 883
 884 As we walk through the various `code_node`s we need to process whole
 885 Unicode codepoints, and keep track of which line and column we are on.
 886 We will assume for now that any printing character uses one column,
 887 though that is not true in general.
 888
 889 As the text in a `code_node` may include an indent that identifies it as
 890 being code, we need to be careful to strip that.  The `code_node` has
 891 a flag that tells us whether or not we need to strip.
 892
 893 ###### includes
 894         #include <memory.h>
 895
 896 ###### state fields
 897         struct code_node *node;
 898         int    offset;
 899         int    line;
 900         int    col;
 901
 902 ###### internal functions
 903
 904         static void do_strip(struct token_state *state)
 905         {
                     /* If the node is flagged as needing it, skip the indent
                      * at the current offset: up to four spaces, or, when no
                      * space was skipped, a single tab.  Never looks beyond
                      * the end of the node's text.
                      */
 906                 if (state->node->needs_strip) {
 907                         int n = 4;
 908                         while (n && state->offset < state->node->code.len &&
                                    state->node->code.txt[state->offset] == ' ') {
 909                                 state->offset += 1;
 910                                 n -= 1;
 911                         }
                             /* n is still 4 only if no space was stripped */
 912                         while (n == 4 && state->offset < state->node->code.len &&
                                    state->node->code.txt[state->offset] == '\t') {
 913                                 state->offset += 1;
 914                                 n -= 4;
 915                         }
 916                 }
 917         }
 918
 919         static wint_t get_char(struct token_state *state)
 920         {
                     /* Return the next character of the code, moving on to the
                      * next node that has text when the current one is used up.
                      * Returns WEOF when there is no node left.  Updates the
                      * offset, line and column held in 'state'.
                      */
 921                 wchar_t next;
 922                 size_t n;
 923                 mbstate_t mbstate;
 924
 925                 if (state->node == NULL)
 926                         return WEOF;
 927                 if (state->node->code.len <= state->offset) {
                             /* Current node exhausted: step to the next node
                              * with text and restart position tracking there.
                              */
 928                         do
 929                                 state->node = state->node->next;
 930                         while (state->node && state->node->code.txt == NULL);
 931                         state->offset = 0;
 932                         if (state->node == NULL)
 933                                 return WEOF;
 934                         do_strip(state);
 935                         state->line = state->node->line_no;
 936                         state->col = state->node->indent;
 937                 }
 938
 939                 ## before get_char
 940
                     /* Each character is decoded from a fresh conversion state */
 941                 memset(&mbstate, 0, sizeof(mbstate));
 942
 943                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 944                             state->node->code.len - state->offset,
 945                             &mbstate);
 946                 if (n == -2 || n == 0) {
 947                         /* Not enough bytes - not really possible */
                             /* Give up on the rest of the node and report a newline */
 948                         next = '\n';
 949                         state->offset = state->node->code.len;
 950                 } else if (n == -1) {
 951                         /* error */
                             /* Skip a single byte and report a placeholder */
 952                         state->offset += 1;
 953                         next = 0x7f; // an illegal character
 954                 } else
 955                         state->offset += n;
 956
                     /* Track the column: one per character from ' ' upwards,
                      * back to the node's indent after a newline, and to the
                      * value given by indent_tab() after a tab.
                      */
 957                 if (next >= ' ') {
 958                         state->col += 1;
 959                 } else if (is_newline(next)) {
 960                         state->line += 1;
 961                         state->col = state->node->indent;
 962                         do_strip(state);
 963                 } else if (next == '\t') {
 964                         state->col = indent_tab(state->col);
 965                 }
 966                 return next;
 967         }
 968
 969 We will sometimes want to "unget" the last character as it needs to be
 970 considered again as part of the next token.  So we need to store a
 971 'previous' version of all metadata.
 972
 973 ###### state fields
 974         int    prev_offset;
 975         int    prev_line;
 976         int    prev_col;
 977
 978 ###### before get_char
 979         state->prev_offset = state->offset;
 980         state->prev_line   = state->line;
 981         state->prev_col    = state->col;
 982
 983 ###### internal functions
 984
 985         static void unget_char(struct token_state *state)
 986         {
                     // Step back to the position saved before the last get_char.
 987                 if (!state->node)
 988                         return; // no current node, so nothing to restore
 989                 state->col    = state->prev_col;
 990                 state->line   = state->prev_line;
 991                 state->offset = state->prev_offset;
 992         }
 993
 994 We occasionally need a double-unget, particularly for numbers and
 995 block comments.  We don't impose this cost on all scanning, but
 996 require those code sections that need it to call `save_unget_state`
 997 before each `get_char`, and then `restore_unget_state` when a
 998 double-unget is needed.
 999
1000 ###### state fields
1001         int     prev_offset2;
1002         int     prev_line2;
1003         int     prev_col2;
1004
1005 ###### internal functions
1006         static void save_unget_state(struct token_state *state)
1007         {
                     // Copy the single-unget data into the second-level slot.
1008                 state->prev_col2    = state->prev_col;
1009                 state->prev_line2   = state->prev_line;
1010                 state->prev_offset2 = state->prev_offset;
1011         }
1012
1013         static void restore_unget_state(struct token_state *state)
1014         {
                     // Bring back the unget data stored by save_unget_state().
1015                 state->prev_col    = state->prev_col2;
1016                 state->prev_line   = state->prev_line2;
1017                 state->prev_offset = state->prev_offset2;
1018         }
1019
1020 At the start of a token we don't want to be at the end of a code block
1021 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
1022 single character.  This will move into the next non-empty code block
1023 and leave the current pointer at the start of it.
1024
1025 This has to happen _after_ dealing with delayed tokens as some of them
1026 must appear in the previous node.  When we do this, we need to reset
1027 the data in the token.
1028
1029 ###### delayed tokens
1030         if (at_eon(state)) {
                     // A get/unget pair moves us into the next node that has
                     // text (if any) without consuming anything from it.
1031                 get_char(state);
1032                 unget_char(state);
                     // Re-anchor the token at the new position.
1033                 tk.node = state->node;
1034                 if (state->node)
1035                         tk.txt.txt = state->node->code.txt + state->offset;
1036                 tk.line = state->line;
1037                 tk.col = state->col;
1038                 tk.txt.len = 0;
1039         }
1040
1041 ### Managing tokens
1042
1043 The current token is initialized to line up with the first character
1044 that we 'get' for each token.  When we have, or might have, a full
1045 token we can call `close_token` to set the `len` of the token
1046 appropriately.  This can safely be called multiple times.
1047
1048 Finally we occasionally (for single-line strings and block comments)
1049 need to reset to the beginning of the current token as we might have
1050 parsed too much already.  For that there is `reset_token`.
1051
1052 ###### one token
             // Start an empty token at the current read position.
1053         tk.node = state->node;
1054         if (state->node)
1055                 tk.txt.txt = state->node->code.txt + state->offset;
1056         tk.line = state->line;
1057         tk.col = state->col;
1058         tk.txt.len = 0;
1059
1060 ###### internal functions
1061
1062         static void close_token(struct token_state *state, struct token *tk)
1063         {
1064                 // The token runs from its recorded start up to the
1065                 // current read position within the node.
1066                 tk->txt.len = state->offset - (tk->txt.txt - state->node->code.txt);
1067         }
1068
1069         static void reset_token(struct token_state *state, struct token *tok)
1070         {
                     // Load the token's start position into the unget slot and
                     // unget, so that scanning resumes at the token's start.
1071                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1072                 state->prev_col = tok->col;
1073                 state->prev_line = tok->line;
1074                 unget_char(state);
1075                 tok->txt.len = 0;
1076         }
1077
1078
1079 As tokens may not cross into the next `code_node`, and some tokens can
1080 include the newline at the end of a `code_node`, we must be able to
1081 easily check if we have reached the end.  Equally we need to know if
1082 we are at the start of a node, as white space is treated a little
1083 differently there.
1084
1085 ###### internal functions
1086
1087         static int at_son(struct token_state *state)
1088         {
                     // at start-of-node: nothing has been consumed yet
1089                 return !state->offset;
1090         }
1091
1092         static int at_eon(struct token_state *state)
1093         {
1094                 // at end-of-node: no node at all, or all of its text consumed
1095                 if (!state->node)
1096                         return 1;
1097                 return state->node->code.len <= state->offset;
             }
1098
1099 ### Find a known word
1100
1101 As the known-word list is sorted we can use a simple binary search.
1102 Following the pattern established in "mdcode", we will use a `struct
1103 text` with start and length to represent the code fragment we are
1104 searching for.
1105
1106 ###### internal functions
1107         static int find_known(struct token_config *conf, struct text txt)
1108         {
                     /* Binary search of the sorted conf->words_marks[] for an
                      * entry that is exactly 'txt'.  Returns its index, or -1
                      * if there is no such entry.
                      */
1109                 int lo = 0;
1110                 int hi = conf->known_count;
1111
                     /* With no known words there is no entry to probe. */
                     if (hi <= 0)
                             return -1;
1112                 while (lo + 1 < hi) {
1113                         int mid = (lo + hi) / 2;
1114                         int cmp = strncmp(conf->words_marks[mid],
1115                                           txt.txt, txt.len);
                             /* Equal prefix but the known word is longer:
                              * the known word sorts after 'txt'.
                              */
1116                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1117                                 cmp = 1;
1118                         if (cmp <= 0)
1119                                 lo = mid;
1120                         else
1121                                 hi = mid;
1122                 }
                     /* 'lo' is the only remaining candidate; require an exact match. */
1123                 if (strncmp(conf->words_marks[lo],
1124                            txt.txt, txt.len) == 0
1125                     && conf->words_marks[lo][txt.len] == 0)
1126                         return lo;
1127                 else
1128                         return -1;
1129         }
1130
1131 ### Bringing it all together
1132
1133 Now we have all the bits there is just one section missing:  combining
1134 all the token parsing code into one block.
1135
1136 The handling of delayed tokens (Newlines, INs, OUTs) must come
1137 first before we try getting another character.
1138
1139 Then we parse all the text, making sure that we check for known marks
1140 before strings and comments, but unknown marks after strings and comments.
1141
1142 This block of code will either return a token, or will choose to
1143 ignore one, in which case it will `continue` around to the top of the
1144 loop.
1145
1146 ###### one token
1147         ## delayed tokens
1148
1149         ch = get_char(state);
1150
1151         ## white space
1152         ## parse number
1153         ## parse word
1154         ## parse mark
1155
1156 ### Start and stop
1157
1158 As well as getting tokens, we need to be able to create the
1159 `token_state` to start with, and discard it later.
1160
1161 ###### includes
1162         #include <malloc.h>
1163
1164 ###### main functions
1165         struct token_state *token_open(struct code_node *code, struct
1166                                        token_config *conf)
1167         {
                     /* Allocate a zeroed scanner state positioned at the start
                      * of 'code', using 'conf' as the token configuration.
                      * Returns NULL if the state cannot be allocated.  Release
                      * the result with token_close().
                      */
1168                 struct token_state *state = malloc(sizeof(*state));
                     if (!state)
                             return NULL;
1169                 memset(state, 0, sizeof(*state));
1170                 state->node = code;
1171                 state->line = code->line_no;
1172                 state->col = code->indent;
1173                 state->conf = conf;
                     /* Skip the indent of the first line if the node needs it */
1174                 do_strip(state);
1175                 return state;
1176         }
1177         void token_close(struct token_state *state)
1178         {
                     /* Release a state obtained from token_open(). */
1179                 free(state);
1180         }
1181
1182 ###### exported functions
1183         struct token_state *token_open(struct code_node *code, struct
1184                                        token_config *conf);
1185         void token_close(struct token_state *state);
1186
1187 ### Trace tokens
1188
1189 Getting tokens is the main thing but it is also useful to be able to
1190 print out token information, particularly for tracing and testing.
1191
1192 Known tokens are printed verbatim.  Other tokens are printed as
1193 `type(content)` where content is truncated to a given number of characters.
1194
1195 The function for printing a truncated string (`text_dump`) is also exported
1196 so that it can be used for tracing processed strings too.
1197
1198 ###### includes
1199         #include <stdio.h>
1200
1201 ###### exported functions
1202         void token_trace(FILE *f, struct token tok, int max);
1203         void text_dump(FILE *f, struct text t, int max);
1204
1205 ###### main functions
1206
1207         void text_dump(FILE *f, struct text txt, int max)
1208         {
1209                 /* Print at most 'max' characters of 'txt', escaping anything
1210                  * outside ' '..'~' and marking truncation with "..". */
1211                 int limit = (txt.len > max) ? max - 2 : txt.len;
1212                 int pos;
1213
1214                 for (pos = 0; pos < limit; pos++) {
1215                         char ch = txt.txt[pos];
1216                         if (ch == '\\')
1217                                 fprintf(f, "\\\\");
1218                         else if (ch >= ' ' && ch <= '~')
1219                                 fprintf(f, "%c", ch);
1220                         else
1221                                 fprintf(f, "\\x%02x", ch & 0xff);
1222                 }
1223                 if (pos < txt.len)
1224                         fprintf(f, "..");
1225         }
1226
1227         void token_trace(FILE *f, struct token tok, int max)
1228         {
1229                 /* Names for the token types that are not known words or marks */
1230                 static char *type_names[] = {
1231                         [TK_eof] = "eof",
1232                         [TK_error] = "ERROR",
1233                         [TK_newline] = "newline",
1234                         [TK_in] = "in",
1235                         [TK_out] = "out",
1236                         [TK_ident] = "ident",
1237                         [TK_mark] = "mark",
1238                         [TK_number] = "number",
1239                         [TK_string] = "string",
1240                         [TK_multi_string] = "mstring",
1241                         [TK_line_comment] = "lcomment",
1242                         [TK_block_comment] = "bcomment",
1243                 };
1244                 switch (tok.num) {
1245                 case TK_eof:
1246                 case TK_newline:
1247                 case TK_in:
1248                 case TK_out:
1249                         /* No token text included */
1250                         fprintf(f, "%s()", type_names[tok.num]);
1251                         return;
1252                 case TK_error:
1253                 case TK_ident:
1254                 case TK_mark:
1255                 case TK_number:
1256                 case TK_string:
1257                 case TK_multi_string:
1258                 case TK_line_comment:
1259                 case TK_block_comment:
1260                         /* type(content), content truncated by text_dump */
1261                         fprintf(f, "%s(", type_names[tok.num]);
1262                         text_dump(f, tok.txt, max);
1263                         fprintf(f, ")");
1264                         return;
1265                 }
1266                 /* anything else is a known word or mark: print it verbatim */
1267                 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1268         }
1269
1270 ### And there we have it
1271
1272 We now have all the library functions defined for reading and printing
1273 tokens.  Now we just need C files to store them, and a mk file to make them.
1274
1275 ###### File: scanner.h
1276         ## public types
1277         ## exported functions
1278
1279 ###### File: libscanner.c
1280         ## includes
1281         #include "scanner.h"
1282         ## private types
1283         ## internal functions
1284         ## main functions
1285
1286 ###### File: scanner.mk
1287
1288         CFLAGS += -Wall -g
1289         all ::
1290         scanner.mk scanner.h libscanner.c : scanner.mdc
1291                 ./md2c scanner.mdc
1292         all :: libscanner.o
1293         libscanner.o : libscanner.c
1294                 $(CC) $(CFLAGS) -c libscanner.c
1295
1296 ## Processing numbers
1297
1298 Converting a `TK_number` token to a numerical value is a slightly
1299 higher level task than lexical analysis, and slightly lower than
1300 grammar parsing, so put it here - as an index if you like.
1301
1302 Importantly it will be used by the same testing rig that is used for
1303 testing the token scanner.
1304
1305 The numeric value that we will convert all numbers into is the `mpq_t`
1306 from the GNU high precision number library "libgmp".
1307
1308 ###### number includes
1309         #include <gmp.h>
1310         #include "mdcode.h"
1311
1312 Firstly we need to be able to parse a string of digits in a given base
1313 and possibly with a decimal marker.  We store this in an `mpz_t`
1314 integer and report the number of digits after the decimal mark.
1315
1316 On error we return zero and ensure that the 'mpz_t' has been freed, or
1317 had never been initialised.
1318
1319 ###### number functions
1320
1321         static int parse_digits(mpz_t num, struct text tok, int base,
1322                                 int *placesp)
1323         {
1324                 /* Accept digits up to 'base', ignore '_' and
1325                  * ' ' if they appear between two legal digits,
1326                  * and if `placesp` is not NULL, allow a single
1327                  * '.' or ',' and report the number of digits
1328                  * beyond there.
1329                  * Return number of characters processed (p),
1330                  * or 0 if something illegal was found.
                      * 'num' is initialised only once the first digit
                      * has been seen, and is cleared again on the
                      * 'bad' path.
1331                  */
1332                 int p;
1333                 int decimal = -1; // digits after marker
1334                 enum {Digit, Space, Other} prev = Other;
1335                 int digits = 0;
1336
1337                 for (p = 0; p < tok.len; p++) {
1338                         int dig;
                             /* unsigned so that bytes >= 0x80 are valid
                              * arguments for the <ctype.h> tests below
                              */
1339                         unsigned char c = tok.txt[p];
1340
1341                         if (c == '_' || c == ' ') {
1342                                 if (prev != Digit)
1343                                         goto bad;
1344                                 prev = Space;
1345                                 continue;
1346                         }
1347                         if (c == '.' || c == ',') {
1348                                 if (prev != Digit)
1349                                         goto bad;
                                     /* NOTE(review): this path returns one
                                      * less than the number of characters
                                      * before the marker and leaves *placesp
                                      * unset - confirm this is intended.
                                      */
1350                                 if (!placesp || decimal >= 0)
1351                                         return p-1;
1352                                 decimal = 0;
1353                                 prev = Other;
1354                                 continue;
1355                         }
1356                         if (isdigit(c))
1357                                 dig = c - '0';
1358                         else if (isupper(c))
1359                                 dig = 10 + c - 'A';
1360                         else if (islower(c))
1361                                 dig = 10 + c - 'a';
1362                         else
1363                                 dig = base;
1364                         if (dig >= base) {
                                     /* Not a digit in this base: stop here,
                                      * giving back a separator that was not
                                      * followed by a digit.
                                      */
1365                                 if (prev == Space)
1366                                         p--;
1367                                 break;
1368                         }
1369                         prev = Digit;
1370                         if (digits)
1371                                 mpz_mul_ui(num, num, base);
1372                         else
1373                                 mpz_init(num);
1374                         digits += 1;
1375                         mpz_add_ui(num, num, dig);
1376                         if (decimal >= 0)
1377                                 decimal++;
1378                 }
1379                 if (digits == 0)
1380                         return 0;
1381                 if (placesp) {
1382                         if (decimal >= 0)
1383                                 *placesp = decimal;
1384                         else
1385                                 *placesp = 0;
1386                 }
1387                 return p;
1388         bad:
1389                 if (digits)
1390                         mpz_clear(num);
1391                 return 0;
1392         }
1393
1394 ###### number includes
1395         #include <ctype.h>
1396
1397 To parse a full number we need to consider the optional base, the
1398 mantissa, and the optional exponent.  We will treat these one at a
1399 time.
1400
1401 The base is indicated by a letter after a leading zero, which must be
1402 followed by a base letter or a period.  The base also determines the
1403 character which will mark an exponent.
1404
1405 ###### number vars
1406         int base = 10;
1407         char expc = 'e';
1408
1409 ###### parse base
1410
1411         if (tok.txt[0] == '0' && tok.len > 1) {
1412                 int skip = 0;
1413                 switch(tok.txt[1]) {
1414                 case 'x':
1415                 case 'X':
1416                         base = 16;
1417                         skip = 2;
1418                         expc = 'p';
1419                         break;
1420                 case 'o':
1421                 case 'O':
1422                         base = 8;
1423                         skip = 2;
1424                         expc = 'p';
1425                         break;
1426                 case 'b':
1427                 case 'B':
1428                         base = 2;
1429                         skip = 2;
1430                         expc = 'p';
1431                         break;
1432                 case '0':
1433                 case '1':
1434                 case '2':
1435                 case '3':
1436                 case '4':
1437                 case '5':
1438                 case '6':
1439                 case '7':
1440                 case '8':
1441                 case '9':
1442                 case '_':
1443                 case ' ':
1444                         // another digit is not permitted
1445                         // after a zero.
1446                         return 0;
1447                 default:
1448                         // must be decimal marker or trailing
1449                         // letter, which are OK;
1450                         break;
1451                 }
1452                 tok.txt += skip;
1453                 tok.len -= skip;
1454         }
1455
1456 After the base is the mantissa, which may contain a decimal mark, so
1457 we need to record the number of places.  We won't impose the number of
1458 places until we have the exponent as well.
1459
1460 ###### number vars
1461         int places =0;
1462         mpz_t mant;
1463         int d;
1464
1465 ###### parse mantissa
1466
             // Read the mantissa digits (recording the decimal places), step
             // past them, and move the integer value into 'num'.
1467         d = parse_digits(mant, tok, base, &places);
1468         if (d == 0)
1469                 return 0;
1470         tok.txt += d;
1471         tok.len -= d;
1472         mpq_init(num);
1473         mpq_set_z(num, mant);
1474         mpz_clear(mant);
1475
1476 After the mantissa number may come an exponent which may be positive
1477 or negative.  We assume at this point that we have seen the exponent
1478 character `expc`.
1479
1480 ###### number vars
1481         long lexp = 0;
1482         mpz_t exp;
1483         int esign = 1;
1484
1485 ###### parse exponent
             // Optional sign; only taken when something follows it.
1486         if (tok.len > 1) {
1487                 if (tok.txt[0] == '+') {
1488                         tok.txt++;
1489                         tok.len--;
1490                 } else if (tok.txt[0] == '-') {
1491                         esign = -1;
1492                         tok.txt++;
1493                         tok.len--;
1494                 }
1495         }
             // The exponent digits are always decimal, with no decimal mark.
1496         d = parse_digits(exp, tok, 10, NULL);
1497         if (d == 0) {
1498                 mpq_clear(num);
1499                 return 0;
1500         }
             // Reject exponents that do not fit in a long.
1501         if (!mpz_fits_slong_p(exp)) {
1502                 mpq_clear(num);
1503                 mpz_clear(exp);
1504                 return 0;
1505         }
1506         lexp = mpz_get_si(exp) * esign;
1507         mpz_clear(exp);
1508         tok.txt += d;
1509         tok.len -= d;
1510
1511
1512 Now that we have the mantissa and the exponent we can multiply them
1513 together, also allowing for the number of digits after the decimal
1514 mark.
1515
1516 For base 10, we simply subtract the decimal places from the exponent.
1517 For the other bases, as the exponent is always based on 2, even for
1518 octal and hex, we need a bit more detail.
1519 We then recover the sign from the exponent, as division is quite
1520 different from multiplication.
1521
1522 ###### calc exponent
             // Fold the digits after the decimal mark into the exponent: one
             // step per place for bases 10 and 2, four per hex place and
             // three per octal place.
1523         switch (base) {
1524         case 10:
1525         case 2:
1526                 lexp -= places;
1527                 break;
1528         case 16:
1529                 lexp -= 4*places;
1530                 break;
1531         case 8:
1532                 lexp -= 3*places;
1533                 break;
1534         }
             // Split the result back into a magnitude and a sign.
1535         if (lexp < 0) {
1536                 lexp = -lexp;
1537                 esign = -1;
1538         } else
1539                 esign = 1;
1540
1541 Imposing the exponent on the number is also very different for base 10
1542 than for the others.  For the binary shift `gmp` provides a simple
1543 function.  For base 10 we use something like Russian Peasant
1544 Multiplication.
1545
1546 ###### calc exponent
1547         if (expc == 'e') {
                     // Decimal exponent: walk the bits of lexp from the lowest,
                     // squaring 'tens' at each step and multiplying (or
                     // dividing) 'num' by it whenever the bit is set.
1548                 mpq_t tens;
1549                 mpq_init(tens);
1550                 mpq_set_ui(tens, 10, 1);
1551                 while (1) {
1552                         if (lexp & 1) {
1553                                 if (esign > 0)
1554                                         mpq_mul(num, num, tens);
1555                                 else
1556                                         mpq_div(num, num, tens);
1557                         }
1558                         lexp >>= 1;
1559                         if (lexp == 0)
1560                                 break;
1561                         mpq_mul(tens, tens, tens);
1562                 }
1563                 mpq_clear(tens);
1564         } else {
                     // Binary exponent: scale by a power of two directly.
1565                 if (esign > 0)
1566                         mpq_mul_2exp(num, num, lexp);
1567                 else
1568                         mpq_div_2exp(num, num, lexp);
1569         }
1570
1571 Now we are ready to parse a number: the base, mantissa, and exponent.
1572 If all goes well we check for the possible trailing letters and
1573 return.  Return value is 1 for success and 0 for failure.
1574
1575
1576 ###### number functions
1577         int number_parse(mpq_t num, char tail[3], struct text tok)
1578         {
                     /* Convert the text of a number token into 'num'.
                      * Up to two trailing letters are copied, NUL
                      * terminated, into 'tail'.
                      * Returns 1 on success, or 0 on failure, in which
                      * case 'num' is not left initialised.
                      */
1579                 ## number vars
1580                 int i;
1581
1582                 ## parse base
1583                 ## parse mantissa
                     /* An exponent is only parsed when something follows
                      * the exponent character.
                      */
1584                 if (tok.len > 1 && (tok.txt[0] == expc ||
1585                                     tok.txt[0] == toupper(expc))) {
1586                         tok.txt++;
1587                         tok.len--;
1588                         ## parse exponent
1589                 }
1590                 ## calc exponent
1591
1592                 for (i = 0; i < 2; i++) {
1593                         if (tok.len <= i)
1594                                 break;
                             /* cast: isalpha() is undefined for negative
                              * values, which plain char can hold
                              */
1595                         if (!isalpha((unsigned char)tok.txt[i]))
1596                                 goto err;
1597                         tail[i] = tok.txt[i];
1598                 }
1599                 tail[i] = 0;
                     /* Anything left after the tail letters is an error */
1600                 if (i == tok.len)
1601                         return 1;
1602         err:
1603                 mpq_clear(num);
1604                 return 0;
1605         }
1606
1607 Number parsing goes in `libnumber.c`
1608
1609 ###### File: libnumber.c
1610
1611         #include <unistd.h>
1612         #include <stdlib.h>
1613
1614         ## number includes
1615         ## number functions
1616
1617 ###### File: number.h
1618         int number_parse(mpq_t num, char tail[3], struct text tok);
1619
1620 ###### File: scanner.mk
1621         all :: libnumber.o
1622         libnumber.o : libnumber.c
1623                 $(CC) $(CFLAGS) -c libnumber.c
1624
1625 ## Processing strings
1626
1627 Both `TK_string` and `TK_multi_string` require post-processing which
1628 can be one of two types: literal or with escapes processed.
1629 Even literal processing is non-trivial as the file may contain indents
1630 which need to be stripped.
1631
1632 Errors can only occur when processing escapes.  Any unrecognised
1633 character following the escape character will cause an error.
1634
1635 Processing escapes and stripping indents can only make the string
1636 shorter, not longer, so we allocate a buffer which is the same size as
1637 the string and process into that.
1638
1639 To request escape processing, we pass the character we want to use for
1640 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1641
1642 ###### string main
1643         int string_parse(struct token *tok, char escape,
1644                          struct text *str, char tail[3])
1645         {
1646                 ## string vars
1647                 struct text t = tok->txt;
1648
1649                 str->txt = NULL;
1650                 ## strip tail
1651                 if (tok->num == TK_string) {
1652                         ## strip single
1653                 } else {
1654                         ## strip multi
1655                 }
1656                 str->txt = malloc(t.len);
1657                 str->len = 0;
1658
1659                 ## process string
1660                 return 1;
1661         err:
1662                 free(str->txt);
1663                 str->txt = NULL;
1664                 return 0;
1665         }
1666
1667 ### strip tail
1668
1669 The tail of the string can be 0, 1, or 2 letters.
1670
1671         i = t.len;
1672         if (i > 0 && isalpha((unsigned char)t.txt[i-1]))        // i > 0: never read t.txt[-1]
1673                 i -= 1;
1674         if (i > 0 && isalpha((unsigned char)t.txt[i-1]))
1675                 i -= 1;
1676         strncpy(tail, t.txt+i, t.len-i);        // at most two letters are copied
1677         tail[t.len-i] = 0;
1678         t.len = i;                              // the tail is no longer part of the string
1679
1680 ###### string vars
1681         int i;
1682
1683 ### strip single
1684
1685 Stripping the quote of a single-line string is trivial.
1686 The only part that is at all interesting is that the quote character
1687 must be remembered.
1688
1689         if (t.len < 2 || t.txt[t.len-1] != t.txt[0])    // need both an open and a close quote
1690                 goto err;
1691         quote = t.txt[0];                       // remembered for the 'q' escape
1692         t.txt += 1;
1693         t.len -= 2;
1694
1695 ###### string vars
1696         char quote;
1697
1698 ### strip multi
1699
1700 For a multi-line string we have a little more work to do.  We need to
1701 remove 3 quotes, not 1, and need to count the indent of the close
1702 quote as it will need to be stripped from all lines.
1703
1704         quote = t.txt[0];
1705         if (t.len < 7 ||                        // shortest form: 3 quotes, newline, 3 quotes
1706             t.txt[1] != quote || t.txt[2] != quote ||
1707             !is_newline(t.txt[3]))
1708                 goto err;
1709         t.txt += 4;                             // skip the opening quotes and the newline
1710         t.len -= 4;
1711         i = t.len;
1712         if (i <= 0 || t.txt[i-1] != quote)      // the last three bytes must all be quotes
1713                 goto err;
1714         i -= 1;
1715         if (i <= 0 || t.txt[i-1] != quote)
1716                 goto err;
1717         i -= 1;
1718         if (i <= 0 || t.txt[i-1] != quote)
1719                 goto err;
1720         i -= 1;
1721         t.len = i;                              // drop the closing quotes
1722         while (i > 0 && !is_newline(t.txt[i-1]))        // back up to the start of the last line
1723                 i--;
1724         indent = 0;
1725         while (i < t.len) {                     // measure what precedes the closing quotes
1726                 if (t.txt[i] == ' ')
1727                         indent += 1;
1728                 if (t.txt[i] == '\t')
1729                         indent = indent_tab(indent);    // helper defined outside this section
1730                 i++;                            // any other byte is skipped without counting
1731         }
1732
1733 ###### string vars
1734         int indent = 0;
1735
1736 ### process string
1737
1738 Now we just take one byte at a time.  Trans-ASCII Unicode won't look
1739 like anything we are interested in so it will just be copied byte by
1740 byte.
1741
1742         cp = str->txt;                          // output cursor into the result buffer
1743         at_sol = 1;                             // set while positioned at the start of a line
1744         for (i = 0; i < t.len; i++) {
1745                 char c;
1746                 if (at_sol) {
1747                         at_sol = 0;
1748                         ## strip indent
1749                         if (i >= t.len)         // indent stripping may have used up the input
1750                                 break;
1751                 }
1752                 c = t.txt[i];
1753                 if (c != escape) {              // ordinary byte: copy it through unchanged
1754                         *cp = c;
1755                         cp += 1;
1756                         if (is_newline(c))
1757                                 at_sol = 1;
1758                 } else if (i+1 >= t.len) {
1759                         // escape and end of string
1760                         goto err;
1761                 } else {
1762                         i += 1;                 // step onto the byte after the escape character
1763                         c = t.txt[i];
1764                         ## parse escape
1765                 }
1766         }
1767         str->len = cp - str->txt;               // number of bytes actually written
1768
1769 ###### string vars
1770         char *cp;
1771         int at_sol;
1772
1773 ### strip indent
1774
1775 Every time we find a start of line, we strip spaces and tabs until the
1776 required indent is found.
1777
1778         int skipped = 0;                        // indent width consumed so far on this line
1779         while (i < t.len && skipped < indent) {
1780                 c = t.txt[i];
1781                 if (c == ' ')
1782                         skipped += 1;
1783                 else if (c == '\t')
1784                         skipped = indent_tab(skipped);
1785                 else
1786                         break;                  // non-blank byte before the full indent was skipped
1787                 i+= 1;
1788         }
1789
1790 ### parse escape
1791         switch (c) {                            // c is the byte following the escape character
1792         case 'n':
1793                 *cp++ = '\n'; break;
1794         case 'r':
1795                 *cp++ = '\r'; break;
1796         case 't':
1797                 *cp++ = '\t'; break;
1798         case 'b':
1799                 *cp++ = '\b'; break;
1800         case 'q':
1801                 *cp++ = quote; break;           // the quote character that opened the string
1802         case 'f':
1803                 *cp++ = '\f'; break;
1804         case 'v':
1805                 *cp++ = '\v'; break;
1806         case 'a':
1807                 *cp++ = '\a'; break;
1808         case '0':
1809         case '1':
1810         case '2':
1811         case '3':
1812                 // 3 digit octal number; a first digit of at most 3 keeps it within a byte
1813                 if (i+2 >= t.len)
1814                         goto err;
1815                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1816                     t.txt[i+2] < '0' || t.txt[i+2] > '7')       // check both bounds of each digit
1817                         goto err;
1818                 n = (t.txt[i  ]-'0') * 64 +
1819                     (t.txt[i+1]-'0') *  8 +
1820                     (t.txt[i+2]-'0') *  1;
1821                 *cp++ = n;
1822                 i += 2;                         // the loop's own i++ consumes the third digit
1823                 break;
1824         case 'x':
1825                 // 2 hex digits
1826                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1827                 if (n < 0)
1828                         goto err;
1829                 *cp++ = n;
1830                 i += 2;
1831                 break;
1832         case 'u':
1833         case 'U':
1834                 // 4 or 8 hex digits for unicode
1835                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1836                 if (n < 0)
1837                         goto err;
1838                 memset(&pstate, 0, sizeof(pstate));     // start from the initial conversion state
1839                 n = wcrtomb(cp, n, &pstate);            // encode using the current locale
1840                 if (n <= 0)
1841                         goto err;
1842                 cp += n;
1843                 i += c == 'u' ? 4 : 8;
1844                 break;
1845         default:
1846                 if (c == escape)                // doubled escape yields one literal escape
1847                         *cp++ = c;
1848                 else if (is_newline(c))         // escaped newline: emit nothing, new line starts
1849                         at_sol = 1;
1850                 else
1851                         goto err;
1852         }
1853
1854 ###### string vars
1855         long n;
1856         mbstate_t pstate;
1857
1858 For `\x`, `\u`, and `\U` we need to collect a specific number of
1859 hexadecimal digits.
1860
1861 ###### string functions
1862
1863         static long take_hex(int digits, char *cp, int l)
1864         {       // Read exactly `digits` hex digits from the `l` bytes at cp; -1 on failure.
1865                 long n = 0;                     // value accumulated so far
1866                 if (l < digits)                 // not enough input left
1867                         return -1;
1868                 while (digits > 0) {            // "> 0" so a negative count cannot loop forever
1869                         unsigned char c = *cp;  // ctype functions require unsigned char values
1870                         int d;
1871                         if (!isxdigit(c))
1872                                 return -1;
1873                         if (isdigit(c))
1874                                 d = c - '0';
1875                         else if (isupper(c))
1876                                 d = 10 + c - 'A';
1877                         else
1878                                 d = 10 + c - 'a';
1879                         n = n * 16 + d;
1880                         digits--;
1881                         cp++;
1882                 }
1883                 return n;
1884         }
1885
1886 #### File: libstring.c
1887
1888 String parsing goes in `libstring.c`
1889
1890         #include <unistd.h>
1891         #include <stdlib.h>
1892         #include <stdio.h>
1893         #include <string.h>
1894         #include <ctype.h>
1895         #include <wchar.h>
1896         #include "mdcode.h"
1897         #include "scanner.h"
1898         ## string functions
1899         ## string main
1900
1901 ###### File: string.h
1902         int string_parse(struct token *tok, char escape,
1903                          struct text *str, char tail[3]);
1904
1905 ###### File: scanner.mk
1906         all :: libstring.o
1907         libstring.o : libstring.c
1908                 $(CC) $(CFLAGS) -c libstring.c
1909
1910
1911 ## Testing
1912
1913 As "untested code is buggy code" we need a program to easily test
1914 the scanner library.  This will simply parse a given file and report
1915 the tokens one per line.
1916
1917 ###### File: scanner.c
1918
1919         #include <unistd.h>
1920         #include <stdlib.h>
1921         #include <fcntl.h>
1922         #include <errno.h>
1923         #include <sys/mman.h>
1924         #include <string.h>
1925         #include <stdio.h>
1926         #include <gmp.h>
1927         #include <locale.h>
1928         #include "mdcode.h"
1929         #include "scanner.h"
1930         #include "number.h"
1931         #include "string.h"
1932
1933         static int errs;
1934         static void pr_err(char *msg)           // error callback handed to code_extract()
1935         {
1936                 errs++;                         // non-zero errs makes main() exit with status 1
1937                 fprintf(stderr, "%s\n", msg);
1938         }
1939
1940         int main(int argc, char *argv[])
1941         {
1942                 int fd, len;
1943                 char *file;                     // the whole input file, mapped read-only
1944                 struct token_state *state;
1945                 const char *known[] = {
1946                         "==",
1947                         "else",
1948                         "if",
1949                         "then",
1950                         "while",
1951                         "{",
1952                         "}",
1953                 };
1954                 struct token_config conf = {
1955                         .word_start = "_$",
1956                         .word_cont = "",
1957                         .words_marks = known,
1958                         .number_chars = "., _+-",
1959                         .known_count = sizeof(known)/sizeof(known[0]),
1960                         .ignored = (0 << TK_line_comment)
1961                                   |(0 << TK_block_comment),
1962                 };
1963                 struct section *table, *s, *prev;
1964                 setlocale(LC_ALL,"");
1965                 if (argc != 2) {
1966                         fprintf(stderr, "Usage: scanner file\n");
1967                         exit(2);
1968                 }
1969                 fd = open(argv[1], O_RDONLY);
1970                 if (fd < 0) {
1971                         fprintf(stderr, "scanner: cannot open %s: %s\n",
1972                                 argv[1], strerror(errno));
1973                         exit(1);
1974                 }
1975                 len = lseek(fd, 0, SEEK_END);   // file size
1976                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1977                 if (len < 0 || (len > 0 && file == MAP_FAILED)) {      // empty file still allowed
1978                         perror(argv[1]);
1979                         exit(1);
1980                 }
1981                 table = code_extract(file, file+len, pr_err);
1982                 for (s = table; s;
1983                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
1984                         printf("Tokenizing: %.*s\n", s->section.len,
1985                                 s->section.txt);
1986                         state = token_open(s->code, &conf);
1987                         while(1) {
1988                                 struct token tk = token_next(state);
1989                                 printf("%d:%d ", tk.line, tk.col);
1990                                 token_trace(stdout, tk, 20);
1991                                 if (tk.num == TK_number) {
1992                                         mpq_t num;
1993                                         char tail[3];
1994                                         if (number_parse(num, tail,tk.txt)) {
1995                                                 printf(" %s ", tail);
1996                                                 mpq_out_str(stdout, 10, num);
1997                                                 mpq_clear(num);
1998                                         } else
1999                                                 printf(" BAD NUMBER");
2000                                 }
2001                                 if (tk.num == TK_string ||
2002                                     tk.num == TK_multi_string) {
2003                                         char esc = tk.txt.txt[0] == '`' ? 0 : '\\';  // 0: no escapes
2004                                         struct text str;
2005                                         char tail[3];
2006                                         if (string_parse(&tk, esc,
2007                                                          &str, tail)) {
2008                                                 printf(" %s ", tail);
2009                                                 text_dump(stdout, str, 20);
2010                                                 free(str.txt);
2011                                         } else
2012                                                 printf(" BAD STRING");
2013                                 }
2014                                 printf("\n");
2015                                 if (tk.num == TK_error)
2016                                         errs = 1;
2017                                 if (tk.num == TK_eof)
2018                                         break;
2019                         }
2020                 }
2021                 exit(!!errs);
2022         }
2023 ###### File: scanner.mk
2024         scanner.c : scanner.mdc
2025                 ./md2c scanner.mdc
2026         all :: scanner
2027         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h libnumber.o libstring.o
2028                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2029                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2030         scanner.o : scanner.c
2031                 $(CC) $(CFLAGS) -c scanner.c