5 The main task of the lexical scanner is to convert a stream of
6 characters into a stream of tokens. The tokens are then typically
7 used by a parser to extract the syntactic structure.
9 The stream of characters are assumed to be in memory identified by a
10 linked list of blocks, such as provided by the "[mdcode][]" literate
11 program extractor. A single token may never cross a block boundary.
18 The text is assumed to be UTF-8 though some matching assumes the
19 ASCII subset. If the text provided does not conform to UTF-8 an error
20 will be reported and some number of bytes will be skipped.
25 #include <unicode/uchar.h>
27 Tokens are returned by successive calls to the main interface
28 function: `token_next()` which has a `state` structure to keep track
29 of where it is up to. Each token carries not just a numeric
30 identifier but also the code block, the line and character within that
31 block, and the actual start and length using the `struct text` from
37 struct code_node *node;
48 ###### exported functions
49 struct token token_next(struct token_state *state);
52 struct token token_next(struct token_state *state)
63 The `line` and `col` offsets are useful for reporting errors.
64 The `txt` provides the content when that is important.
66 ### Token types and configuration ##
68 The scanner is not completely general, yet not completely specified.
69 There are a fixed set of token types, though particular tokens within
70 those types can be distinguished via configuration.
72 Most token types may be explicitly ignored, as typically comments
73 would be. The exact consequence of ignoring each token type varies
78 int ignored; // bit set of ignored tokens.
79 ## token config parameters
83 struct token_config *conf;
85 ###### token_next init
86 int ignored = state->conf->ignored;
88 The different tokens are numbers, words, marks, strings, comments,
89 newlines, EOF, and indents, each of which is examined in detail below.
91 There are various cases where no token can be found in part of the
92 input. All of these will be reported as a `TK_error` token.
94 It is possible to declare a number of strings which form distinct
95 tokens (rather than being grouped as e.g. 'word'). These are given
96 token numbers from `TK_reserved` upwards.
107 Numbers are the messiest tokens to parse, primarily because they can
108 contain characters that also have meaning outside of numbers and,
109 particularly, immediately after numbers.
111 The obvious example is the '`-`' sign. It can come inside a number for
112 a negative exponent, or after a number as a subtraction operator. To
113 be sure we have parsed as best as possible we need to only allow the
114 '`-`' inside a number if it is after an exponent character. This can be
115 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
116 digit, so we don't allow '`-`' after just any `e`.
118 To make matters worse, our language designer has decided to experiment
119 with allowing commas to be used as the decimal indicator, and spaces
120 to be used to separate groups of digits in large numbers. Both of
121 these can reasonably be restricted to appear between two digits, so we
122 have to add that condition to our tests.
124 So we cannot just treat numbers as starting with a digit and being
125 followed by some set of characters. We need more structure than that.
129 - Numbers must start with a digit.
130 - If the first digit is zero, the next character must be a base
131 signifier (one of `xob`) or a decimal marker (`.` or `,`).
132 In the first case the first `p` or `P` may be followed by a sign.
133 - If the number doesn't start with `0` followed by one of `xob`, the
134 first `e` may be followed by a sign.
135 - Any digit or hex digit may be followed by a space or underscore
136 providing that the subsequent character is also a (hex) digit.
137 This rule will require an extra level of 'unget' to be
138 supported when handling characters.
139 - Otherwise any digits or ASCII letters are allowed. We do not at
140 this point check that the digits given are permitted by the base.
141 That will happen when the token is converted to a number.
143 To allow easy configuration, the various non alphanumeric characters
144 are only permitted if they are listed in a configuration parameter.
146 ###### token config parameters
149 Note that numbers may not start with a period, so `.75` is not a
150 number. This is not the norm, but is not unheard of. Excluding these
151 numbers simplifies the rule at very little cost.
156 If TK_number is ignored, digits will result in an error unless they
157 are declared to be a start character for words.
165 if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
166 int prev_special = 0;
168 int decimal_mark = 0;
170 wchar_t ch2 = get_char(state);
171 if (strchr("xobXOB", ch2) != NULL)
179 if (ch == 'e' || ch == 'E')
183 if (ch == 'p' || ch == 'P')
187 save_unget_state(state);
188 ch = get_char(state);
193 if (ch == '+' || ch == '-') {
198 if (ch == '.' || ch == ',') {
204 /* Don't allow that special char,
207 restore_unget_state(state);
210 if (strchr(state->conf->number_chars, ch)) {
214 /* non-number char */
217 /* We seem to have a "number" token */
219 close_token(state, &tk);
225 Words start with a "start" character followed by the longest
226 sequence of "continue" characters. The Unicode ID_START and
227 ID_CONTINUE sets are always permitted, but other ASCII characters
228 can be added to these sets.
230 ###### token config parameters
234 ###### internal functions
235 static int is_word_start(wchar_t ch, struct token_config *conf)
237 return iswalpha(ch) ||
238 strchr(conf->word_start, ch) != NULL ||
239 u_hasBinaryProperty(ch, UCHAR_ID_START);
242 static int is_word_continue(wchar_t ch, struct token_config *conf)
244 return iswalnum(ch) ||
245 strchr(conf->word_cont, ch) != NULL ||
246 u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
249 Words can be either known or unknown. Known words are referred to as
250 "reserved words" and get a unique token number. Unknown words are
251 "identifiers" and are syntactically a single token.
256 A list of known words must be provided. This list is shared with the
257 "marks" which are described next. The list must be lexically sorted
258 and the length of the list must be given (`known_count`).
259 Tokens matching these known words are reported as the index of the
260 list added to `TK_reserved`.
262 If identifiers are ignored, then any word which is not listed as a
263 known word results in an error.
265 ###### token config parameters
266 const char **words_marks;
271 if (is_word_start(ch, state->conf)) {
273 /* A word: identifier or reserved */
275 ch = get_char(state);
276 while (is_word_continue(ch, state->conf));
278 close_token(state, &tk);
280 if (ignored & (1<<TK_ident))
282 n = find_known(state->conf, tk.txt);
284 tk.num = TK_reserved + n;
290 Marks are generally one or more punctuation marks joined together. It
291 would be nice to use the term "symbol" for these, but that causes
292 confusion in a subsequent discussion of the grammar, which has terminal
293 symbols and non-terminal symbols which are conceptually quite
294 different. So strings of punctuation characters will be marks.
296 A "mark" consists of ASCII characters that are not white space, are not
297 "start" characters for words, and are not digits.
298 These will collectively be called mark characters.
300 ###### internal functions
301 static int is_mark(wchar_t ch, struct token_config *conf)
306 strchr(conf->word_start, ch) == NULL;
309 As with words, there can be known and unknown marks, though the rules
310 are slightly different.
312 Two marks do not need to be separated by a non-mark character. This
313 is different from words which do need to be separated by at least one
314 non-continue character.
316 The scanner will normally prefer longer sequences of mark characters,
317 but will more strongly prefer known marks over unknown marks. So if
318 it finds a known mark where adding one more character does not result
319 in a known mark, it will return that first known mark.
321 If no known mark is found we will test against strings and comments
322 below before giving up and assuming an unknown mark.
324 If an unknown mark contains a quote character or a comment marker, and
325 that token is not being ignored, then we terminate the unknown mark
326 before that quote or comment. This ensures that an unknown mark
327 immediately before a string is handled correctly.
329 If the first character of a comment marker (i.e. '/') is a known mark,
330 the above rules would suggest that the start of a comment would be
331 parsed as that mark, which is not what is wanted. So the introductory
332 sequences for a comment ("//" and "/*") are treated as
333 partially-known. They prevent the leading "/" from being a mark by
334 itself, but do not actually constitute a stand-alone mark.
336 If `TK_mark` is ignored, then unknown marks are returned as errors.
341 Known marks are included in the same list as the list of known words.
345 while (is_mark(ch, state->conf)) {
348 close_token(state, &tk);
349 n = find_known(state->conf, tk.txt);
351 tk.num = TK_reserved + n;
352 else if (tk.num != TK_error) {
353 /* found a longest-known-mark, still need to
356 if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
357 (ch == '/' || ch == '*')) {
358 /* Yes, this is a comment, not a '/' */
359 restore_unget_state(state);
364 close_token(state, &tk);
368 save_unget_state(state);
369 ch = get_char(state);
370 if (!(ignored & (1<<TK_string)) && n < 0 &&is_quote(ch) && !is_quote(prev))
371 /* If strings are allowed, a quote (which isn't a known mark)
372 * mustn't be treated as part of an unknown mark. It can be
373 * part of a multi-line string though.
376 if (prev == '#' && n < 0)
377 /* '#' is not a known mark, so assume it is a comment */
379 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
380 close_token(state, &tk);
381 restore_unget_state(state);
384 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
385 close_token(state, &tk);
386 restore_unget_state(state);
391 if (tk.num != TK_error) {
392 close_token(state, &tk);
396 If we don't find a known mark, we will check for strings and comments
397 before assuming that we have an unknown mark
406 Strings start with one of single quote, double quote, or back quote
407 and continue until a matching character on the same line. Any of
408 these characters can be included in the list of known marks and then
409 they will not be used for identifying strings.
411 Immediately following the close quote, one or two ASCII letters may
412 appear. These are somewhat like the arbitrary letters allowed in
413 "Numbers" above. They can be used by the language in various ways.
415 If 3 identical quote characters appear in a row and are
416 followed by a newline, then this forms a multi-line string which
417 continues until an identical triple quote appears on a line preceded
418 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
420 Multi-line strings may not extend beyond the end of the `code_node` in
423 Normal strings and multi-line strings are encoded as two different
430 ###### internal functions
431 static int is_quote(wchar_t ch)
433 return ch == '\'' || ch == '"' || ch == '`'; // "
436 #### Multi-line strings
438 The multi-line string is checked for first. If they are being
439 ignored, we fall through and treat a triple quote as an empty string
440 followed by the start of a new string.
443 if (tk.txt.len == 3 &&
444 !(ignored & (1 << TK_multi_string)) &&
445 is_quote(tk.txt.txt[0]) &&
446 memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
447 is_newline(tk.txt.txt[3])) {
449 wchar_t first = tk.txt.txt[0];
452 while (!at_eon(state) && qseen < 3) {
453 ch = get_char(state);
454 if (is_newline(ch)) {
457 } else if (at_sol && ch == first) {
459 } else if (ch != ' ' && ch != '\t') {
465 /* Hit end of node - error.
466 * unget so the newline is seen,
467 * but return rest of string as an error.
471 close_token(state, &tk);
475 /* 2 letters are allowed */
476 ch = get_char(state);
478 ch = get_char(state);
480 ch = get_char(state);
481 /* Now we must have a newline, but we don't return it
484 close_token(state, &tk);
485 tk.num = TK_multi_string;
491 #### Single-line strings
493 The sequence of marks collected may be more than a single-line
494 string, so we reset to the start and collect characters until
495 we find a close quote or a newline.
497 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
500 if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
501 !(ignored & (1<<TK_string))) {
502 wchar_t first = tk.txt.txt[0];
503 reset_token(state, &tk);
504 ch = get_char(state);
506 while (!at_eon(state) && !is_newline(ch)) {
507 ch = get_char(state);
512 if (is_newline(ch)) {
517 while (!at_eon(state) && (ch = get_char(state)) &&
521 close_token(state, &tk);
527 Single line comments may start with '`//`' or '`#`' providing that these
528 are not known marks. They continue to the end of the line.
530 Block comments start with '`/*`' if this is not a known mark. They
531 continue to the first occurrence of '`*/`' and may not contain any
532 occurrence of '`/*`'.
534 Block comments can be wholly within one line or can continue over
535 multiple lines. The multi-line version should be followed immediately
536 by a newline. The Linux kernel contains over 285000 multi-line
537 comments and only 34 are followed by characters other than white space
538 (which should be removed) or a backslash (only needed in macros). So
539 it would not suffer from this rule.
541 These two comment types are reported as two separate token types, and
542 consequently can be ignored separately. When ignored a comment is
543 still parsed, but is discarded.
549 ###### internal functions
550 static int is_line_comment(struct text txt)
552 return (txt.len >= 1 && txt.txt[0] == '#') ||
553 (txt.len >= 2 && txt.txt[0] == '/' &&
557 static int is_block_comment(struct text txt)
559 return txt.len >= 2 && txt.txt[0] == '/' &&
563 #### Single line comments
565 A single-line comment continues up to, but not including the newline
570 if (is_line_comment(tk.txt)) {
571 while (!is_newline(ch) && !at_eon(state))
572 ch = get_char(state);
575 close_token(state, &tk);
576 tk.num = TK_line_comment;
577 if (ignored & (1 << TK_line_comment))
584 The token text collected so far could exceed the comment, so we need
587 If we find an embedded `/*` we reset to just before the '/' and report
588 an error. That way the next thing to be parsed will be the rest of
589 the comment. This requires a double unget, so we need to save/restore
590 the unget state (explained later).
594 if (is_block_comment(tk.txt)) {
597 reset_token(state, &tk);
600 save_unget_state(state);
601 ch = get_char(state);
603 while (!at_eon(state) &&
604 (prev != '/' || ch != '*') &&
605 (prev != '*' || ch != '/')) {
609 save_unget_state(state);
610 ch = get_char(state);
612 close_token(state, &tk);
618 /* embedded. Need to unget twice! */
619 restore_unget_state(state);
624 tk.num = TK_block_comment;
625 if (newlines && !(ignored & (1<<TK_newline))) {
626 /* next char must be newline */
627 ch = get_char(state);
632 if (tk.num == TK_error ||
633 !(ignored & (1 << TK_block_comment)))
638 ### Indents, Newlines, and White Space.
640 Normally white space is ignored. However newlines can be important as
641 can indents, which are either after a newline or at the start of a
642 node (detected by `at_son()`).
644 ###### exported functions
645 static inline int is_newline(wchar_t ch)
647 return ch == '\n' || ch == '\f' || ch == '\v';
651 if (ch <= ' ' && !is_newline(ch)
655 If a line starts with more white-space than the previous non-blank
656 line - or if the first non-blank line in the document starts with any
657 white-space - then an "IN" is reported at the start of the line.
659 Before the next non-blank line which starts with less white space, or
660 at the latest at the end of the document, a matching "OUT" token
661 is reported. There will always be an exact match between "IN" and
664 It is possible for "OUT" to be followed (almost) immediately by an
665 "IN". This happens if, for example, the indent of three consecutive
666 lines are 0, 8, 4 spaces. Before the second line we report an
667 "IN". Before the third line we must report an "OUT", as 4 is less
668 than 8, then also an IN as 4 is greater than 0.
674 For the purpose of measuring the length of white space, a tab adds at
675 least one space, and rounds up to a multiple of 8.
677 ###### exported functions
678 static inline int indent_tab(int indent)
683 We need to track the current levels of indent. This requires some
684 sort of stack as indent levels are pushed on and popped off. In
685 practice this stack is unlikely to often exceed 5 so we will use a
686 fixed stack of 20 indent levels. More than this will be silently
691 int indent_sizes[20];
695 Newlines can optionally be reported. Newlines within a block comment
696 or a multi-line string are not reported separately, but each of these
697 must be followed immediately by a newline so these constructs cannot
698 hide the fact that a newline was present.
700 When indents are being reported, the Newline which would normally be
701 reported immediately before the "IN" is delayed until after the
702 matching "OUT". This makes an indented section act like a
703 continuation of the previous line to some extent.
705 A blank line would normally be reported simply as two consecutive Newline
706 tokens. However if the subsequent line is indented (and indents are being
707 reported) then the right thing to do is less obvious as Newlines should be
708 delayed - but how many Newlines?
710 The approach we will take is to report the extra Newlines immediately after
711 the IN token, so the blank line is treated as though it were an indented
717 If we find a newline or white space at the start of a block, we keep
718 collecting spaces, tabs, and newlines until we find some real text.
719 Then depending on the indent we generate some number of tokens. These
720 will be a sequence of "Newline OUT" pairs representing a decrease
721 in indent, then either a Newline or an IN depending on whether the
722 next line is indented, then zero or more Newlines representing all the
723 blank lines that have been skipped.
725 When a Newline leads to the next block of code there is a question of
726 whether the various Newline and OUT/IN tokens should appear to
727 belong to the earlier or later block. This is addressed by processing
728 the tokens in two stages based on the relative indent levels of the
729 two blocks (each block has a base indent to which the actual indents
732 Any "Newline OUT" pairs needed to reduce the current indent to the
733 maximum of the base indents of the old and new blocks are generated
734 against the old block. Then if the next block does not have an
735 increased indent, one more "Newline" is generated.
737 If further "Newline OUT" pairs are needed to get to the indent
738 level of the 'next' block, they are generated against that block,
739 though the first Newline is suppressed (it having already been
742 Finally the Newline or IN for the first line of the new block is
743 generated, unless the Newline needs to be suppressed because it
744 appeared at the end of the previous block.
746 This means that a block may start with an OUT or an IN, but
747 will only start with a Newline if it actually starts with a blank
750 We will need to represent in the `token_state` where in this sequence
751 of delayed tokens we are. As `state.col` records the target indent we
752 don't need to record how many OUTs or INs are needed. We do
753 need to record the number of blank lines, and which of Newline and
754 OUT is needed next in the initial sequence of pairs.
756 For this we store one more than the number of blank lines as
757 `delayed_lines` and a flag for `out_next`.
764 Generating these tokens involve two separate pieces of code.
766 Firstly we need to recognise white space and count the indents and
767 newlines. These are recorded in the above state fields.
769 Separately, on each call to `token_next`, we need to check if
770 there are some delayed tokens and if so we need to advance the state
771 information and return one token.
774 if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
776 int was_son = at_son(state);
777 if (ignored & (1<<TK_in)) {
780 if (ignored & (1<<TK_newline))
783 close_token(state, &tk);
786 // Indents are needed, so check all white space.
787 while (ch <= ' ' && !at_eon(state)) {
790 ch = get_char(state);
794 if (state->node->next &&
795 state->node->next->indent > state->node->indent)
796 state->col = state->node->next->indent;
798 state->col = state->node->indent;
801 state->delayed_lines = newlines;
802 state->out_next = was_son;
803 state->check_indent = 1;
807 ###### delayed tokens
809 if (state->check_indent || state->delayed_lines) {
810 if (state->col < state->indent_sizes[state->indent_level]) {
811 if (!state->out_next &&
812 !(ignored & (1<<TK_newline))) {
817 state->indent_level -= 1;
822 if (state->col > state->indent_sizes[state->indent_level] &&
823 state->indent_level < sizeof(state->indent_sizes)-1) {
824 state->indent_level += 1;
825 state->indent_sizes[state->indent_level] = state->col;
826 state->delayed_lines -= 1;
830 state->check_indent = 0;
831 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
833 state->delayed_lines -= 1;
836 state->delayed_lines = 0;
842 After the last newline in the file has been processed, a special
843 end-of-file token will be returned. Any further attempts to get more
844 tokens will continue to return the same end-of-file token.
853 state->check_indent = 1;
860 ### Unknown Marks, or errors.
862 We have now handled all the possible known mark-like tokens.
863 If the token we have is not empty and `TK_mark` is allowed,
864 we have an unknown mark, otherwise this must be an error.
868 /* one unknown mark character */
870 close_token(state, &tk);
871 if (ignored & (1<<TK_mark))
877 /* Completely unrecognised character is next, possibly
878 * a digit and we are ignoring numbers.
879 * What ever it is, make it an error.
882 close_token(state, &tk);
886 ## Tools For The Task
888 You may have noticed that there are a few gaps we left in the above -
889 functions used without first defining them. Doing so above would have
892 ### Character by character
894 As we walk through the various `code_node`s we need to process whole
895 Unicode codepoints, and keep track of which line and column we are on.
896 We will assume for now that any printing character uses one column,
897 though that is not true in general.
899 As the text in a `code_node` may include an indent that identifies it as
900 being code, we need to be careful to strip that. The `code_node` has
901 a flag that tells us whether or not we need to strip.
907 struct code_node *node;
912 ###### internal functions
914 static int do_strip(struct token_state *state)
917 if (state->node->needs_strip) {
919 while (n && state->node->code.txt[state->offset] == ' ') {
924 while (n == 4 && state->node->code.txt[state->offset] == '\t') {
925 indent = indent_tab(indent);
933 static wint_t get_char(struct token_state *state)
939 if (state->node == NULL)
941 if (state->node->code.len <= state->offset) {
943 state->node = state->node->next;
944 while (state->node && state->node->code.txt == NULL);
946 if (state->node == NULL)
948 state->line = state->node->line_no;
949 state->col = do_strip(state);
954 memset(&mbstate, 0, sizeof(mbstate));
956 n = mbrtowc(&next, state->node->code.txt + state->offset,
957 state->node->code.len - state->offset,
959 if (n == -2 || n == 0) {
960 /* Not enough bytes - not really possible */
962 state->offset = state->node->code.len;
963 } else if (n == -1) {
966 next = 0x7f; // an illegal character
972 } else if (is_newline(next)) {
974 state->col = do_strip(state);
975 } else if (next == '\t') {
976 state->col = indent_tab(state->col);
981 We will sometimes want to "unget" the last character as it needs to be
982 considered again as part of the next token. So we need to store a
983 'previous' version of all metadata.
990 ###### before get_char
991 state->prev_offset = state->offset;
992 state->prev_line = state->line;
993 state->prev_col = state->col;
995 ###### internal functions
997 static void unget_char(struct token_state *state)
1000 state->offset = state->prev_offset;
1001 state->line = state->prev_line;
1002 state->col = state->prev_col;
1006 We occasionally need a double-unget, particularly for numbers and
1007 block comments. We don't impose this cost on all scanning, but
1008 require those code sections that need it to call `save_unget_state`
1009 before each `get_char`, and then `restore_unget_state` when a
1010 double-unget is needed.
1017 ###### internal functions
1018 static void save_unget_state(struct token_state *state)
1020 state->prev_offset2 = state->prev_offset;
1021 state->prev_line2 = state->prev_line;
1022 state->prev_col2 = state->prev_col;
1025 static void restore_unget_state(struct token_state *state)
1027 state->prev_offset = state->prev_offset2;
1028 state->prev_line = state->prev_line2;
1029 state->prev_col = state->prev_col2;
1032 At the start of a token we don't want to be at the end of a code block
1033 if we can help it. To avoid this possibility, we 'get' and 'unget' a
1034 single character. This will move into the next non-empty code block
1035 and leave the current pointer at the start of it.
1037 This has to happen _after_ dealing with delayed tokens as some of them
1038 must appear in the previous node. When we do this, we need to reset
1039 the data in the token.
1041 ###### delayed tokens
1042 if (at_eon(state)) {
1045 tk.node = state->node;
1047 tk.txt.txt = state->node->code.txt + state->offset;
1048 tk.line = state->line;
1049 tk.col = state->col;
1055 The current token is initialized to line up with the first character
1056 that we 'get' for each token. When we have, or might have, a full
1057 token we can call `close_token` to set the `len` of the token
1058 appropriately. This can safely be called multiple times.
1060 Finally we occasionally (for single-line strings and block comments)
1061 need to reset to the beginning of the current token as we might have
1062 parsed too much already. For that there is `reset_token`.
1065 tk.node = state->node;
1067 tk.txt.txt = state->node->code.txt + state->offset;
1068 tk.line = state->line;
1069 tk.col = state->col;
1072 ###### internal functions
1074 static void close_token(struct token_state *state,
1077 if (state->node != tk->node)
1078 tk->txt.len = tk->node->code.len - (tk->txt.txt - tk->node->code.txt);
1080 tk->txt.len = (state->node->code.txt + state->offset)
1084 static void reset_token(struct token_state *state, struct token *tok)
1086 state->prev_line = tok->line;
1087 state->prev_col = tok->col;
1088 state->prev_offset = tok->txt.txt - state->node->code.txt;
1093 Tokens may not cross into the next `code_node`, and some tokens can
1094 include the newline at the end of a `code_node`, so we must be able to
1095 easily check if we have reached the end. Equally we need to know if
1096 we are at the start of a node, as white space is treated a little
1099 ###### internal functions
1101 static int at_son(struct token_state *state)
1103 return state->offset == 0;
1106 static int at_eon(struct token_state *state)
1108 // at end-of-node ??
1109 return state->node == NULL ||
1110 state->offset >= state->node->code.len;
1113 ### Find a known word
1115 As the known-word list is sorted we can use a simple binary search.
1116 Following the pattern established in "mdcode", we will use a `struct
1117 text` with start and length to represent the code fragment we are
1120 ###### internal functions
1121 static int find_known(struct token_config *conf, struct text txt)
1124 int hi = conf->known_count;
1126 while (lo + 1 < hi) {
1127 int mid = (lo + hi) / 2;
1128 int cmp = strncmp(conf->words_marks[mid],
1130 if (cmp == 0 && conf->words_marks[mid][txt.len])
1137 if (strncmp(conf->words_marks[lo],
1138 txt.txt, txt.len) == 0
1139 && conf->words_marks[lo][txt.len] == 0)
1145 ### Bringing it all together
1147 Now we have all the bits there is just one section missing: combining
1148 all the token parsing code into one block.
1150 The handling of delayed tokens (Newlines, INs, OUTs) must come
1151 first before we try getting another character.
1153 Then we parse all the text, making sure that we check for known marks
1154 before strings and comments, but unknown marks after strings and comments.
1156 This block of code will either return a token, or will choose to
1157 ignore one, in which case it will `continue` around to the top of the
1163 ch = get_char(state);
1172 As well as getting tokens, we need to be able to create the
1173 `token_state` to start with, and discard it later.
1178 ###### main functions
1179 struct token_state *token_open(struct code_node *code, struct
1182 struct token_state *state = malloc(sizeof(*state));
1183 memset(state, 0, sizeof(*state));
1185 state->line = code->line_no;
1186 state->col = do_strip(state);
1190 void token_close(struct token_state *state)
1195 ###### exported functions
1196 struct token_state *token_open(struct code_node *code, struct
1197 token_config *conf);
1198 void token_close(struct token_state *state);
1202 Getting tokens is the main thing but it is also useful to be able to
1203 print out token information, particularly for tracing and testing.
1205 Known tokens are printed verbatim. Other tokens are printed as
1206 `type(content)` where content is truncated to a given number of characters.
1208 The function for printing a truncated string (`text_dump`) is also exported
1209 so that it can be used to tracing processed strings too.
1214 ###### exported functions
1215 void token_trace(FILE *f, struct token tok, int max);
1216 void text_dump(FILE *f, struct text t, int max);
1218 ###### main functions
1220 void text_dump(FILE *f, struct text txt, int max)
1227 for (i = 0; i < max; i++) {
1228 char c = txt.txt[i];
1229 if (c < ' ' || c > '~')
1230 fprintf(f, "\\x%02x", c & 0xff);
1234 fprintf(f, "%c", c);
1240 void token_trace(FILE *f, struct token tok, int max)
1242 static char *types[] = {
1243 [TK_ident] = "ident",
1245 [TK_number] = "number",
1246 [TK_string] = "string",
1247 [TK_multi_string] = "mstring",
1248 [TK_line_comment] = "lcomment",
1249 [TK_block_comment] = "bcomment",
1252 [TK_newline] = "newline",
1254 [TK_error] = "ERROR",
1258 default: /* known word or mark */
1259 fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1265 /* No token text included */
1266 fprintf(f, "%s()", types[tok.num]);
1272 case TK_multi_string:
1273 case TK_line_comment:
1274 case TK_block_comment:
1276 fprintf(f, "%s(", types[tok.num]);
1277 text_dump(f, tok.txt, max);
1283 ### And there we have it
1285 We now have all the library functions defined for reading and printing
1286 tokens. Now we just need C files to store them, and a mk file to make them.
1288 ###### File: scanner.h
1290 ## exported functions
1292 ###### File: libscanner.c
1294 #include "scanner.h"
1296 ## internal functions
1299 ###### File: scanner.mk
1303 scanner.mk scanner.h libscanner.c : scanner.mdc
1306 libscanner.o : libscanner.c
1307 $(CC) $(CFLAGS) -c libscanner.c
1309 ## Processing numbers
1311 Converting a `TK_number` token to a numerical value is a slightly
1312 higher level task than lexical analysis, and slightly lower than
1313 grammar parsing, so put it here - as an index if you like.
1315 Importantly it will be used by the same testing rig that is used for
1316 testing the token scanner.
1318 The numeric value that we will convert all numbers into is the `mpq_t`
1319 from the GNU high precision number library "libgmp".
1321 ###### number includes
1325 Firstly we need to be able to parse a string of digits in a given base
1326 and possibly with a decimal marker. We store this in an `mpz_t`
1327 integer and report the number of digits after the decimal mark.
1329 On error we return zero and ensure that the `mpz_t` has been freed, or
1330 had never been initialised.
1332 ###### number functions
1334 static int parse_digits(mpz_t num, struct text tok, int base,
1337 /* Accept digits up to 'base', ignore '_' and
1338 * ' ' if they appear between two legal digits,
1339 * and if `placesp` is not NULL, allow a single
1340 * '.' or ',' and report the number of digits
1342 * Return number of characters processed (p),
1343 * or 0 if something illegal was found.
1346 int decimal = -1; // digits after marker
1347 enum {Digit, Space, Other} prev = Other;
1350 for (p = 0; p < tok.len; p++) {
1352 char c = tok.txt[p];
1354 if (c == '_' || c == ' ') {
1360 if (c == '.' || c == ',') {
1363 if (!placesp || decimal >= 0)
1371 else if (isupper(c))
1373 else if (islower(c))
1384 mpz_mul_ui(num, num, base);
1388 mpz_add_ui(num, num, dig);
1407 ###### number includes
1410 To parse a full number we need to consider the optional base, the
1411 mantissa, and the optional exponent. We will treat these one at a
1414 The base is indicated by a letter after a leading zero, which must be
1415 followed by a base letter or a period. The base also determines the
1416 character which will mark an exponent.
1424 if (tok.txt[0] == '0' && tok.len > 1) {
1426 switch(tok.txt[1]) {
1457 // another digit is not permitted
1461 // must be decimal marker or trailing
1462 // letter, which are OK;
1469 After the base is the mantissa, which may contain a decimal mark, so
1470 we need to record the number of places. We won't impose the number of
1471 places until we have the exponent as well.
1478 ###### parse mantissa
1480 d = parse_digits(mant, tok, base, &places);
1486 mpq_set_z(num, mant);
1489 After the mantissa number may come an exponent which may be positive
1490 or negative. We assume at this point that we have seen the exponent
1498 ###### parse exponent
1500 if (tok.txt[0] == '+') {
1503 } else if (tok.txt[0] == '-') {
1509 d = parse_digits(exp, tok, 10, NULL);
1514 if (!mpz_fits_slong_p(exp)) {
1519 lexp = mpz_get_si(exp) * esign;
1524 Now that we have the mantissa and the exponent we can multiply them
1525 together, also allowing for the number of digits after the decimal
1528 For base 10, we simply subtract the decimal places from the exponent.
1529 For the other bases, as the exponent is always based on 2, even for
1530 octal and hex, we need a bit more detail.
1531 We then recover the sign from the exponent, as division is quite
1532 different from multiplication.
1534 ###### calc exponent
1553 Imposing the exponent on the number is also very different for base 10
1554 than for the others. For the binary shift `gmp` provides a simple
1555 function. For base 10 we use something like Russian Peasant
1558 ###### calc exponent
1562 mpq_set_ui(tens, 10, 1);
1566 mpq_mul(num, num, tens);
1568 mpq_div(num, num, tens);
1573 mpq_mul(tens, tens, tens);
1578 mpq_mul_2exp(num, num, lexp);
1580 mpq_div_2exp(num, num, lexp);
1583 Now we are ready to parse a number: the base, mantissa, and exponent.
1584 If all goes well we check for the possible trailing letters and
1585 return. Return value is 1 for success and 0 for failure.
1587 ###### number functions
1588 int number_parse(mpq_t num, char tail[3], struct text tok)
1595 if (tok.len > 1 && (tok.txt[0] == expc ||
1596 tok.txt[0] == toupper(expc))) {
1603 for (i = 0; i < 2; i++) {
1606 if (!isalpha(tok.txt[i]))
1608 tail[i] = tok.txt[i];
1618 Number parsing goes in `libnumber.c`
1620 ###### File: libnumber.c
1628 ###### File: number.h
1629 int number_parse(mpq_t num, char tail[3], struct text tok);
1631 ###### File: scanner.mk
1633 libnumber.o : libnumber.c
1634 $(CC) $(CFLAGS) -c libnumber.c
1636 ## Processing strings
1638 Both `TK_string` and `TK_multi_string` require post-processing which
1639 can be one of two types: literal or with escapes processed.
1640 Even literal processing is non-trivial as the file may contain indents
1641 which need to be stripped.
1643 Errors can only occur when processing escapes. Any unrecognised
1644 character following the escape character will cause an error.
1646 Processing escapes and stripping indents can only make the string
1647 shorter, not longer, so we allocate a buffer which is the same size as
1648 the string and process into that.
1650 To request escape processing, we pass the character we want to use for
1651 quoting, usually '`\`'. To avoid escape processing we pass a zero.
1654 int string_parse(struct token *tok, char escape,
1655 struct text *str, char tail[3])
1658 struct text t = tok->txt;
1662 if (tok->num == TK_string) {
1667 str->txt = malloc(t.len);
1680 The tail of the string can be 0, 1, or 2 letters
1683 if (i > 0 && isalpha(t.txt[i-1]))  /* i > 0: i-1 would index before the buffer when i == 0 */
1685 if (i > 0 && isalpha(t.txt[i-1]))  /* same bound fix for the optional second tail letter */
1687 strncpy(tail, t.txt+i, t.len-i);
1696 Stripping the quotes off a single-line string is trivial.
1697 The only part that is at all interesting is that the quote character must
1701 if (t.txt[t.len-1] != quote)
1711 For a multi-line string we have a little more work to do. We need to
1712 remove 3 quotes, not 1, and need to count the indent of the close
1713 quote as it will need to be stripped from all lines.
1717 t.txt[1] != quote || t.txt[2] != quote ||
1718 !is_newline(t.txt[3]))
1723 if (i <= 0 || t.txt[i-1] != quote)
1726 if (i <= 0 || t.txt[i-1] != quote)
1729 if (i <= 0 || t.txt[i-1] != quote)
1733 while (i > 0 && !is_newline(t.txt[i-1]))
1737 if (t.txt[i] == ' ')
1739 if (t.txt[i] == '\t')
1740 indent = indent_tab(indent);
1749 Now we just take one byte at a time. trans-ASCII unicode won't look
1750 like anything we are interested in so it will just be copied byte by
1755 for (i = 0; i < t.len; i++) {
1769 } else if (i+1 >= t.len) {
1770 // escape and end of string
1778 str->len = cp - str->txt;
1786 Every time we find a start of line, we strip spaces and tabs until the
1787 required indent is found.
1790 while (i < t.len && skipped < indent) {
1795 skipped = indent_tab(skipped);
1804 *cp++ = '\n'; break;
1806 *cp++ = '\r'; break;
1808 *cp++ = '\t'; break;
1810 *cp++ = '\b'; break;
1812 *cp++ = quote; break;
1814 *cp++ = '\f'; break;
1816 *cp++ = '\v'; break;
1818 *cp++ = '\a'; break;
1823 // 3 digit octal number
1826 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1827 t.txt[i+2] < '0' || t.txt[i+2] > '7')  /* was t.txt[i+1]: third octal digit never range-checked */
1829 n = (t.txt[i ]-'0') * 64 +
1830 (t.txt[i+1]-'0') * 8 +
1831 (t.txt[i+2]-'0') * 1;
1837 n = take_hex(2, t.txt+i+1, t.len-i-1);
1845 // 4 or 8 hex digits for unicode
1846 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1849 memset(&pstate, 0, sizeof(pstate));
1850 n = wcrtomb(cp, n, &pstate);
1854 i += c == 'u' ? 4 : 8;
1859 else if (is_newline(c))
1869 For `\x` `\u` and `\U` we need to collect a specific number of
1872 ###### string functions
1874 static long take_hex(int digits, char *cp, int l)
1886 else if (isupper(c))
1897 ###### File: libstring.c
1899 String parsing goes in `libstring.c`
1908 #include "scanner.h"
1912 ###### File: string.h
1913 int string_parse(struct token *tok, char escape,
1914 struct text *str, char tail[3]);
1916 ###### File: scanner.mk
1918 libstring.o : libstring.c
1919 $(CC) $(CFLAGS) -c libstring.c
1923 As "untested code is buggy code" we need a program to easily test
1924 the scanner library. This will simply parse a given file and report
1925 the tokens one per line.
1927 ###### File: scanner.c
1933 #include <sys/mman.h>
1940 #include "scanner.h"
1945 static void pr_err(char *msg)
1948 fprintf(stderr, "%s\n", msg);
1951 static int kcmp(const void *ap, const void *bp)
1953 char * const *a = ap;
1954 char * const *b = bp;
1955 return strcmp(*a, *b);
1958 int main(int argc, char *argv[])
1963 char *filename = NULL;
1964 struct token_state *state;
1965 const char *known[] = {
1974 struct token_config conf = {
1977 .words_marks = known,
1978 .number_chars = "., _+-",
1979 .known_count = sizeof(known)/sizeof(known[0]),
1982 static const struct option long_options[] = {
1983 { "word-start", 1, NULL, 'W'},
1984 { "word-cont", 1, NULL, 'w'},
1985 { "number-chars", 1, NULL, 'n'},
1986 { "ignore-numbers", 0, NULL, 'N'},
1987 { "ignore-ident", 0, NULL, 'I'},
1988 { "ignore-marks", 0, NULL, 'M'},
1989 { "ignore-strings", 0, NULL, 'S'},
1990 { "ignore-multi-strings",0, NULL, 'z'},
1991 { "ignore-line-comment",0, NULL, 'c'},
1992 { "ignore-newline", 0, NULL, 'l'},
1993 { "ignore-block-comment", 0, NULL, 'C'},
1994 { "ignore-indent", 0, NULL, 'i'},
1995 { "file", 1, NULL, 'f'},
1996 { NULL, 0, NULL, 0},
1998 static const char options[] = "W:w:n:NIMSzclCif:";
2000 struct section *table, *s, *prev;
2003 setlocale(LC_ALL,"");
2004 while ((opt = getopt_long(argc, argv, options, long_options, NULL))
2007 case 'W': conf.word_start = optarg; break;
2008 case 'w': conf.word_cont = optarg; break;
2009 case 'n': conf.number_chars = optarg; break;
2010 case 'N': conf.ignored |= 1 << TK_number; break;
2011 case 'I': conf.ignored |= 1 << TK_ident; break;
2012 case 'M': conf.ignored |= 1 << TK_mark; break;
2013 case 'S': conf.ignored |= 1 << TK_string; break;
2014 case 'z': conf.ignored |= 1 << TK_multi_string; break;
2015 case 'c': conf.ignored |= 1 << TK_line_comment; break;
2016 case 'C': conf.ignored |= 1 << TK_block_comment; break;
2017 case 'l': conf.ignored |= 1 << TK_newline; break;
2018 case 'i': conf.ignored |= 1 << TK_in; break;
2019 case 'f': filename = optarg; break;
2020 default: fprintf(stderr, "scanner: unknown option '%c'.\n",
2026 if (optind < argc) {
2027 const char **wm = calloc(argc - optind, sizeof(char*));
2029 for (i = optind; i < argc; i++)
2030 wm[i - optind] = argv[i];
2031 qsort(wm, argc-optind, sizeof(char*), kcmp);
2032 conf.words_marks = wm;
2033 conf.known_count = argc - optind;
2037 fd = open(filename, O_RDONLY);
2041 fprintf(stderr, "scanner: cannot open %s: %s\n",
2042 filename, strerror(errno));
2045 len = lseek(fd, 0, 2);
2047 fprintf(stderr,"scanner: %s is empty or not seekable\n",
2048 filename ?: "stdin");
2051 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
2052 table = code_extract(file, file+len, pr_err);
2055 (code_free(s->code), prev = s, s = s->next, free(prev))) {
2056 printf("Tokenizing: %.*s\n", s->section.len,
2058 state = token_open(s->code, &conf);
2060 struct token tk = token_next(state);
2061 printf("%d:%d ", tk.line, tk.col);
2062 token_trace(stdout, tk, 20);
2063 if (tk.num == TK_number) {
2066 if (number_parse(num, tail,tk.txt)) {
2067 printf(" %s ", tail);
2068 mpq_out_str(stdout, 10, num);
2071 printf(" BAD NUMBER");
2073 if (tk.num == TK_string ||
2074 tk.num == TK_multi_string) {
2078 if (tk.txt.txt[0] == '`')
2080 if (string_parse(&tk, esc,
2082 printf(" %s ", tail);
2083 text_dump(stdout, str, 20);
2086 printf(" BAD STRING");
2089 if (tk.num == TK_error)
2091 if (tk.num == TK_eof)
2096 if (conf.words_marks != known)
2097 free(conf.words_marks);
2100 ###### File: scanner.mk
2101 scanner.c : scanner.mdc
2104 scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2105 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2106 libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2107 scanner.o : scanner.c
2108 $(CC) $(CFLAGS) -c scanner.c