ocean-lang.org Git - ocean/blob - csrc/scanner.mdc

   1 # Lexical Scanner #
   2
   3 ## The Task at Hand ##
   4
   5 The main task of the lexical scanner is to convert a stream of
   6 characters into a stream of tokens.  The tokens are then typically
   7 used by a parser to extract the syntactic structure.
   8
   9 The stream of characters are assumed to be in memory identified by a
  10 linked list of blocks, such as provided by the "[mdcode][]" literate
  11 program extractor.  A single token may never cross a block boundary.
  12
  13 [mdcode]: mdcode.html
  14
  15 ###### includes
  16         #include "mdcode.h"
  17
  18 The text is assumed to be UTF-8 though some matching assumes the
  19 ASCII subset.  If the text provided does not conform to UTF-8 an error
  20 will be reported and some number of bytes will be skipped.
  21
  22 ###### public types
  23         #include <wchar.h>
  24         #include <wctype.h>
  25         #include <unicode/uchar.h>
  26
  27 Tokens are returned by successive calls to the main interface
  28 function: `token_next()` which has a `state` structure to keep track
  29 of where it is up to.  Each token carries not just a numeric
  30 identifier but also the code block, the line and character within that
  31 block, and the actual start and length using the `struct text` from
  32 "mdcode".
  33
  34 ###### public types
  35         struct token {
  36                 int               num;
  37                 struct code_node *node;
  38                 struct text       txt;
  39                 int               line, col;
  40         };
  41         struct token_state;
  42
  43 ###### private types
  44         struct token_state {
  45                 ## state fields
  46         };
  47
  48 ###### exported functions
  49         struct token token_next(struct token_state *state);
  50
  51 ###### main functions
  52         struct token token_next(struct token_state *state)
  53         {
  54                 ## token_next init
  55                 while (1) {
  56                         wint_t ch;
  57                         struct token tk;
  58
  59                         ## one token
  60                 }
  61         }
  62
  63 The `line` and `col` offsets are useful for reporting errors.
  64 The `txt` provides the content when that is important.
  65
  66 ### Token types and configuration ##
  67
  68 The scanner is not completely general, yet not completely specified.
  69 There are a fixed set of token types, though particular tokens within
  70 those types can be distinguish via configuration.
  71
  72 Most token types may be explicitly ignored, as typically comments
  73 would be.  The exact consequence of ignoring each token type varies
  74 from token to token.
  75
  76 ###### public types
  77         struct token_config {
  78                 int ignored;    // bit set of ignored tokens.
  79                 ## token config parameters
  80         };
  81
  82 ###### state fields
  83         struct token_config *conf;
  84
  85 ###### token_next init
  86         int ignored = state->conf->ignored;
  87
  88
  89 The different tokens are numbers, words, marks, strings, comments,
  90 newlines, EOF, and indents, each of which is examined in detail below.
  91
  92 There are various cases where no token can be found in part of the
  93 input.  All of these will be reported as a `TK_error` token.
  94
  95 It is possible to declare a number of strings which form distinct
  96 tokens (rather than being grouped as e.g. 'word').  These are given
  97 token numbers from `TK_reserved` upwards.
  98
  99 ###### public types
 100         enum token_num {
 101                 TK_error,
 102                 ## token types
 103                 TK_reserved
 104         };
 105
 106 ### Numbers
 107
 108 Numbers are the messiest tokens to parse, primarily because they can
 109 contain characters that also have meaning outside of number and,
 110 particularly, immediately after numbers.
 111
 112 The obvious example is the '`-`' sign.  It can come inside a number for
 113 a negative exponent, or after a number as a subtraction operator.  To
 114 be sure we have parsed as best as possible we need to only allow the
 115 '`-`' inside a number if it is after an exponent character.  This can be
 116 `e` or `p` (for hex exponents), but `e` can also be a hexadecimal
 117 digit, so we don't allow '`-`' after just any `e`.
 118
 119 To make matters worse, our language designer has decided to experiment
 120 with allowing commas to be used as the decimal indicator, and spaces
 121 to be used to separate groups of digits in large numbers.  Both of
 122 these can reasonably be restricted to appear between two digits, so we
 123 have to add that condition to our tests.
 124
 125 So we cannot just treat numbers as starting with a digit and being
 126 followed by some set of characters.  We need more structure than that.
 127
 128 So:
 129
 130 - Numbers must start with a digit.
 131 - If the first digit is zero, the next character must be a base
 132   signifier (one of `xob`) or a decimal marker (`.` or `,`).
 133   In the first case the first `p` or `P` may be followed by a sign.
 134 - If the number doesn't start with `0` followed by one of `xob`, the
 135   first `e` may be followed by a sign.
 136 - Any digit or hex digit may be followed by a space or underscore
 137   providing that the subsequence character is also a (hex) digit.
 138   This rule will require an extra level of 'unget' to be
 139   supported when handling characters.
 140 - Otherwise any digits or ASCII letters are allowed.  We do not at
 141   this point check that the digits given are permitted by the base.
 142   That will happen when the token is converted to a number.
 143
 144 To allow easy configuration, the various non alphanumeric characters
 145 are only permitted if they are listed in a configuration parameter.
 146
 147 ###### token config parameters
 148         char *number_chars;
 149
 150 Note that numbers may not start with a period, so `.75` is not a
 151 number.  This is not the norm, but is not unheard of.  Excluding these
 152 numbers simplifies the rule at very little cost.
 153
 154 ###### token types
 155         TK_number,
 156
 157 If TK_number is ignored, digits will result in an error unless they
 158 are declared to be a start character for words.
 159
 160 ###### includes
 161
 162         #include <string.h>
 163
 164 ###### parse number
 165
 166         if (iswdigit(ch) && !(ignored & (1<<TK_number))) {
 167                 int prev_special = 0;
 168                 int expect_p = 0;
 169                 int decimal_mark = 0;
 170                 if (ch == '0') {
 171                         wchar_t ch2 = get_char(state);
 172                         if (strchr("xobXOB", ch2) != NULL)
 173                                 expect_p = 1;
 174                         unget_char(state);
 175                 }
 176                 while (1) {
 177                         int sign_ok = 0;
 178                         switch(expect_p) {
 179                         case 0:
 180                                 if (ch == 'e' || ch == 'E')
 181                                         sign_ok = 1;
 182                                 break;
 183                         case 1:
 184                                 if (ch == 'p' || ch == 'P')
 185                                         sign_ok = 1;
 186                                 break;
 187                         }
 188                         save_unget_state(state);
 189                         ch = get_char(state);
 190                         if (iswalnum(ch)) {
 191                                 prev_special = 0;
 192                                 continue;
 193                         }
 194                         if (ch == '+' || ch == '-') {
 195                                 if (!sign_ok)
 196                                         break;
 197                                 expect_p = -1;
 198                         }
 199                         if (ch == '.' || ch == ',') {
 200                                 if (decimal_mark)
 201                                         break;
 202                                 decimal_mark = 1;
 203                         }
 204                         if (prev_special) {
 205                                 /* Don't allow that special char,
 206                                  * need two 'ungets'
 207                                  */
 208                                 restore_unget_state(state);
 209                                 break;
 210                         }
 211                         if (strchr(state->conf->number_chars, ch)) {
 212                                 prev_special = 1;
 213                                 continue;
 214                         }
 215                         /* non-number char */
 216                         break;
 217                 }
 218                 /* We seem to have a "number" token */
 219                 unget_char(state);
 220                 close_token(state, &tk);
 221                 tk.num = TK_number;
 222                 return tk;
 223         }
 224
 225 ### Words
 226 Words start with a "start" character followed by the longest
 227 sequence of "continue" characters.  The Unicode ID_START and
 228 ID_CONTINUE sets are always permitted, but other ASCII characters
 229 can be added to these sets.
 230
 231 ###### token config parameters
 232         char *word_start;
 233         char *word_cont;
 234
 235 ###### internal functions
 236         static int is_word_start(wchar_t ch, struct token_config *conf)
 237         {
 238                 return iswalpha(ch) ||
 239                        strchr(conf->word_start, ch) != NULL ||
 240                        u_hasBinaryProperty(ch, UCHAR_ID_START);
 241         }
 242
 243         static int is_word_continue(wchar_t ch, struct token_config *conf)
 244         {
 245                 return iswalnum(ch) ||
 246                        strchr(conf->word_cont, ch) != NULL ||
 247                        u_hasBinaryProperty(ch, UCHAR_ID_CONTINUE);
 248         }
 249
 250 Words can be either known or unknown.  Known words are referred to as
 251 "reserved words" and get a unique token number.  Unknown words are
 252 "identifiers" and are syntactically a single token.
 253
 254 ###### token types
 255         TK_ident,
 256
 257 A list of known words must be provided.  This list is shared with the
 258 "marks" which are described next.  The list must be lexically sorted
 259 and the length of the list must be given (`known_count`).
 260 Tokens matching these known words are reported as the index of the
 261 list added to `TK_reserved`.
 262
 263 If identifiers are ignored, then any work which is not listed as a
 264 known word results in an error.
 265
 266 ###### token config parameters
 267         const char **words_marks;
 268         int known_count;
 269
 270 ###### parse word
 271
 272         if (is_word_start(ch, state->conf)) {
 273                 int n;
 274                 /* A word: identifier or reserved */
 275                 do
 276                         ch = get_char(state);
 277                 while (is_word_continue(ch, state->conf));
 278                 unget_char(state);
 279                 close_token(state, &tk);
 280                 tk.num = TK_ident;
 281                 if (ignored & (1<<TK_ident))
 282                         tk.num = TK_error;
 283                 n = find_known(state->conf, tk.txt);
 284                 if (n >= 0)
 285                         tk.num = TK_reserved + n;
 286                 return tk;
 287         }
 288
 289 ### Marks
 290
 291 Marks are generally one or more punctuation marks joined together.  It
 292 would be nice to use the term "symbol" for these, but that causes
 293 confusion in a subsequent discussion of the grammar, which has terminal
 294 symbols and non-terminal symbols which are conceptually quite
 295 different.  So strings of punctuation characters will be marks.
 296
 297 A "mark" consists of ASCII characters that are not white space, are not
 298 "start" characters for words, and are not digits.
 299 These will collectively be called mark characters.
 300
 301 ###### internal functions
 302         static int is_mark(wchar_t ch, struct token_config *conf)
 303         {
 304                 return ch > ' ' &&
 305                        ch < 0x7f &&
 306                        !iswalnum(ch) &&
 307                        strchr(conf->word_start, ch) == NULL;
 308         }
 309
 310 As with words, there can be known and unknown marks, though the rules
 311 are slightly different.
 312
 313 Two marks do not need to be separated by a non-mark characters.  This
 314 is different from words which do need to be separated by at least one
 315 non-continue character.
 316
 317 The scanner will normally prefer longer sequences of mark characters,
 318 but will more strongly prefer known marks over unknown marks.  So if
 319 it finds a known mark where adding one more character does not result
 320 in a known mark, it will return that first known mark.
 321
 322 If no known mark is found we will test against strings and comments
 323 below before giving up and assuming an unknown mark.
 324
 325 If an unknown mark contains a quote character or a comment marker, and
 326 that token is not being ignored, then we terminate the unknown mark
 327 before that quote or comment.  This ensure that an unknown mark
 328 immediately before a string is handled correctly.
 329
 330 If `TK_mark` is ignored, then unknown marks as returned as an error.
 331
 332 ###### token types
 333         TK_mark,
 334
 335 Known marks are included in the same list as the list of known words.
 336
 337 ###### parse mark
 338         tk.num = TK_error;
 339         while (is_mark(ch, state->conf)) {
 340                 int n;
 341                 wchar_t prev;
 342                 close_token(state, &tk);
 343                 n = find_known(state->conf, tk.txt);
 344                 if (n >= 0)
 345                         tk.num = TK_reserved + n;
 346                 else if (tk.num != TK_error) {
 347                         /* found a longest-known-mark, still need to
 348                          * check for comments
 349                          */
 350                         if (tk.txt.len == 2 && tk.txt.txt[0] == '/' &&
 351                             (ch == '/' || ch == '*')) {
 352                                 /* Yes, this is a comment, not a '/' */
 353                                 restore_unget_state(state);
 354                                 tk.num = TK_error;
 355                                 break;
 356                         }
 357                         unget_char(state);
 358                         close_token(state, &tk);
 359                         return tk;
 360                 }
 361                 prev = ch;
 362                 save_unget_state(state);
 363                 ch = get_char(state);
 364                 if (!(ignored && (1<<TK_string)) && is_quote(ch))
 365                         break;
 366                 if (prev == '#' && n < 0)
 367                         /* '#' is not a known mark, so assume it is a comment */
 368                         break;
 369                 if (prev == '/' && ch == '/' && tk.txt.len == 1 && n < 0) {
 370                         close_token(state, &tk);
 371                         restore_unget_state(state);
 372                         break;
 373                 }
 374                 if (prev == '/' && ch == '*' && tk.txt.len == 1 && n < 0) {
 375                         close_token(state, &tk);
 376                         restore_unget_state(state);
 377                         break;
 378                 }
 379         }
 380         unget_char(state);
 381         if (tk.num != TK_error) {
 382                 close_token(state, &tk);
 383                 return tk;
 384         }
 385
 386 If we don't find a known mark, we will check for strings and comments
 387 before assuming that we have an unknown mark
 388
 389 ###### parse mark
 390         ## parse string
 391         ## parse comment
 392         ## unknown mark
 393
 394 ###### unknown mark
 395         if (tk.txt.len) {
 396                 if (ignored & (1<<TK_mark))
 397                         tk.num = TK_error;
 398                 else
 399                         tk.num = TK_mark;
 400                 return tk;
 401         }
 402
 403 ### Strings
 404
 405 Strings start with one of single quote, double quote, or back quote
 406 and continue until a matching character on the same line.  Any of
 407 these characters can be included in the list of known marks and then
 408 they will not be used for identifying strings.
 409
 410 Immediately following the close quote one or two ASCII letters may
 411 appear.  These are somewhat like the arbitrary letters allowed in
 412 "Numbers" above.  They can be used by the language in various ways.
 413
 414 If 3 identical quote characters appear in a row and are
 415 followed by a newline, then this forms a multi-line string which
 416 continues until an identical triple quote appears on a line preceded
 417 only by whitespace and followed immediately by 0-2 ASCII letters and a newline.
 418
 419 Multi-line strings may not extend beyond the end of the `code_node` in
 420 which they start.
 421
 422 Normal strings and multi-line strings are encoded as two different
 423 token types.
 424
 425 ###### token types
 426         TK_string,
 427         TK_multi_string,
 428
 429 ###### internal functions
 430         static int is_quote(wchar_t ch)
 431         {
 432                 return ch == '\'' || ch == '"' || ch == '`';
 433         }
 434
 435 #### Multi-line strings
 436
 437 The multi-line string is checked for first.  If they are being
 438 ignored, we fall through and treat a triple quote as an empty string
 439 followed by the start of a new string.
 440
 441 ###### parse string
 442         if (tk.txt.len == 3 &&
 443             !(ignored & (1 << TK_multi_string)) &&
 444             is_quote(tk.txt.txt[0]) &&
 445             memcmp(tk.txt.txt, tk.txt.txt+1, 2) == 0 &&
 446             is_newline(tk.txt.txt[3])) {
 447                 // triple quote
 448                 wchar_t first = tk.txt.txt[0];
 449                 int qseen = 0;
 450                 int at_sol = 1;
 451                 while (!at_eon(state) && qseen < 3) {
 452                         ch = get_char(state);
 453                         if (is_newline(ch)) {
 454                                 at_sol = 1;
 455                                 qseen = 0;
 456                         } else if (at_sol && ch == first) {
 457                                 qseen += 1;
 458                         } else if (ch != ' ' && ch != '\t') {
 459                                 at_sol = 0;
 460                                 qseen = 0;
 461                         }
 462                 }
 463                 if (qseen != 3) {
 464                         /* Hit end of node - error.
 465                          * unget so the newline is seen,
 466                          * but return rest of string as an error.
 467                          */
 468                         if (is_newline(ch))
 469                                 unget_char(state);
 470                         close_token(state, &tk);
 471                         tk.num = TK_error;
 472                         return tk;
 473                 }
 474                 /* 2 letters are allowed */
 475                 ch = get_char(state);
 476                 if (iswalpha(ch))
 477                         ch = get_char(state);
 478                 if (iswalpha(ch))
 479                         ch = get_char(state);
 480                 /* Now we must have a newline, but we don't return it
 481                  * whatever it is.*/
 482                 unget_char(state);
 483                 close_token(state, &tk);
 484                 tk.num = TK_multi_string;
 485                 if (!is_newline(ch))
 486                         tk.num = TK_error;
 487                 return tk;
 488         }
 489
 490 #### Single-line strings
 491
 492 The sequence of marks collected may be more than a single-line
 493 string, so we reset to the start and collect characters until
 494 we find a close quote or a newline.
 495
 496 If `TK_string` is ignored, then quote characters will appear as `TK_mark`s.
 497
 498 ###### parse string
 499         if (tk.txt.len && is_quote(tk.txt.txt[0]) &&
 500             !(ignored & (1<<TK_string))) {
 501                 wchar_t first = tk.txt.txt[0];
 502                 reset_token(state, &tk);
 503                 ch = get_char(state);
 504                 tk.num = TK_error;
 505                 while (!at_eon(state) && !is_newline(ch)) {
 506                         ch = get_char(state);
 507                         if (ch == first) {
 508                                 tk.num = TK_string;
 509                                 break;
 510                         }
 511                         if (is_newline(ch)) {
 512                                 unget_char(state);
 513                                 break;
 514                         }
 515                 }
 516                 close_token(state, &tk);
 517                 return tk;
 518         }
 519
 520 ### Comments
 521
 522 Single line comments may start with '`//`' or '`#`' providing that these
 523 are not known marks.  They continue to the end of the line.
 524
 525 Block comments start with '`/*`' if this is not a known mark.  They
 526 continue to the first occurrence of '`*/`' and may not contain any
 527 occurrence of '`/*`'.
 528
 529 Block comments can be wholly within one line or can continue over
 530 multiple lines.  The multi-line version should be followed immediately
 531 by a newline.  The Linux kernel contains over 285000 multi-line
 532 comments are only 34 are followed by characters other than white space
 533 (which should be removed) or a backslash (only needed in macros).  So
 534 it would not suffer from this rule.
 535
 536 These two comment types are reported as two separate token types, and
 537 consequently can be ignored separately.  When ignored a comment is
 538 still parsed, but is discarded.
 539
 540 ###### token types
 541         TK_line_comment,
 542         TK_block_comment,
 543
 544 ###### internal functions
 545         static int is_line_comment(struct text txt)
 546         {
 547                 return (txt.len >= 1 && txt.txt[0] == '#') ||
 548                        (txt.len >= 2 && txt.txt[0] == '/' &&
 549                                         txt.txt[1] == '/');
 550         }
 551
 552         static int is_block_comment(struct text txt)
 553         {
 554                 return txt.len >= 2 && txt.txt[0] == '/' &&
 555                        txt.txt[1] == '*';
 556         }
 557
 558 #### Single line comments
 559
 560 A single-line comment continues up to, but not including the newline
 561 or end of node.
 562
 563 ###### parse comment
 564
 565         if (is_line_comment(tk.txt)) {
 566                 while (!is_newline(ch) && !at_eon(state))
 567                         ch = get_char(state);
 568                 if (is_newline(ch))
 569                         unget_char(state);
 570                 close_token(state, &tk);
 571                 tk.num = TK_line_comment;
 572                 if (ignored & (1 << TK_line_comment))
 573                         continue;
 574                 return tk;
 575         }
 576
 577 #### Block comments
 578
 579 The token text collected so far could exceed the comment, so we need
 580 to reset it first.
 581
 582 If we find an embedded `/*` we reset to just before the '/' and report
 583 an error.  That way the next thing to be parsed will be the rest of
 584 the comment.  This requires a double unget, so we need to save/restore
 585 the unget state (explained later).
 586
 587 ###### parse comment
 588
 589         if (is_block_comment(tk.txt)) {
 590                 wchar_t prev;
 591                 int newlines = 0;
 592                 reset_token(state, &tk);
 593                 get_char(state);
 594                 get_char(state);
 595                 save_unget_state(state);
 596                 ch = get_char(state);
 597                 prev = 0;
 598                 while (!at_eon(state) &&
 599                        (prev != '/' || ch != '*') &&
 600                        (prev != '*' || ch != '/')) {
 601                         if (is_newline(ch))
 602                                 newlines = 1;
 603                         prev = ch;
 604                         save_unget_state(state);
 605                         ch = get_char(state);
 606                 }
 607                 close_token(state, &tk);
 608                 if (at_eon(state)) {
 609                         tk.num = TK_error;
 610                         return tk;
 611                 }
 612                 if (prev == '/') {
 613                         /* embedded.  Need to unget twice! */
 614                         restore_unget_state(state);
 615                         unget_char(state);
 616                         tk.num = TK_error;
 617                         return tk;
 618                 }
 619                 tk.num = TK_block_comment;
 620                 if (newlines && !(ignored & (1<<TK_newline))) {
 621                         /* next char must be newline */
 622                         ch = get_char(state);
 623                         unget_char(state);
 624                         if (!is_newline(ch))
 625                                 tk.num = TK_error;
 626                 }
 627                 if (tk.num == TK_error ||
 628                     !(ignored & (1 << TK_block_comment)))
 629                         return tk;
 630                 continue;
 631         }
 632
 633 ### Indents, Newlines, and White Space.
 634
 635 Normally white space is ignored.  However newlines can be important as
 636 can indents, which are either after a newline or at the start of a
 637 node (detected by `at_son()`);
 638
 639 ###### exported functions
 640         static inline int is_newline(wchar_t ch)
 641         {
 642                 return ch == '\n' || ch == '\f' || ch == '\v';
 643         }
 644
 645 ###### white space
 646         if (ch <= ' ' && !is_newline(ch)
 647             && ! at_son(state))
 648                 continue;
 649
 650 If a line starts with more white-space than the previous non-blank
 651 line - or if the first non-blank line in the document starts with any
 652 white-space - then an "IN" is reported at the start of the line.
 653
 654 Before the next non-blank line which starts with less white space, or
 655 at the latest at the end of the document, a matching "OUT" token
 656 is reported.  There will always be an exact match between "IN" and
 657 "OUT" tokens.
 658
 659 It is possible for "OUT" to be followed (almost) immediately by an
 660 "IN".  This happens if, for example, the indent of three consecutive
 661 lines are 0, 8, 4 spaces.  Before the second line we report an
 662 "IN".  Before the third line we must report an "OUT", as 4 is less
 663 than 8, then also an Ident as 4 is greater than 0.
 664
 665 ###### token types
 666         TK_in,
 667         TK_out,
 668
 669 For the purpose of measuring the length of white space, a tab adds at
 670 least one space, and rounds up to a multiple of 8.
 671
 672 ###### exported functions
 673         static inline int indent_tab(int indent)
 674         {
 675                 return (indent|7)+1;
 676         }
 677
 678 We need to track the current levels of indent.  This requires some
 679 sort of stack as indent levels are pushed on and popped off.  In
 680 practice this stack is unlikely to often exceed 5 so we will used a
 681 fixed stack of 20 indent levels.  More than this will be silently
 682 ignored.
 683
 684 ###### state fields
 685         int     indent_level;
 686         int     indent_sizes[20];
 687
 688 #### Newlines
 689
 690 Newlines can optionally be reported.  Newlines within a block comment
 691 or a multi-line string are not reported separately, but each of these
 692 must be followed immediately by a newline so these constructs cannot
 693 hide the fact that a newline was present.
 694
 695 When indents are being reported, the Newline which would normally be
 696 reported immediately before the "IN" is delayed until after the
 697 matching "OUT".  This makes an indented section act like a
 698 continuation of the previous line to some extent.
 699
 700 A blank line would normally be reported simply as two consecutive Newline
 701 tokens.  However if the subsequent line is indented (and indents are being
 702 reported) then the right thing to do is less obvious as Newlines should be
 703 delayed - but how many Newlines?
 704
 705 The approach we will take is to report the extra Newlines immediately after
 706 the IN token, so the blank line is treated as though it were an indented
 707 blank line.
 708
 709 ###### token types
 710         TK_newline,
 711
 712 If we find a newline or white space at the start of a block, we keep
 713 collecting spaces, tabs, and newlines until we find some real text.
 714 Then depending on the indent we generate some number of tokens.  These
 715 will be a sequence of "Newline OUT" pairs representing a decrease
 716 in indent, then either a Newline or an IN depending on whether the
 717 next line is indented, then zero or more Newlines representing all the
 718 blank lines that have been skipped.
 719
 720 When a Newline leads to the next block of code there is a question of
 721 whether the various Newline and OUT/IN tokens should appear to
 722 pbelong to the earlier or later block.  This is addressed by processing
 723 the tokens in two stages based on the relative indent levels of the
 724 two blocks (each block has a base indent to which the actual indents
 725 are added).
 726
 727 Any "Newline OUT" pairs needed to reduce the current indent to the
 728 maximum of the base indents of the old and new blocks are generated
 729 against the old block.  Then if the next block does not have an
 730 increased indent, one more "Newline" is generated.
 731
 732 If further "Newline OUT" pairs are needed to get to the indent
 733 level of the 'next' block, they are generated against that block,
 734 though the first Newline is suppressed (it having already been
 735 generated).
 736
 737 Finally the Newline or IN for the first line of the new block is
 738 generated, unless the Newline needs to be suppressed because it
 739 appeared at the end of the previous block.
 740
 741 This means that a block may start with an OUT or an IN, but
 742 will only start with a Newline if it actually starts with a blank
 743 line.
 744
 745 We will need to represent in the `token_state` where in this sequence
 746 of delayed tokens we are.  As `state.col` records the target indent we
 747 don't need to record how many OUTs or INs are needed.  We do
 748 need to record the number of blank lines, and which of Newline and
 749 OUT is needed next in the initial sequence of pairs.
 750
 751 For this we store one more than the number of blank lines as
 752 `delayed_lines` and a flag for `out_next`.
 753
 754 ###### state fields
 755         int check_indent;
 756         int delayed_lines;
 757         int out_next;
 758
 759 Generating these tokens involve two separate pieces of code.
 760
 761 Firstly we need to recognise white space and count the indents and
 762 newlines.  These are recorded in the above state fields.
 763
 764 Separately we need, on each call to `token_next`, we need to check if
 765 there are some delayed tokens and if so we need to advance the state
 766 information and return one token.
 767
 768 ###### white space
 769         if (is_newline(ch) || (at_son(state) && ch <= ' ')) {
 770                 int newlines = 0;
 771                 int was_son = at_son(state);
 772                 if (ignored & (1<<TK_in)) {
 773                         if (!is_newline(ch))
 774                                 continue;
 775                         if (ignored & (1<<TK_newline))
 776                                 continue;
 777                         tk.num = TK_newline;
 778                         close_token(state, &tk);
 779                         return tk;
 780                 }
 781                 // Indents are needed, so check all white space.
 782                 while (ch <= ' ' && !at_eon(state)) {
 783                         if (is_newline(ch))
 784                                 newlines += 1;
 785                         ch = get_char(state);
 786                 }
 787                 if (at_eon(state)) {
 788                         newlines += 1;
 789                         if (state->node->next &&
 790                             state->node->next->indent > state->node->indent)
 791                                 state->col = state->node->next->indent;
 792                         else
 793                                 state->col = state->node->indent;
 794                 } else
 795                         unget_char(state);
 796                 state->delayed_lines = newlines;
 797                 state->out_next = was_son;
 798                 state->check_indent = 1;
 799                 continue;
 800         }
 801
 802
 803 ###### delayed tokens
 804
 805         if (state->check_indent || state->delayed_lines) {
 806                 if (state->col < state->indent_sizes[state->indent_level]) {
 807                         if (!state->out_next &&
 808                             !(ignored & (1<<TK_newline))) {
 809                                 state->out_next = 1;
 810                                 tk.num = TK_newline;
 811                                 return tk;
 812                         }
 813                         state->indent_level -= 1;
 814                         state->out_next = 0;
 815                         tk.num = TK_out;
 816                         return tk;
 817                 }
 818                 if (state->col > state->indent_sizes[state->indent_level] &&
 819                     state->indent_level < sizeof(state->indent_sizes)-1) {
 820                         state->indent_level += 1;
 821                         state->indent_sizes[state->indent_level] = state->col;
 822                         state->delayed_lines -= 1;
 823                         tk.num = TK_in;
 824                         return tk;
 825                 }
 826                 state->check_indent = 0;
 827                 if (state->delayed_lines && !(ignored & (1<<TK_newline))) {
 828                         tk.num = TK_newline;
 829                         state->delayed_lines -= 1;
 830                         return tk;
 831                 }
 832                 state->delayed_lines = 0;
 833                 continue;
 834         }
 835
 836 ### End of File
 837
 838 After the last newline in the file has been processed, a special
 839 end-of-file token will be returned.  any further attempts to get more
 840 tokens will continue to return the same end-of-file token.
 841
 842 ###### token types
 843         TK_eof,
 844
 845
 846 ###### white space
 847         if (ch == WEOF) {
 848                 if (state->col) {
 849                         state->col = 0;
 850                         state->check_indent = 1;
 851                         continue;
 852                 }
 853                 tk.num = TK_eof;
 854                 return tk;
 855         }
 856
 857 ### Unknown Marks, or errors.
 858
 859 We have now handled all the possible known mark-like tokens.
 860 If the token we have is not empty and `TK_mark` is allowed,
 861 we have an unknown mark, otherwise this must be an error.
 862
 863 ###### unknown mark
 864         /* one unknown character */
 865         close_token(state, &tk);
 866         tk.num = TK_error;
 867         return tk;
 868
 869 ## Tools For The Task
 870
 871 You may have noticed that are few gaps we left in the above -
 872 functions used without first defining them.  Doing so above would have
 873 broken the flow.
 874
 875 ### Character by character
 876
 877 As we walk through the various `code_node`s we need to process whole
 878 Unicode codepoints, and keep track of which line and column we are on.
 879 We will assume for now that any printing character uses one column,
 880 though that is not true in general.
 881
 882 As the text in a `code_node` may include an indent that identifies it as
 883 being code, we need to be careful to strip that.  The `code_node` has
 884 a flag that tells us whether or not we need to strip.
 885
 886 ###### includes
 887         #include <memory.h>
 888
 889 ###### state fields
 890         struct code_node *node;
 891         int    offset;
 892         int    line;
 893         int    col;
 894
 895 ###### internal functions
 896
 897         static void do_strip(struct token_state *state)
 898         {
 899                 if (state->node->needs_strip) {
 900                         int n = 4;
 901                         while (n && state->node->code.txt[state->offset] == ' ') {
 902                                 state->offset += 1;
 903                                 n -= 1;
 904                         }
 905                         while (n == 4 && state->node->code.txt[state->offset] == '\t') {
 906                                 state->offset += 1;
 907                                 n -= 4;
 908                         }
 909                 }
 910         }
 911
 912         static wint_t get_char(struct token_state *state)
 913         {
 914                 wchar_t next;
 915                 size_t n;
 916                 mbstate_t mbstate;
 917
 918                 if (state->node == NULL)
 919                         return WEOF;
 920                 if (state->node->code.len <= state->offset) {
 921                         do
 922                                 state->node = state->node->next;
 923                         while (state->node && state->node->code.txt == NULL);
 924                         state->offset = 0;
 925                         if (state->node == NULL)
 926                                 return WEOF;
 927                         do_strip(state);
 928                         state->line = state->node->line_no;
 929                         state->col = state->node->indent;
 930                 }
 931
 932                 ## before get_char
 933
 934                 memset(&mbstate, 0, sizeof(mbstate));
 935
 936                 n = mbrtowc(&next, state->node->code.txt + state->offset,
 937                             state->node->code.len - state->offset,
 938                             &mbstate);
 939                 if (n == -2 || n == 0) {
 940                         /* Not enough bytes - not really possible */
 941                         next = '\n';
 942                         state->offset = state->node->code.len;
 943                 } else if (n == -1) {
 944                         /* error */
 945                         state->offset += 1;
 946                         next = 0x7f; // an illegal character
 947                 } else
 948                         state->offset += n;
 949
 950                 if (next >= ' ') {
 951                         state->col += 1;
 952                 } else if (is_newline(next)) {
 953                         state->line += 1;
 954                         state->col = state->node->indent;
 955                         do_strip(state);
 956                 } else if (next == '\t') {
 957                         state->col = indent_tab(state->col);
 958                 }
 959                 return next;
 960         }
 961
 962 We will sometimes want to "unget" the last character as it needs to be
 963 considered again as part of the next token.  So we need to store a
 964 'previous' version of all metadata.
 965
 966 ###### state fields
 967         int    prev_offset;
 968         int    prev_line;
 969         int    prev_col;
 970
 971 ###### before get_char
 972         state->prev_offset = state->offset;
 973         state->prev_line   = state->line;
 974         state->prev_col    = state->col;
 975
 976 ###### internal functions
 977
 978         static void unget_char(struct token_state *state)
 979         {
 980                 if (state->node) {
 981                         state->offset = state->prev_offset;
 982                         state->line   = state->prev_line;
 983                         state->col    = state->prev_col;
 984                 }
 985         }
 986
 987 We occasionally need a double-unget, particularly for numbers and
 988 block comments.  We don't impose this cost on all scanning, but
 989 require those code sections that need it to call `save_unget_state`
 990 before each `get_char`, and then `restore_unget_state` when a
 991 double-unget is needed.
 992
 993 ###### state fields
 994         int     prev_offset2;
 995         int     prev_line2;
 996         int     prev_col2;
 997
 998 ###### internal functions
 999         static void save_unget_state(struct token_state *state)
1000         {
1001                 state->prev_offset2 = state->prev_offset;
1002                 state->prev_line2 = state->prev_line;
1003                 state->prev_col2 = state->prev_col;
1004         }
1005
1006         static void restore_unget_state(struct token_state *state)
1007         {
1008                 state->prev_offset = state->prev_offset2;
1009                 state->prev_line = state->prev_line2;
1010                 state->prev_col = state->prev_col2;
1011         }
1012
1013 At the start of a token we don't want to be at the end of a code block
1014 if we can help it.  To avoid this possibility, we 'get' and 'unget' a
1015 single character.  This will move into the next non-empty code block
1016 and leave the current pointer at the start of it.
1017
1018 This has to happen _after_ dealing with delayed tokens as some of them
1019 must appear in the previous node.  When we do this, we need to reset
1020 the data in the token.
1021
1022 ###### delayed tokens
1023         if (at_eon(state)) {
1024                 get_char(state);
1025                 unget_char(state);
1026                 tk.node = state->node;
1027                 if (state->node)
1028                         tk.txt.txt = state->node->code.txt + state->offset;
1029                 tk.line = state->line;
1030                 tk.col = state->col;
1031                 tk.txt.len = 0;
1032         }
1033
1034 ### Managing tokens
1035
1036 The current token is initialized to line up with the first character
1037 that we 'get' for each token.  When we have, or might have, a full
1038 token we can call `close_token` to set the `len` of the token
1039 appropriately.  This can safely be called multiple times.
1040
1041 Finally we occasionally (for single-line strings and block comments)
1042 need to reset to the beginning of the current token as we might have
1043 parsed too much already.  For that there is `reset_token`.
1044
1045 ###### one token
1046         tk.node = state->node;
1047         if (state->node)
1048                 tk.txt.txt = state->node->code.txt + state->offset;
1049         tk.line = state->line;
1050         tk.col = state->col;
1051         tk.txt.len = 0;
1052
1053 ###### internal functions
1054
1055         static void close_token(struct token_state *state,
1056                                 struct token *tk)
1057         {
1058                 tk->txt.len = (state->node->code.txt + state->offset)
1059                               - tk->txt.txt;
1060         }
1061
1062         static void reset_token(struct token_state *state, struct token *tok)
1063         {
1064                 state->prev_line = tok->line;
1065                 state->prev_col = tok->col;
1066                 state->prev_offset = tok->txt.txt - state->node->code.txt;
1067                 unget_char(state);
1068                 tok->txt.len = 0;
1069         }
1070
1071
1072 Tokens make not cross into the next `code_node`, and some tokens can
1073 include the newline at the and of a `code_node`, we must be able to
1074 easily check if we have reached the end.  Equally we need to know if
1075 we are at the start of a node, as white space is treated a little
1076 differently there.
1077
1078 ###### internal functions
1079
1080         static int at_son(struct token_state *state)
1081         {
1082                 return state->offset == 0;
1083         }
1084
1085         static int at_eon(struct token_state *state)
1086         {
1087                 // at end-of-node ??
1088                 return state->node == NULL ||
1089                        state->offset >= state->node->code.len;
1090         }
1091
1092 ### Find a known word
1093
1094 As the known-word list is sorted we can use a simple binary search.
1095 Following the pattern established in "mdcode", we will use a `struct
1096 text` with start and length to represent the code fragment we are
1097 searching for.
1098
1099 ###### internal functions
1100         static int find_known(struct token_config *conf, struct text txt)
1101         {
1102                 int lo = 0;
1103                 int hi = conf->known_count;
1104
1105                 while (lo + 1 < hi) {
1106                         int mid = (lo + hi) / 2;
1107                         int cmp = strncmp(conf->words_marks[mid],
1108                                           txt.txt, txt.len);
1109                         if (cmp == 0 && conf->words_marks[mid][txt.len])
1110                                 cmp = 1;
1111                         if (cmp <= 0)
1112                                 lo = mid;
1113                         else
1114                                 hi = mid;
1115                 }
1116                 if (strncmp(conf->words_marks[lo],
1117                            txt.txt, txt.len) == 0
1118                     && conf->words_marks[lo][txt.len] == 0)
1119                         return lo;
1120                 else
1121                         return -1;
1122         }
1123
1124 ### Bringing it all together
1125
1126 Now we have all the bits there is just one section missing:  combining
1127 all the token parsing code into one block.
1128
1129 The handling of delayed tokens (Newlines, INs, OUTs) must come
1130 first before we try getting another character.
1131
1132 Then we parse all the test, making sure that we check for known marks
1133 before strings and comments, but unknown marks after strings and comments.
1134
1135 This block of code will either return a token, or will choose to
1136 ignore one, in which case it will `continue` around to the top of the
1137 loop.
1138
1139 ###### one token
1140         ## delayed tokens
1141
1142         ch = get_char(state);
1143
1144         ## white space
1145         ## parse number
1146         ## parse word
1147         ## parse mark
1148
1149 ### Start and stop
1150
1151 As well as getting tokens, we need to be able to create the
1152 `token_state` to start with, and discard it later.
1153
1154 ###### includes
1155         #include <malloc.h>
1156
1157 ###### main functions
1158         struct token_state *token_open(struct code_node *code, struct
1159                                        token_config *conf)
1160         {
1161                 struct token_state *state = malloc(sizeof(*state));
1162                 memset(state, 0, sizeof(*state));
1163                 state->node = code;
1164                 state->line = code->line_no;
1165                 state->col = code->indent;
1166                 state->conf = conf;
1167                 do_strip(state);
1168                 return state;
1169         }
1170         void token_close(struct token_state *state)
1171         {
1172                 free(state);
1173         }
1174
1175 ###### exported functions
1176         struct token_state *token_open(struct code_node *code, struct
1177                                        token_config *conf);
1178         void token_close(struct token_state *state);
1179
1180 ### Trace tokens
1181
1182 Getting tokens is the main thing but it is also useful to be able to
1183 print out token information, particularly for tracing and testing.
1184
1185 Known tokens are printed verbatim.  Other tokens are printed as
1186 `type(content)` where content is truncated to a given number of characters.
1187
1188 The function for printing a truncated string (`text_dump`) is also exported
1189 so that it can be used to tracing processed strings too.
1190
1191 ###### includes
1192         #include <stdio.h>
1193
1194 ###### exported functions
1195         void token_trace(FILE *f, struct token tok, int max);
1196         void text_dump(FILE *f, struct text t, int max);
1197
1198 ###### main functions
1199
1200         void text_dump(FILE *f, struct text txt, int max)
1201         {
1202                 int i;
1203                 if (txt.len > max)
1204                         max -= 2;
1205                 else
1206                         max = txt.len;
1207                 for (i = 0; i < max; i++) {
1208                         char c = txt.txt[i];
1209                         if (c < ' ' || c > '~')
1210                                 fprintf(f, "\\x%02x", c & 0xff);
1211                         else if (c == '\\')
1212                                 fprintf(f, "\\\\");
1213                         else
1214                                 fprintf(f, "%c", c);
1215                 }
1216                 if (i < txt.len)
1217                         fprintf(f, "..");
1218         }
1219
1220         void token_trace(FILE *f, struct token tok, int max)
1221         {
1222                 static char *types[] = {
1223                         [TK_ident] = "ident",
1224                         [TK_mark] = "mark",
1225                         [TK_number] = "number",
1226                         [TK_string] = "string",
1227                         [TK_multi_string] = "mstring",
1228                         [TK_line_comment] = "lcomment",
1229                         [TK_block_comment] = "bcomment",
1230                         [TK_in] = "in",
1231                         [TK_out] = "out",
1232                         [TK_newline] = "newline",
1233                         [TK_eof] = "eof",
1234                         [TK_error] = "ERROR",
1235                         };
1236
1237                 switch (tok.num) {
1238                 default: /* known word or mark */
1239                         fprintf(f, "%.*s", tok.txt.len, tok.txt.txt);
1240                         break;
1241                 case TK_in:
1242                 case TK_out:
1243                 case TK_newline:
1244                 case TK_eof:
1245                         /* No token text included */
1246                         fprintf(f, "%s()", types[tok.num]);
1247                         break;
1248                 case TK_ident:
1249                 case TK_mark:
1250                 case TK_number:
1251                 case TK_string:
1252                 case TK_multi_string:
1253                 case TK_line_comment:
1254                 case TK_block_comment:
1255                 case TK_error:
1256                         fprintf(f, "%s(", types[tok.num]);
1257                         text_dump(f, tok.txt, max);
1258                         fprintf(f, ")");
1259                         break;
1260                 }
1261         }
1262
1263 ### And there we have it
1264
1265 We now have all the library functions defined for reading and printing
1266 tokens.  Now we just need C files to store them, and a mk file to make them.
1267
1268 ###### File: scanner.h
1269         ## public types
1270         ## exported functions
1271
1272 ###### File: libscanner.c
1273         ## includes
1274         #include "scanner.h"
1275         ## private types
1276         ## internal functions
1277         ## main functions
1278
1279 ###### File: scanner.mk
1280
1281         CFLAGS += -Wall -g
1282         all ::
1283         scanner.mk scanner.h libscanner.c : scanner.mdc
1284                 ./md2c scanner.mdc
1285         all :: libscanner.o
1286         libscanner.o : libscanner.c
1287                 $(CC) $(CFLAGS) -c libscanner.c
1288
1289 ## Processing numbers
1290
1291 Converting a `TK_number` token to a numerical value is a slightly
1292 higher level task than lexical analysis, and slightly lower than
1293 grammar parsing, so put it here - as an index if you like.
1294
1295 Importantly it will be used by the same testing rig that is used for
1296 testing the token scanner.
1297
1298 The numeric value that we will convert all numbers into is the `mpq_t`
1299 from the GNU high precision number library "libgmp".
1300
1301 ###### number includes
1302         #include <gmp.h>
1303         #include "mdcode.h"
1304
1305 Firstly we need to be able to parse a string of digits in a given base
1306 and possibly with a decimal marker.  We store this in an `mpz_t`
1307 integer and report the number of digits after the decimal mark.
1308
1309 On error we return zero and ensure that the 'mpz_t' has been freed, or
1310 had never been initialised.
1311
1312 ###### number functions
1313
1314         static int parse_digits(mpz_t num, struct text tok, int base,
1315                                 int *placesp)
1316         {
1317                 /* Accept digits up to 'base', ignore '_' and
1318                  * ' ' if they appear between two legal digits,
1319                  * and if `placesp` is not NULL, allow a single
1320                  * '.' or ',' and report the number of digits
1321                  * beyond there.
1322                  * Return number of characters processed (p),
1323                  * or 0 if something illegal was found.
1324                  */
1325                 int p;
1326                 int decimal = -1; // digits after marker
1327                 enum {Digit, Space, Other} prev = Other;
1328                 int digits = 0;
1329
1330                 for (p = 0; p < tok.len; p++) {
1331                         int dig;
1332                         char c = tok.txt[p];
1333
1334                         if (c == '_' || c == ' ') {
1335                                 if (prev != Digit)
1336                                         goto bad;
1337                                 prev = Space;
1338                                 continue;
1339                         }
1340                         if (c == '.' || c == ',') {
1341                                 if (prev != Digit)
1342                                         goto bad;
1343                                 if (!placesp || decimal >= 0)
1344                                         return p-1;
1345                                 decimal = 0;
1346                                 prev = Other;
1347                                 continue;
1348                         }
1349                         if (isdigit(c))
1350                                 dig = c - '0';
1351                         else if (isupper(c))
1352                                 dig = 10 + c - 'A';
1353                         else if (islower(c))
1354                                 dig = 10 + c - 'a';
1355                         else
1356                                 dig = base;
1357                         if (dig >= base) {
1358                                 if (prev == Space)
1359                                         p--;
1360                                 break;
1361                         }
1362                         prev = Digit;
1363                         if (digits)
1364                                 mpz_mul_ui(num, num, base);
1365                         else
1366                                 mpz_init(num);
1367                         digits += 1;
1368                         mpz_add_ui(num, num, dig);
1369                         if (decimal >= 0)
1370                                 decimal++;
1371                 }
1372                 if (digits == 0)
1373                         return 0;
1374                 if (placesp) {
1375                         if (decimal >= 0)
1376                                 *placesp = decimal;
1377                         else
1378                                 *placesp = 0;
1379                 }
1380                 return p;
1381         bad:
1382                 if (digits)
1383                         mpz_clear(num);
1384                 return 0;
1385         }
1386
1387 ###### number includes
1388         #include <ctype.h>
1389
1390 To parse a full number we need to consider the optional base, the
1391 mantissa, and the optional exponent.  We will treat these one at a
1392 time.
1393
1394 The base is indicated by a letter after a leading zero, which must be
1395 followed by a base letter or a period.  The base also determines the
1396 character which will mark an exponent.
1397
1398 ###### number vars
1399         int base = 10;
1400         char expc = 'e';
1401
1402 ###### parse base
1403
1404         if (tok.txt[0] == '0' && tok.len > 1) {
1405                 int skip = 0;
1406                 switch(tok.txt[1]) {
1407                 case 'x':
1408                 case 'X':
1409                         base = 16;
1410                         skip = 2;
1411                         expc = 'p';
1412                         break;
1413                 case 'o':
1414                 case 'O':
1415                         base = 8;
1416                         skip = 2;
1417                         expc = 'p';
1418                         break;
1419                 case 'b':
1420                 case 'B':
1421                         base = 2;
1422                         skip = 2;
1423                         expc = 'p';
1424                         break;
1425                 case '0':
1426                 case '1':
1427                 case '2':
1428                 case '3':
1429                 case '4':
1430                 case '5':
1431                 case '6':
1432                 case '7':
1433                 case '8':
1434                 case '9':
1435                 case '_':
1436                 case ' ':
1437                         // another digit is not permitted
1438                         // after a zero.
1439                         return 0;
1440                 default:
1441                         // must be decimal marker or trailing
1442                         // letter, which are OK;
1443                         break;
1444                 }
1445                 tok.txt += skip;
1446                 tok.len -= skip;
1447         }
1448
1449 After the base is the mantissa, which may contain a decimal mark, so
1450 we need to record the number of places.  We won't impose the number of
1451 places until we have the exponent as well.
1452
1453 ###### number vars
1454         int places =0;
1455         mpz_t mant;
1456         int d;
1457
1458 ###### parse mantissa
1459
1460         d = parse_digits(mant, tok, base, &places);
1461         if (d == 0)
1462                 return 0;
1463         tok.txt += d;
1464         tok.len -= d;
1465         mpq_init(num);
1466         mpq_set_z(num, mant);
1467         mpz_clear(mant);
1468
1469 After the mantissa number may come an exponent which may be positive
1470 or negative.  We assume at this point that we have seen the exponent
1471 character `expc`.
1472
1473 ###### number vars
1474         long lexp = 0;
1475         mpz_t exp;
1476         int esign = 1;
1477
1478 ###### parse exponent
1479         if (tok.len > 1) {
1480                 if (tok.txt[0] == '+') {
1481                         tok.txt++;
1482                         tok.len--;
1483                 } else if (tok.txt[0] == '-') {
1484                         esign = -1;
1485                         tok.txt++;
1486                         tok.len--;
1487                 }
1488         }
1489         d = parse_digits(exp, tok, 10, NULL);
1490         if (d == 0) {
1491                 mpq_clear(num);
1492                 return 0;
1493         }
1494         if (!mpz_fits_slong_p(exp)) {
1495                 mpq_clear(num);
1496                 mpz_clear(exp);
1497                 return 0;
1498         }
1499         lexp = mpz_get_si(exp) * esign;
1500         mpz_clear(exp);
1501         tok.txt += d;
1502         tok.len -= d;
1503
1504
1505 Now that we have the mantissa and the exponent we can multiply them
1506 together, also allowing for the number of digits after the decimal
1507 mark.
1508
1509 For base 10, we simply subtract the decimal places from the exponent.
1510 For the other bases, as the exponent is alway based on 2, even for
1511 octal and hex, we need a bit more detail.
1512 We then recover the sign from the exponent, as division is quite
1513 different from multiplication.
1514
1515 ###### calc exponent
1516         switch (base) {
1517         case 10:
1518         case 2:
1519                 lexp -= places;
1520                 break;
1521         case 16:
1522                 lexp -= 4*places;
1523                 break;
1524         case 8:
1525                 lexp -= 3*places;
1526                 break;
1527         }
1528         if (lexp < 0) {
1529                 lexp = -lexp;
1530                 esign = -1;
1531         } else
1532                 esign = 1;
1533
1534 Imposing the exponent on the number is also very different for base 10
1535 than for the others.  For the binary shift `gmp` provides a simple
1536 function.  For base 10 we use something like Russian Peasant
1537 Multiplication.
1538
1539 ###### calc exponent
1540         if (expc == 'e') {
1541                 mpq_t tens;
1542                 mpq_init(tens);
1543                 mpq_set_ui(tens, 10, 1);
1544                 while (1) {
1545                         if (lexp & 1) {
1546                                 if (esign > 0)
1547                                         mpq_mul(num, num, tens);
1548                                 else
1549                                         mpq_div(num, num, tens);
1550                         }
1551                         lexp >>= 1;
1552                         if (lexp == 0)
1553                                 break;
1554                         mpq_mul(tens, tens, tens);
1555                 }
1556                 mpq_clear(tens);
1557         } else {
1558                 if (esign > 0)
1559                         mpq_mul_2exp(num, num, lexp);
1560                 else
1561                         mpq_div_2exp(num, num, lexp);
1562         }
1563
1564 Now we are ready to parse a number: the base, mantissa, and exponent.
1565 If all goes well we check for the possible trailing letters and
1566 return.  Return value is 1 for success and 0 for failure.
1567
1568
1569 ###### number functions
1570         int number_parse(mpq_t num, char tail[3], struct text tok)
1571         {
1572                 ## number vars
1573                 int i;
1574
1575                 ## parse base
1576                 ## parse mantissa
1577                 if (tok.len > 1 && (tok.txt[0] == expc ||
1578                                     tok.txt[0] == toupper(expc))) {
1579                         tok.txt++;
1580                         tok.len--;
1581                         ## parse exponent
1582                 }
1583                 ## calc exponent
1584
1585                 for (i = 0; i < 2; i++) {
1586                         if (tok.len <= i)
1587                                 break;
1588                         if (!isalpha(tok.txt[i]))
1589                                 goto err;
1590                         tail[i] = tok.txt[i];
1591                 }
1592                 tail[i] = 0;
1593                 if (i == tok.len)
1594                         return 1;
1595         err:
1596                 mpq_clear(num);
1597                 return 0;
1598         }
1599
1600 Number parsing goes in `libnumber.c`
1601
1602 ###### File: libnumber.c
1603
1604         #include <unistd.h>
1605         #include <stdlib.h>
1606
1607         ## number includes
1608         ## number functions
1609
1610 ###### File: number.h
1611         int number_parse(mpq_t num, char tail[3], struct text tok);
1612
1613 ###### File: scanner.mk
1614         all :: libnumber.o
1615         libnumber.o : libnumber.c
1616                 $(CC) $(CFLAGS) -c libnumber.c
1617
1618 ## Processing strings
1619
1620 Both `TK_string` and `TK_multi_string` require post-processing which
1621 can be one of two types: literal or with escapes processed.
1622 Even literal processing is non-trivial as the file may contain indents
1623 which need to be stripped.
1624
1625 Errors can only occur when processing escapes.  Any unrecognised
1626 character following the escape character will cause an error.
1627
1628 Processing escapes and striping indents can only make the string
1629 shorter, not longer, so we allocate a buffer which is the same size as
1630 the string and process into that.
1631
1632 To request escape processing, we pass the character we want to use for
1633 quoting, usually '`\`'.  To avoid escape processing we pass a zero.
1634
1635 ###### string main
1636         int string_parse(struct token *tok, char escape,
1637                          struct text *str, char tail[3])
1638         {
1639                 ## string vars
1640                 struct text t = tok->txt;
1641
1642                 str->txt = NULL;
1643                 ## strip tail
1644                 if (tok->num == TK_string) {
1645                         ## strip single
1646                 } else {
1647                         ## strip multi
1648                 }
1649                 str->txt = malloc(t.len);
1650                 str->len = 0;
1651
1652                 ## process string
1653                 return 1;
1654         err:
1655                 free(str->txt);
1656                 str->txt = NULL;
1657                 return 0;
1658         }
1659
1660 ### strip tail
1661
1662 The tail of the string can be 0, 1, or 2 letters
1663
1664         i = t.len;
1665         if (i >= 0 && isalpha(t.txt[i-1]))
1666                 i -= 1;
1667         if (i >= 0 && isalpha(t.txt[i-1]))
1668                 i -= 1;
1669         strncpy(tail, t.txt+i, t.len-i);
1670         tail[t.len-i] = 0;
1671         t.len = i;
1672
1673 ###### string vars
1674         int i;
1675
1676 ### strip single
1677
1678 Stripping the quote of a single-line string is trivial.
1679 The only part that is at all interesting is that quote character must
1680 be remembered.
1681
1682         quote = t.txt[0];
1683         if (t.txt[t.len-1] != quote)
1684                 goto err;
1685         t.txt += 1;
1686         t.len -= 2;
1687
1688 ###### string vars
1689         char quote;
1690
1691 ### strip multi
1692
1693 For a multi-line string we have a little more work to do.  We need to
1694 remove 3 quotes, not 1, and need to count the indent of the close
1695 quote as it will need to be stripped from all lines.
1696
1697         quote = t.txt[0];
1698         if (t.len < 7 ||
1699             t.txt[1] != quote || t.txt[2] != quote ||
1700             !is_newline(t.txt[3]))
1701                 goto err;
1702         t.txt += 4;
1703         t.len -= 4;
1704         i = t.len;
1705         if (i <= 0 || t.txt[i-1] != quote)
1706                 goto err;
1707         i -= 1;
1708         if (i <= 0 || t.txt[i-1] != quote)
1709                 goto err;
1710         i -= 1;
1711         if (i <= 0 || t.txt[i-1] != quote)
1712                 goto err;
1713         i -= 1;
1714         t.len = i;
1715         while (i > 0 && !is_newline(t.txt[i-1]))
1716                 i--;
1717         indent = 0;
1718         while (i < t.len) {
1719                 if (t.txt[i] == ' ')
1720                         indent += 1;
1721                 if (t.txt[i] == '\t')
1722                         indent = indent_tab(indent);
1723                 i++;
1724         }
1725
1726 ###### string vars
1727         int indent = 0;
1728
1729 ### process string
1730
1731 Now we just take one byte at a time. trans-ASCII unicode won't look
1732 like anything we are interested in so it will just be copied byte by
1733 byte.
1734
1735         cp = str->txt;
1736         at_sol = 1;
1737         for (i = 0; i < t.len; i++) {
1738                 char c;
1739                 if (at_sol) {
1740                         at_sol = 0;
1741                         ## strip indent
1742                         if (i >= t.len)
1743                                 break;
1744                 }
1745                 c = t.txt[i];
1746                 if (c != escape) {
1747                         *cp = c;
1748                         cp += 1;
1749                         if (is_newline(c))
1750                                 at_sol = 1;
1751                 } else if (i+1 >= t.len) {
1752                         // escape and end of string
1753                         goto err;
1754                 } else {
1755                         i += 1;
1756                         c = t.txt[i];
1757                         ## parse escape
1758                 }
1759         }
1760         str->len = cp - str->txt;
1761
1762 ###### string vars
1763         char *cp;
1764         int at_sol;
1765
1766 ### strip indent
1767
1768 Every time we find a start of line, we strip spaces and tabs until the
1769 required indent is found.
1770
1771         int skipped = 0;
1772         while (i < t.len && skipped < indent) {
1773                 c = t.txt[i];
1774                 if (c == ' ')
1775                         skipped += 1;
1776                 else if (c == '\t')
1777                         skipped = indent_tab(skipped);
1778                 else
1779                         break;
1780                 i+= 1;
1781         }
1782
1783 ### parse escape
1784         switch (c) {
1785         case 'n':
1786                 *cp++ = '\n'; break;
1787         case 'r':
1788                 *cp++ = '\r'; break;
1789         case 't':
1790                 *cp++ = '\t'; break;
1791         case 'b':
1792                 *cp++ = '\b'; break;
1793         case 'q':
1794                 *cp++ = quote; break;
1795         case 'f':
1796                 *cp++ = '\f'; break;
1797         case 'v':
1798                 *cp++ = '\v'; break;
1799         case 'a':
1800                 *cp++ = '\a'; break;
1801         case '0':
1802         case '1':
1803         case '2':
1804         case '3':
1805                 // 3 digit octal number
1806                 if (i+2 >= t.len)
1807                         goto err;
1808                 if (t.txt[i+1] < '0' || t.txt[i+1] > '7' ||
1809                     t.txt[i+2] < '0' || t.txt[i+1] > '7')
1810                         goto err;
1811                 n = (t.txt[i  ]-'0') * 64 +
1812                     (t.txt[i+1]-'0') *  8 +
1813                     (t.txt[i+2]-'0') *  1;
1814                 *cp++ = n;
1815                 i += 2;
1816                 break;
1817         case 'x':
1818                 // 2 hex digits
1819                 n = take_hex(2, t.txt+i+1, t.len-i-1);
1820                 if (n < 0)
1821                         goto err;
1822                 *cp++ = n;
1823                 i += 2;
1824                 break;
1825         case 'u':
1826         case 'U':
1827                 // 4 or 8 hex digits for unicode
1828                 n = take_hex(c == 'u'?4:8, t.txt+i+1, t.len-i-1);
1829                 if (n < 0)
1830                         goto err;
1831                 memset(&pstate, 0, sizeof(pstate));
1832                 n = wcrtomb(cp, n, &pstate);
1833                 if (n <= 0)
1834                         goto err;
1835                 cp += n;
1836                 i += c == 'u' ? 4 : 8;
1837                 break;
1838         default:
1839                 if (c == escape)
1840                         *cp++ = c;
1841                 else if (is_newline(c))
1842                         at_sol = 1;
1843                 else
1844                         goto err;
1845         }
1846
1847 ###### string vars
1848         long n;
1849         mbstate_t pstate;
1850
1851 For `\x` `\u` and `\U` we need to collect a specific number of
1852 hexadecimal digits
1853
1854 ###### string functions
1855
1856         static long take_hex(int digits, char *cp, int l)
1857         {
1858                 long n = 0;
1859                 if (l < digits)
1860                         return -1;
1861                 while (digits) {
1862                         char  c = *cp;
1863                         int d;
1864                         if (!isxdigit(c))
1865                                 return -1;
1866                         if (isdigit(c))
1867                                 d = c - '0';
1868                         else if (isupper(c))
1869                                 d = 10 + c - 'A';
1870                         else
1871                                 d = 10 + c - 'a';
1872                         n = n * 16 + d;
1873                         digits--;
1874                         cp++;
1875                 }
1876                 return n;
1877         }
1878
1879 #### File: libstring.c
1880
1881 String parsing goes in `libstring.c`
1882
1883         #include <unistd.h>
1884         #include <stdlib.h>
1885         #include <stdio.h>
1886         #include <string.h>
1887         #include <ctype.h>
1888         #include <wchar.h>
1889         #include "mdcode.h"
1890         #include "scanner.h"
1891         ## string functions
1892         ## string main
1893
1894 ###### File: string.h
1895         int string_parse(struct token *tok, char escape,
1896                          struct text *str, char tail[3]);
1897
1898 ###### File: scanner.mk
1899         all :: libstring.o
1900         libstring.o : libstring.c
1901                 $(CC) $(CFLAGS) -c libstring.c
1902
1903
1904 ## Testing
1905
1906 As "untested code is buggy code" we need a program to easily test
1907 the scanner library.  This will simply parse a given file and report
1908 the tokens one per line.
1909
1910 ###### File: scanner.c
1911
1912         #include <unistd.h>
1913         #include <stdlib.h>
1914         #include <fcntl.h>
1915         #include <errno.h>
1916         #include <sys/mman.h>
1917         #include <string.h>
1918         #include <stdio.h>
1919         #include <gmp.h>
1920         #include <locale.h>
1921         #include "mdcode.h"
1922         #include "scanner.h"
1923         #include "number.h"
1924         #include "string.h"
1925
1926         static int errs;
1927         static void pr_err(char *msg)
1928         {
1929                 errs++;
1930                 fprintf(stderr, "%s\n", msg);
1931         }
1932
1933         int main(int argc, char *argv[])
1934         {
1935                 int fd;
1936                 int len;
1937                 char *file;
1938                 struct token_state *state;
1939                 const char *known[] = {
1940                         "==",
1941                         "else",
1942                         "if",
1943                         "then",
1944                         "while",
1945                         "{",
1946                         "}",
1947                 };
1948                 struct token_config conf = {
1949                         .word_start = "_$",
1950                         .word_cont = "",
1951                         .words_marks = known,
1952                         .number_chars = "., _+-",
1953                         .known_count = sizeof(known)/sizeof(known[0]),
1954                         .ignored = (0 << TK_line_comment)
1955                                   |(0 << TK_block_comment),
1956                 };
1957                 struct section *table, *s, *prev;
1958                 setlocale(LC_ALL,"");
1959                 if (argc != 2) {
1960                         fprintf(stderr, "Usage: scanner file\n");
1961                         exit(2);
1962                 }
1963                 fd = open(argv[1], O_RDONLY);
1964                 if (fd < 0) {
1965                         fprintf(stderr, "scanner: cannot open %s: %s\n",
1966                                 argv[1], strerror(errno));
1967                         exit(1);
1968                 }
1969                 len = lseek(fd, 0, 2);
1970                 file = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
1971                 table = code_extract(file, file+len, pr_err);
1972
1973                 for (s = table; s;
1974                         (code_free(s->code), prev = s, s = s->next, free(prev))) {
1975                         printf("Tokenizing: %.*s\n", s->section.len,
1976                                 s->section.txt);
1977                         state = token_open(s->code, &conf);
1978                         while(1) {
1979                                 struct token tk = token_next(state);
1980                                 printf("%d:%d ", tk.line, tk.col);
1981                                 token_trace(stdout, tk, 20);
1982                                 if (tk.num == TK_number) {
1983                                         mpq_t num;
1984                                         char tail[3];
1985                                         if (number_parse(num, tail,tk.txt)) {
1986                                                 printf(" %s ", tail);
1987                                                 mpq_out_str(stdout, 10, num);
1988                                                 mpq_clear(num);
1989                                         } else
1990                                                 printf(" BAD NUMBER");
1991                                 }
1992                                 if (tk.num == TK_string ||
1993                                     tk.num == TK_multi_string) {
1994                                         char esc = '\\';
1995                                         struct text str;
1996                                         char tail[3];
1997                                         if (tk.txt.txt[0] == '`')
1998                                                 esc = 0;
1999                                         if (string_parse(&tk, esc,
2000                                                          &str, tail)) {
2001                                                 printf(" %s ", tail);
2002                                                 text_dump(stdout, str, 20);
2003                                                 free(str.txt);
2004                                         } else
2005                                                 printf(" BAD STRING");
2006                                 }
2007                                 printf("\n");
2008                                 if (tk.num == TK_error)
2009                                         errs = 1;
2010                                 if (tk.num == TK_eof)
2011                                         break;
2012                         }
2013                 }
2014                 exit(!!errs);
2015         }
2016 ###### File: scanner.mk
2017         scanner.c : scanner.mdc
2018                 ./md2c scanner.mdc
2019         all :: scanner
2020         scanner : scanner.o scanner.h libscanner.o libmdcode.o mdcode.h
2021                 $(CC) $(CFLAGS) -o scanner scanner.o libscanner.o \
2022                         libmdcode.o libnumber.o libstring.o -licuuc -lgmp
2023         scanner.o : scanner.c
2024                 $(CC) $(CFLAGS) -c scanner.c
2025